how to detect the character encoding in a web page ?
iMath
redstone-cold at 163.com
Wed Jun 5 10:46:04 EDT 2013
在 2012年12月24日星期一UTC+8上午8时34分47秒,iMath写道:
> how to detect the character encoding in a web page ?
>
> such as this page
>
>
>
> http://python.org/
I found that PyQt’s QTextStream can detect the character encoding of a web page very accurately,
even for this bad page.
chardet and Beautiful Soup failed, but QTextStream got the right result.
Here is my code:
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtNetwork import *
import sys
def slotSourceDownloaded(reply):
    """Print the outcome of a finished network request, then stop the app.

    Intended as the slot for ``QNetworkAccessManager.finished``.

    The function prints, in order:
      * the redirect target from the ``Location`` header if one is present,
        otherwise the URL of the reply itself;
      * on a network error, the error string (and nothing else);
      * otherwise the body decoded to text by ``QTextStream``, or a
        "cannot find any resource" notice when the decoded body is empty.

    Args:
        reply: the finished ``QNetworkReply``.

    Whatever branch is taken, the reply is scheduled for deletion and the
    event loop is asked to quit, so the script terminates on failures as
    well as on success.
    """
    try:
        location = reply.header(QNetworkRequest.LocationHeader)
        shown_url = location if location else reply.url()
        print(shown_url)

        if reply.error() != QNetworkReply.NoError:
            print('11111111', reply.errorString())
            return

        # QTextStream performs the bytes-to-text decoding; no codec is set
        # explicitly here, so the choice is left to Qt.
        content = QTextStream(reply).readAll()
        if content == '':
            print('---------', 'cannot find any resource !')
            return

        print(content)
    finally:
        # Runs on the early returns too: without this the error paths would
        # leave the reply alive and the event loop running forever.
        reply.deleteLater()
        # The application object is a QCoreApplication, so use its static
        # quit() rather than QtGui's qApp.
        QCoreApplication.quit()
if __name__ == '__main__':
    # Console-only Qt application: an event loop is required for the
    # asynchronous network reply to be delivered.
    app = QCoreApplication(sys.argv)
    manager = QNetworkAccessManager()

    # Read the address from the user, parse it with QUrl.fromUserInput and
    # rebuild the QUrl from its encoded form.
    url = input('input url :')
    encoded_url = QUrl.fromUserInput(url).toEncoded()
    request = QNetworkRequest(QUrl.fromEncoded(encoded_url))

    # Send a browser-like User-Agent header with the request.
    request.setRawHeader(
        "User-Agent",
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17 SE 2.X MetaSr 1.0',
    )

    # Start the download; slotSourceDownloaded runs once it has finished.
    manager.get(request)
    manager.finished.connect(slotSourceDownloaded)

    # Hand control to the Qt event loop and exit with its return code.
    sys.exit(app.exec_())
More information about the Python-list
mailing list