2222Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
2323"""
2424
25+ import codecs
2526import gzip
2627import os
2728import re
3435from lib .core .common import urlEncodeCookieValues
3536from lib .core .data import conf
3637from lib .core .data import kb
38+ from lib .core .data import logger
3739from lib .parse .headers import headersParser
3840from lib .parse .html import htmlParser
3941
@@ -88,6 +90,20 @@ def parseResponse(page, headers):
8890 if absFilePath not in kb .absFilePaths :
8991 kb .absFilePaths .add (absFilePath )
9092
def checkCharEncoding(encoding):
    """
    Validate a charset name before it is used to decode a page

    The name is first normalized: anything after a ';' is dropped,
    surrounding whitespace and quotes are removed and a leading 'cp-'
    (any letter case) is rewritten to 'cp'. If Python has a codec for
    the resulting name it is returned, otherwise a warning is logged
    and conf.dataEncoding is returned instead.
    """

    #http://philip.html5.org/data/charsets-2.html
    if encoding:
        # The value may still carry quotes, padding or further
        # ';'-separated parameters (e.g. charset="utf-8"; foo=bar)
        encoding = encoding.split(';')[0].strip(' \t\r\n"\'')

        # Rewrite the 'cp-NNNN' spelling to 'cpNNNN' before the lookup
        if encoding.lower().startswith('cp-'):
            encoding = 'cp%s' % encoding[3:]

    try:
        codecs.lookup(encoding)
    except (LookupError, TypeError, ValueError):
        # TypeError/ValueError cover non-string input (e.g. None) and names
        # the codec registry refuses outright; handle them like an unknown
        # charset instead of letting them propagate
        # NOTE(review): the contact address below reads "[email protected]"
        # in this copy of the file and looks redacted - confirm and restore
        warnMsg = "unknown charset '%s'. " % encoding
        warnMsg += "please report by e-mail to [email protected] ."
        logger.warn(warnMsg)
        encoding = conf.dataEncoding

    return encoding
106+
91107def decodePage (page , contentEncoding , contentType ):
92108 """
93109 Decode compressed/charset HTTP response
@@ -101,9 +117,11 @@ def decodePage(page, contentEncoding, contentType):
101117 data = gzip .GzipFile ('' , 'rb' , 9 , StringIO .StringIO (page ))
102118
103119 page = data .read ()
104-
120+
105121 #http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
106122 if contentType and (contentType .find ('charset=' ) != - 1 ):
107- page = unicode (page , contentType .split ('charset=' )[- 1 ]) #don't use getUnicode here. it needs to stay as is.
123+ charset = checkCharEncoding (contentType .split ('charset=' )[- 1 ])
124+ if charset :
125+ page = unicode (page , charset ) #don't use getUnicode here. it needs to stay as is.
108126
109127 return page
0 commit comments