Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0d08903

Browse files
committed
some charset fix up
1 parent 24428c1 commit 0d08903

1 file changed

Lines changed: 20 additions & 2 deletions

File tree

lib/request/basic.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
2323
"""
2424

25+
import codecs
2526
import gzip
2627
import os
2728
import re
@@ -34,6 +35,7 @@
3435
from lib.core.common import urlEncodeCookieValues
3536
from lib.core.data import conf
3637
from lib.core.data import kb
38+
from lib.core.data import logger
3739
from lib.parse.headers import headersParser
3840
from lib.parse.html import htmlParser
3941

@@ -88,6 +90,20 @@ def parseResponse(page, headers):
8890
if absFilePath not in kb.absFilePaths:
8991
kb.absFilePaths.add(absFilePath)
9092

93+
def checkCharEncoding(encoding):
    """
    Validate a charset name and return one that Python's codec registry
    accepts.

    A name written with a "cp-" prefix (for example "cp-1252", in any
    letter case) is rewritten to the "cpNNNN" spelling before it is looked
    up. When codecs.lookup() does not know the resulting name, a warning
    is logged and conf.dataEncoding is returned instead.

    None is returned unchanged, without logging, so that a caller which
    tests the result for truthiness can simply skip decoding.
    """

    # Nothing to validate; codecs.lookup(None) would raise TypeError,
    # which the LookupError handler below does not catch
    if encoding is None:
        return encoding

    # Reference: http://philip.html5.org/data/charsets-2.html
    # Charset names are matched case-insensitively so that "CP-1252" is
    # normalized just like "cp-1252"
    if encoding.lower().startswith('cp-'):
        encoding = 'cp%s' % encoding[3:]

    try:
        codecs.lookup(encoding)
    except LookupError:
        warnMsg = "unknown charset '%s'. " % encoding
        warnMsg += "please report by e-mail to [email protected]."
        logger.warn(warnMsg)

        # Fall back to the configured default encoding
        encoding = conf.dataEncoding

    return encoding
106+
91107
def decodePage(page, contentEncoding, contentType):
92108
"""
93109
Decode compressed/charset HTTP response
@@ -101,9 +117,11 @@ def decodePage(page, contentEncoding, contentType):
101117
data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(page))
102118

103119
page = data.read()
104-
120+
105121
#http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
106122
if contentType and (contentType.find('charset=') != -1):
107-
page = unicode(page, contentType.split('charset=')[-1]) #don't use getUnicode here. it needs to stay as is.
123+
charset = checkCharEncoding(contentType.split('charset=')[-1])
124+
if charset:
125+
page = unicode(page, charset) #don't use getUnicode here. it needs to stay as is.
108126

109127
return page

0 commit comments

Comments
 (0)