Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit eaef068

Browse files
committed
major bug fix (different HTTP content charsets are now properly handled)
1 parent 654d707 commit eaef068

2 files changed

Lines changed: 18 additions & 16 deletions

File tree

lib/request/basic.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,22 @@ def parseResponse(page, headers):
8989
kb.absFilePaths.add(absFilePath)
9090

9191

92-
def decodePage(page, encoding):
92+
def decodePage(page, contentEncoding, contentType):
9393
"""
94-
Decode gzip/deflate HTTP response
94+
Decode compressed/charset HTTP response
9595
"""
9696

97-
if isinstance(encoding, basestring) and encoding.lower() in ('gzip', 'x-gzip', 'deflate'):
98-
if encoding == 'deflate':
97+
if isinstance(contentEncoding, basestring) and contentEncoding.lower() in ('gzip', 'x-gzip', 'deflate'):
98+
if contentEncoding == 'deflate':
9999
# http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
100100
data = StringIO.StringIO(zlib.decompress(page, -15))
101101
else:
102102
data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(page))
103103

104104
page = data.read()
105105

106+
#http://stackoverflow.com/questions/1020892/python-urllib2-read-to-unicode
107+
if contentType and (contentType.find('charset=') != -1):
108+
page = unicode(page, contentType.split('charset=')[-1])
109+
106110
return page

lib/request/connect.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -88,33 +88,32 @@ def getPage(**kwargs):
8888
try:
8989
if silent:
9090
socket.setdefaulttimeout(3)
91-
91+
9292
if direct:
9393
if "?" in url:
9494
url, params = url.split("?")
9595
params = urlencode(params)
9696
url = "%s?%s" % (url, params)
9797
requestMsg += "?%s" % params
98-
98+
9999
elif multipart:
100100
# Needed in this form because of a potential circular dependency
101101
# problem (option -> update -> connect -> option)
102102
from lib.core.option import proxyHandler
103-
103+
104104
multipartOpener = urllib2.build_opener(proxyHandler, multipartpost.MultipartPostHandler)
105105
conn = multipartOpener.open(url, multipart)
106-
page = conn.read()
106+
page = conn.read()
107107
responseHeaders = conn.info()
108-
109-
encoding = responseHeaders.get("Content-Encoding")
110-
page = decodePage(page, encoding)
111-
108+
109+
page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
110+
112111
return page
113-
112+
114113
else:
115114
if conf.parameters.has_key("GET") and not get:
116115
get = conf.parameters["GET"]
117-
116+
118117
if get:
119118
get = urlencode(get)
120119
url = "%s?%s" % (url, get)
@@ -190,8 +189,7 @@ def getPage(**kwargs):
190189
status = conn.msg
191190
responseHeaders = conn.info()
192191

193-
encoding = responseHeaders.get("Content-Encoding")
194-
page = decodePage(page, encoding)
192+
page = decodePage(page, responseHeaders.get("Content-Encoding"), responseHeaders.get("Content-Type"))
195193

196194
except urllib2.HTTPError, e:
197195
if e.code == 401:

0 commit comments

Comments
 (0)