@@ -334,41 +334,44 @@ def decodePage(page, contentEncoding, contentType, percentDecode=True):
334334
335335 # can't do for all responses because we need to support binary files too
336336 if isinstance (page , six .binary_type ) and "text/" in contentType :
337- # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
338- if b"&#" in page :
339- page = re .sub (b"&#x([0-9a-f]{1,2});" , lambda _ : decodeHex (_ .group (1 ) if len (_ .group (1 )) == 2 else "0%s" % _ .group (1 )), page )
340- page = re .sub (b"&#(\\ d{1,3});" , lambda _ : six .int2byte (int (_ .group (1 ))) if int (_ .group (1 )) < 256 else _ .group (0 ), page )
341-
342- # e.g. %20%28%29
343- if percentDecode :
344- if b"%" in page :
345- page = re .sub (b"%([0-9a-fA-F]{2})" , lambda _ : decodeHex (_ .group (1 )), page )
346-
347- # e.g. &amp;
348- page = re .sub (b"&([^;]+);" , lambda _ : six .int2byte (HTML_ENTITIES [getText (_ .group (1 ))]) if HTML_ENTITIES .get (getText (_ .group (1 )), 256 ) < 256 else _ .group (0 ), page )
349-
350- kb .pageEncoding = kb .pageEncoding or checkCharEncoding (getHeuristicCharEncoding (page ))
351-
352- if (kb .pageEncoding or "" ).lower () == "utf-8-sig" :
353- kb .pageEncoding = "utf-8"
354- if page and page .startswith ("\xef \xbb \xbf " ): # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
355- page = page [3 :]
356-
357- page = getUnicode (page , kb .pageEncoding )
358-
359- # e.g. &#8217;&#8230;&#8482;
360- if "&#" in page :
361- def _ (match ):
362- retVal = match .group (0 )
363- try :
364- retVal = _unichr (int (match .group (1 )))
365- except (ValueError , OverflowError ):
366- pass
367- return retVal
368- page = re .sub (r"&#(\d+);" , _ , page )
369-
370- # e.g. &zeta;
371- page = re .sub (r"&([^;]+);" , lambda _ : _unichr (HTML_ENTITIES [_ .group (1 )]) if HTML_ENTITIES .get (_ .group (1 ), 0 ) > 255 else _ .group (0 ), page )
337+ if not kb .disableHtmlDecoding :
338+ # e.g. &#x9;&#195;&#235;&#224;&#226;&#224;
339+ if b"&#" in page :
340+ page = re .sub (b"&#x([0-9a-f]{1,2});" , lambda _ : decodeHex (_ .group (1 ) if len (_ .group (1 )) == 2 else "0%s" % _ .group (1 )), page )
341+ page = re .sub (b"&#(\\ d{1,3});" , lambda _ : six .int2byte (int (_ .group (1 ))) if int (_ .group (1 )) < 256 else _ .group (0 ), page )
342+
343+ # e.g. %20%28%29
344+ if percentDecode :
345+ if b"%" in page :
346+ page = re .sub (b"%([0-9a-fA-F]{2})" , lambda _ : decodeHex (_ .group (1 )), page )
347+
348+ # e.g. &amp;
349+ page = re .sub (b"&([^;]+);" , lambda _ : six .int2byte (HTML_ENTITIES [getText (_ .group (1 ))]) if HTML_ENTITIES .get (getText (_ .group (1 )), 256 ) < 256 else _ .group (0 ), page )
350+
351+ kb .pageEncoding = kb .pageEncoding or checkCharEncoding (getHeuristicCharEncoding (page ))
352+
353+ if (kb .pageEncoding or "" ).lower () == "utf-8-sig" :
354+ kb .pageEncoding = "utf-8"
355+ if page and page .startswith ("\xef \xbb \xbf " ): # Reference: https://docs.python.org/2/library/codecs.html (Note: noticed problems when "utf-8-sig" is left to Python for handling)
356+ page = page [3 :]
357+
358+ page = getUnicode (page , kb .pageEncoding )
359+
360+ # e.g. &#8217;&#8230;&#8482;
361+ if "&#" in page :
362+ def _ (match ):
363+ retVal = match .group (0 )
364+ try :
365+ retVal = _unichr (int (match .group (1 )))
366+ except (ValueError , OverflowError ):
367+ pass
368+ return retVal
369+ page = re .sub (r"&#(\d+);" , _ , page )
370+
371+ # e.g. &zeta;
372+ page = re .sub (r"&([^;]+);" , lambda _ : _unichr (HTML_ENTITIES [_ .group (1 )]) if HTML_ENTITIES .get (_ .group (1 ), 0 ) > 255 else _ .group (0 ), page )
373+ else :
374+ page = getUnicode (page , kb .pageEncoding )
372375
373376 return page
374377
0 commit comments