@@ -227,23 +227,27 @@ def decodePage(page, contentEncoding, contentType):
227227 kb .pageEncoding = conf .charset
228228
229229 # can't do for all responses because we need to support binary files too
230- if contentType and not isinstance (page , unicode ) and any ( map ( lambda _ : _ in contentType .lower (), ( "text/txt" , "text/raw" , "text/html" , "text/xml" )) ):
230+ if contentType and not isinstance (page , unicode ) and "text/" in contentType .lower ():
231231 # e.g. &#195;&#235;&#224;&#226;&#224;
232232 if "&#" in page :
233- page = re .sub ('&#(\d{1,3});' , lambda _ : chr (int (_ .group (1 ))) if int (_ .group (1 )) < 256 else _ .group (0 ), page )
233+ page = re .sub (r"&#(\d{1,3});" , lambda _ : chr (int (_ .group (1 ))) if int (_ .group (1 )) < 256 else _ .group (0 ), page )
234+
235+ # e.g. %20%28%29
236+ if "%" in page :
237+ page = re .sub (r"%([0-9a-fA-F]{2})" , lambda _ : _ .group (1 ).decode ("hex" ), page )
234238
235239 # e.g. &amp;
236- page = re .sub (' &([^;]+);' , lambda _ : chr (htmlEntities [_ .group (1 )]) if htmlEntities .get (_ .group (1 ), 256 ) < 256 else _ .group (0 ), page )
240+ page = re .sub (r" &([^;]+);" , lambda _ : chr (htmlEntities [_ .group (1 )]) if htmlEntities .get (_ .group (1 ), 256 ) < 256 else _ .group (0 ), page )
237241
238242 kb .pageEncoding = kb .pageEncoding or checkCharEncoding (getHeuristicCharEncoding (page ))
239243 page = getUnicode (page , kb .pageEncoding )
240244
241245 # e.g. &#8217;&#8230;&#8482;
242246 if "&#" in page :
243- page = re .sub (' &#(\d+);' , lambda _ : unichr (int (_ .group (1 ))), page )
247+ page = re .sub (r" &#(\d+);" , lambda _ : unichr (int (_ .group (1 ))), page )
244248
245249 # e.g. &zeta;
246- page = re .sub (' &([^;]+);' , lambda _ : unichr (htmlEntities [_ .group (1 )]) if htmlEntities .get (_ .group (1 ), 0 ) > 255 else _ .group (0 ), page )
250+ page = re .sub (r" &([^;]+);" , lambda _ : unichr (htmlEntities [_ .group (1 )]) if htmlEntities .get (_ .group (1 ), 0 ) > 255 else _ .group (0 ), page )
247251
248252 return page
249253
0 commit comments