File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -1399,7 +1399,8 @@ def sanitizeAsciiString(subject):
13991399def getFilteredPageContent (page , onlyText = True ):
14001400 retVal = page
14011401
1402- if isinstance (page , basestring ):
1402+ # only if the page's charset has been successfully identified
1403+ if isinstance (page , unicode ):
14031404 retVal = re .sub (r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else "" ), " " , page )
14041405
14051406 while retVal .find (" " ) != - 1 :
@@ -1412,7 +1413,8 @@ def getFilteredPageContent(page, onlyText=True):
14121413def getPageTextWordsSet (page ):
14131414 retVal = None
14141415
1415- if isinstance (page , basestring ):
1416+ # only if the page's charset has been successfully identified
1417+ if isinstance (page , unicode ):
14161418 page = getFilteredPageContent (page )
14171419 retVal = set (re .findall (r"\w+" , page ))
14181420
You can’t perform that action at this time.
0 commit comments