File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -1399,8 +1399,7 @@ def sanitizeAsciiString(subject):
13991399def getFilteredPageContent (page , onlyText = True ):
14001400 retVal = page
14011401
1402- # only if the page's charset had been successfully identified
1403- if isinstance (page , unicode ):
1402+ if isinstance (page , basestring ):
14041403 retVal = re .sub (r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else "" ), " " , page )
14051404
14061405 while retVal .find (" " ) != - 1 :
@@ -1413,8 +1412,7 @@ def getFilteredPageContent(page, onlyText=True):
14131412def getPageTextWordsSet (page ):
14141413 retVal = None
14151414
1416- # only if the page's charset had been successfully identified
1417- if isinstance (page , unicode ):
1415+ if isinstance (page , basestring ):
14181416 page = getFilteredPageContent (page )
14191417 retVal = set (re .findall (r"\w+" , page ))
14201418
You can’t perform that action at this time.
0 commit comments