Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit af5fe45

Browse files
committed
revert of the revert (it's a good idea to have it like this because of problems with e.g. --text-only and binary content)
1 parent 8ec4bc9 commit af5fe45

1 file changed

Lines changed: 4 additions & 2 deletions

File tree

lib/core/common.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1399,7 +1399,8 @@ def sanitizeAsciiString(subject):
1399 1399
def getFilteredPageContent(page, onlyText=True):
1400 1400
retVal = page
1401 1401

1402-
if isinstance(page, basestring):
1402+
# only if the page's charset has been successfully identified
1403+
if isinstance(page, unicode):
1403 1404
retVal = re.sub(r"(?s)<script.+?</script>|<!--.+?-->|<style.+?</style>%s" % (r"|<[^>]+>|\t|\n|\r" if onlyText else ""), " ", page)
1404 1405

1405 1406
while retVal.find(" ") != -1:
@@ -1412,7 +1413,8 @@ def getFilteredPageContent(page, onlyText=True):
1412 1413
def getPageTextWordsSet(page):
1413 1414
retVal = None
1414 1415

1415-
if isinstance(page, basestring):
1416+
# only if the page's charset has been successfully identified
1417+
if isinstance(page, unicode):
1416 1418
page = getFilteredPageContent(page)
1417 1419
retVal = set(re.findall(r"\w+", page))
1418 1420

0 commit comments

Comments
 (0)