Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ebe8ee3

Browse files
committed
Fix for crawler and redirection case
1 parent 09e7f4f commit ebe8ee3

3 files changed

Lines changed: 5 additions & 1 deletion

File tree

lib/core/threads.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def reset(self):
4949
self.lastQueryDuration = 0
5050
self.lastRequestMsg = None
5151
self.lastRequestUID = 0
52+
self.lastRedirectURL = None
5253
self.resumed = False
5354
self.retriesCount = 0
5455
self.seqMatcher = difflib.SequenceMatcher(None)

lib/request/redirecthandler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,10 @@ def http_error_302(self, req, fp, code, msg, headers):
117117
else:
118118
result = fp
119119

120+
threadData.lastRedirectURL = (threadData.lastRequestUID, redurl)
121+
120122
result.redcode = code
121123
result.redurl = redurl
122-
123124
return result
124125

125126
http_error_301 = http_error_303 = http_error_307 = http_error_302

lib/utils/crawler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ def crawlThread():
7272
href = tag.get("href") if hasattr(tag, "get") else tag.group("href")
7373

7474
if href:
75+
if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
76+
current = threadData.lastRedirectURL[1]
7577
url = urlparse.urljoin(current, href)
7678

7779
# flag to know if we are dealing with the same target host

0 commit comments

Comments
 (0)