Commit 0116f99

Check robots.txt before many HEAD requests
1 parent 7ee9026 commit 0116f99

File tree

1 file changed: utils/spider.py

Lines changed: 16 additions & 5 deletions
@@ -112,6 +112,13 @@ def updateURLs(self, tree):
             newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls

+        toVisit = self.check_robots(urls)
+        toVisit = self.check_headers(toVisit)
+
+        self.visitedURLs.update(urls)
+        self.unvisitedURLs.update(toVisit)
+
+    def check_headers(self, urls):
         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
@@ -128,8 +135,13 @@ def updateURLs(self, tree):
                        'html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])

+        return toVisit
+
+    def check_robots(self, urls):
         # Now check we are allowed to spider the page
-        for url in list(toVisit):
+        toVisit = list(urls)
+
+        for url in toVisit:
             robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
             robotURL = urllib_parse.urlunsplit(robotURL)
@@ -138,15 +150,14 @@ def updateURLs(self, tree):
                 self.robotParser.read()
             except Exception as e:
                 print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)
                 continue

             if not self.robotParser.can_fetch("*", url):
                 print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)

-        self.visitedURLs.update(urls)
-        self.unvisitedURLs.update(toVisit)
+        return urls


 def main():
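
The ordering is the point of the commit: a robots.txt file covers an entire host, so fetching it once can veto many URLs before any of them costs a per-URL HEAD round trip. Below is a minimal standalone sketch of the same check-robots-then-check-headers pattern using only the Python standard library. The function names mirror the commit's new methods, but the urllib.request/urllib.robotparser calls and the per-host parser cache are assumptions added for illustration, not part of the spider itself (which uses its own HTTP client and a shared self.robotParser).

import sys
import urllib.parse
import urllib.request
import urllib.robotparser


def check_robots(urls):
    # Drop URLs whose host's robots.txt disallows fetching them.
    # One parser per scheme://host is cached here; the cache is an
    # assumption, the commit itself re-reads robots.txt for every URL.
    allowed = []
    parsers = {}
    for url in urls:
        scheme, netloc = urllib.parse.urlsplit(url)[:2]
        robot_url = urllib.parse.urlunsplit((scheme, netloc, "robots.txt", "", ""))
        parser = parsers.get(robot_url)
        if parser is None:
            parser = urllib.robotparser.RobotFileParser(robot_url)
            try:
                parser.read()
            except Exception as e:
                print('Failed to read {0}: {1}'.format(robot_url, e), file=sys.stderr)
                continue
            parsers[robot_url] = parser
        if parser.can_fetch("*", url):
            allowed.append(url)
        else:
            print('{0} rejects {1}'.format(robot_url, url), file=sys.stderr)
    return allowed


def check_headers(urls):
    # Keep only URLs whose HEAD response is a 200 with an HTML content type,
    # mirroring the content-type/status filter in check_headers above.
    to_visit = []
    for url in urls:
        request = urllib.request.Request(url, method="HEAD")
        try:
            with urllib.request.urlopen(request) as response:
                if (response.status == 200 and
                        'html' in response.headers.get('content-type', '')):
                    to_visit.append(url)
        except Exception as e:
            print('Failed HEAD {0}: {1}'.format(url, e), file=sys.stderr)
    return to_visit


if __name__ == "__main__":
    # example.com is a placeholder; robots.txt is consulted first, so
    # disallowed pages never cost a HEAD request.
    urls = ["http://example.com/", "http://example.com/private"]
    print(check_headers(check_robots(urls)))

Run as a script, this prints only the URLs that are both allowed by robots.txt and look like live HTML pages. Swapping the two calls would issue a HEAD request even for pages robots.txt forbids, which is exactly what the commit avoids.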
