@@ -112,6 +112,13 @@ def updateURLs(self, tree):
             newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls
 
+        toVisit = self.check_robots(urls)
+        toVisit = self.check_headers(toVisit)
+
+        self.visitedURLs.update(urls)
+        self.unvisitedURLs.update(toVisit)
+
+    def check_headers(self, urls):
         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
@@ -128,8 +135,13 @@ def updateURLs(self, tree):
                        'html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])
 
+        return toVisit
+
+    def check_robots(self, urls):
         # Now check we are allowed to spider the page
-        for url in list(toVisit):
+        toVisit = list(urls)
+
+        for url in toVisit:
             robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
             robotURL = urllib_parse.urlunsplit(robotURL)
@@ -138,15 +150,14 @@ def updateURLs(self, tree):
                 self.robotParser.read()
             except Exception as e:
                 print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)
                 continue
 
             if not self.robotParser.can_fetch("*", url):
                 print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
-                toVisit.remove(url)
+                urls.remove(url)
 
-        self.visitedURLs.update(urls)
-        self.unvisitedURLs.update(toVisit)
+        return urls
 
 
 def main():
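
For reference, the per-URL robots.txt check that the new check_robots helper performs can be reproduced in isolation with the standard library's robots.txt parser, which the spider already relies on via self.robotParser. The sketch below is a minimal standalone version under that assumption (the function name allowed_by_robots and the example URL are hypothetical, not part of this change); as in the diff, a robots.txt that cannot be read means the URL is skipped.

import sys
from urllib import parse as urllib_parse, robotparser

def allowed_by_robots(url, user_agent="*"):
    # Build the site's robots.txt URL from the scheme and host of the page URL.
    robot_url = urllib_parse.urlunsplit(
        urllib_parse.urlsplit(url)[:2] + ("robots.txt", "", ""))
    parser = robotparser.RobotFileParser()
    parser.set_url(robot_url)
    try:
        parser.read()
    except Exception as e:
        # Mirror the diff's behaviour: an unreadable robots.txt means skip the URL.
        print('Failed to read {0}: {1}'.format(robot_url, e), file=sys.stderr)
        return False
    return parser.can_fetch(user_agent, url)

# Hypothetical usage:
# allowed_by_robots("http://example.com/some/page.html")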
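
Similarly, the filter that check_headers applies (keep only URLs that answered a HEAD request with status 200 and an HTML content type) can be sketched with the standard library alone. The function name html_ok is hypothetical, and the spider itself issues the HEAD request through its own HTTP client rather than urllib.request.

import urllib.request

def html_ok(url):
    # HEAD request; accept only a 200 response with an HTML content type.
    req = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(req) as resp:
            return (resp.status == 200 and
                    'html' in resp.headers.get('Content-Type', ''))
    except OSError:
        # Treat unreachable URLs the same as non-HTML ones.
        return False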