@@ -50,8 +50,13 @@
 
 Miscellaneous:
 
+- Webchecker honors the "robots.txt" convention.  Thanks to Skip
+  Montanaro for his robotparser.py module (included in this directory)!
+  The agent name is hardwired to "webchecker".  URLs that are disallowed
+  by the robots.txt file are reported as external URLs.
+
 - Because the HTML parser is a bit slow, very large HTML files are
-  skipped.  The size limit can be set with the -m option.
+  skipped.  The size limit can be set with the -m option.
 
 - Before fetching a page, it guesses its type based on its extension.
   If it is a known extension and the type is not text/html, the page is
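
The effect of the new rule is easiest to see with robotparser by itself.
A minimal sketch (the example.com URLs and the canned rules are made up
for illustration; parse() is fed the file's lines directly, which is what
read() does after fetching the URL passed to set_url()):

    import robotparser

    rp = robotparser.RobotFileParser()
    # Canned rules instead of a live /robots.txt fetch; webchecker
    # itself calls rp.set_url(...) and rp.read() in addrobot().
    rp.parse(["User-agent: *", "Disallow: /private/"])
    print rp.can_fetch("webchecker", "http://example.com/private/a.html")  # 0
    print rp.can_fetch("webchecker", "http://example.com/pub/b.html")      # 1

A disallowed URL is not dropped silently; as the README note above says,
it is reported as an external URL.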
@@ -103,13 +108,15 @@
 import formatter
 
 import mimetypes
+import robotparser
 
 
 # Tunable parameters
 DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
 MAXPAGE = 50000                         # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
+AGENTNAME = "webchecker"                # Agent name for robots.txt parser
 
 
 # Global variables
@@ -208,11 +215,32 @@ def __init__(self):
         self.bad = {}
         self.urlopener = MyURLopener()
         self.round = 0
+        self.robots = {}
+
+    def __getstate__(self):
+        return (self.roots, self.todo, self.done,
+                self.ext, self.bad, self.round)
+
+    def __setstate__(self, state):
+        (self.roots, self.todo, self.done,
+         self.ext, self.bad, self.round) = state
+        for root in self.roots:
+            self.addrobot(root)
 
     def addroot(self, root):
         if root not in self.roots:
             self.roots.append(root)
             self.todo[root] = []
+            self.addrobot(root)
+
+    def addrobot(self, root):
+        self.robots[root] = rp = robotparser.RobotFileParser()
+        if verbose > 3:
+            print "Parsing robots.txt file"
+            rp.debug = 1
+        url = urlparse.urljoin(root, "/robots.txt")
+        rp.set_url(url)
+        rp.read()
 
     def run(self):
         while self.todo:
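
The __getstate__/__setstate__ pair exists because the RobotFileParser
objects in self.robots may hold unpicklable state, so they are left out
of the checkpoint and rebuilt when it is loaded.  A rough sketch of the
round trip (the root URL is hypothetical, and note that unpickling
re-fetches each root's /robots.txt through addrobot()):

    import pickle

    c = Checker()
    c.addroot("http://example.com/")   # also parses http://example.com/robots.txt
    data = pickle.dumps(c)             # __getstate__ omits the robots dict
    c2 = pickle.loads(data)            # __setstate__ rebuilds it via addrobot()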
@@ -332,7 +360,7 @@ def dopage(self, url):
     def inroots(self, url):
         for root in self.roots:
             if url[:len(root)] == root:
-                return 1
+                return self.robots[root].can_fetch(AGENTNAME, url)
         return 0
 
     def getpage(self, url):
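
inroots() thus no longer answers just "is this URL under one of my
roots?": a URL under a root counts as internal only if that root's
robots.txt lets the "webchecker" agent fetch it.  Roughly, assuming a
hypothetical root whose robots.txt disallows /private/:

    c = Checker()
    c.addroot("http://example.com/")
    c.inroots("http://example.com/pub/a.html")      # 1: in a root, fetch allowed
    c.inroots("http://example.com/private/b.html")  # 0: disallowed, so treated as external
    c.inroots("http://other.org/c.html")            # 0: not under any root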
@@ -348,6 +376,13 @@ def getpage(self, url):
         try:
             f = self.urlopener.open(url)
         except IOError, msg:
+            if (type(msg) == TupleType and
+                len(msg) >= 4 and
+                msg[0] == 'http error' and
+                type(msg[3]) == InstanceType):
+                # Remove the Message instance -- it may contain
+                # a file object which prevents pickling.
+                msg = msg[:3] + msg[4:]
             if verbose > 0:
                 print "Error ", msg
             if verbose > 0:
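
The tuple surgery targets urllib's 'http error' IOErrors, whose fourth
element is the Message object carrying the response headers; that object
may hang on to an open file, which pickle refuses to serialize.  A toy
illustration of the slice (the Headers class merely stands in for the
real Message instance):

    class Headers:                     # stand-in for the headers Message
        pass

    msg = ('http error', 404, 'Not Found', Headers())
    msg = msg[:3] + msg[4:]
    print msg                          # ('http error', 404, 'Not Found')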
@@ -360,7 +395,7 @@ def getpage(self, url):
         ctype = string.lower(info['content-type'])
         if nurl != url:
             if verbose > 1:
-                print "Redirected to", nurl
+                print " Redirected to", nurl
         if not ctype:
             ctype, encoding = mimetypes.guess_type(nurl)
         if ctype != 'text/html':
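
The fallback uses mimetypes.guess_type(), which maps a name's extension
to a (type, encoding) pair; this is the same guess the README describes
making before a page is fetched.  For instance (URLs made up):

    import mimetypes

    print mimetypes.guess_type("http://example.com/logo.gif")    # ('image/gif', None)
    print mimetypes.guess_type("http://example.com/page.html")   # ('text/html', None)
    print mimetypes.guess_type("http://example.com/src.tar.gz")  # ('application/x-tar', 'gzip')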