7070links in <FORM> or <IMG> or whatever other tags might contain
7171hyperlinks. It does honor the <BASE> tag.
7272
73- - It could be argued that it should also check external links for
74- validity. This is true, but it is more error-prone. I think I will
75- make this an option in the future.
73+ - Checking external links is not done by default; use -x to enable
74+ this feature. It is off by default because checking external links
75+ usually takes a lot of time. When enabled, this check is executed during the
76+ report generation phase (so -x is ignored when -q is specified). Even
77+ when -x is enabled, only ``http:'' URLs are checked.
7678
7779
7880Usage: webchecker.py [option] ... [rooturl] ...
8587-q -- quiet operation (also suppresses external links report)
8688-r number -- number of links processed per round (default %(ROUNDSIZE)d)
8789-v -- verbose operation; repeating -v will increase verbosity
90+ -x -- check external links (during report phase)
8891
8992Arguments:
9093
@@ -131,9 +134,10 @@ def main():
131134 global verbose , maxpage , roundsize
132135 dumpfile = DUMPFILE
133136 restart = 0
137+ checkext = 0
134138
135139 try :
136- opts , args = getopt .getopt (sys .argv [1 :], 'Rd:m:qr:v ' )
140+ opts , args = getopt .getopt (sys .argv [1 :], 'Rd:m:qr:vx ' )
137141 except getopt .error , msg :
138142 sys .stdout = sys .stderr
139143 print msg
@@ -151,6 +155,8 @@ def main():
151155 roundsize = string .atoi (a )
152156 if o == '-v' :
153157 verbose = verbose + 1
158+ if o == '-x' :
159+ checkext = 1
154160
155161 if verbose :
156162 print AGENTNAME , "version" , __version__
@@ -180,8 +186,12 @@ def main():
180186 c .run ()
181187 except KeyboardInterrupt :
182188 if verbose > 0 :
183- print "[interrupted]"
184- c .report ()
189+ print "[run interrupted]"
190+ try :
191+ c .report (checkext )
192+ except KeyboardInterrupt :
193+ if verbose > 0 :
194+ print "[report interrupted]"
185195 if not needsave :
186196 if verbose > 0 :
187197 print
@@ -266,30 +276,42 @@ def run(self):
266276 self .done [url ] = self .todo [url ]
267277 del self .todo [url ]
268278
269- def report (self ):
279+ def report (self , checkext = 0 ):
270280 print
271281 if not self .todo : print "Final" ,
272282 else : print "Interim" ,
273283 print "Report (%d to do, %d done, %d external, %d bad)" % (
274284 len (self .todo ), len (self .done ),
275285 len (self .ext ), len (self .bad ))
276286 if verbose > 0 :
277- self .report_extrefs ()
287+ self .report_extrefs (checkext )
278288 # Report errors last because the output may get truncated
279289 self .report_errors ()
280290
281- def report_extrefs (self ):
291+ def report_extrefs (self , checkext = 0 ):
282292 if not self .ext :
283293 print
284294 print "No external URLs"
285295 return
286296 print
287- print "External URLs:"
297+ if checkext :
298+ print "External URLs (checking validity):"
299+ else :
300+ print "External URLs (not checked):"
288301 print
289302 urls = self .ext .keys ()
290303 urls .sort ()
291304 for url in urls :
292305 show ("HREF " , url , " from" , self .ext [url ])
306+ if not checkext :
307+ continue
308+ if verbose > 2 : print "Checking" , url , "..."
309+ try :
310+ f = self .urlopener .open (url )
311+ f .close ()
312+ if verbose > 3 : print "OK"
313+ except IOError , msg :
314+ print "Error:" , msg
293315
294316 def report_errors (self ):
295317 if not self .bad :
0 commit comments