Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit de66268

Browse files
committed
Added -x option to check external links. Slooooow!
1 parent 325a64f commit de66268

1 file changed

Lines changed: 32 additions & 10 deletions

File tree

Tools/webchecker/webchecker.py

Lines changed: 32 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -70,9 +70,11 @@
7070
links in <FORM> or <IMG> or whatever other tags might contain
7171
hyperlinks. It does honor the <BASE> tag.
7272
73-
- It could be argued that it should also check external links for
74-
validity. This is true, but it is more error-prone. I think I will
75-
make this an option in the future.
73+
- Checking external links is not done by default; use -x to enable
74+
this feature. This is done because checking external links usually
75+
takes a lot of time. When enabled, this check is executed during the
76+
report generation phase (so -x is ignored when -q is specified). Even
77+
when -x is enabled, only ``http:'' URLs are checked.
7678
7779
7880
Usage: webchecker.py [option] ... [rooturl] ...
@@ -85,6 +87,7 @@
8587
-q -- quiet operation (also suppresses external links report)
8688
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
8789
-v -- verbose operation; repeating -v will increase verbosity
90+
-x -- check external links (during report phase)
8891
8992
Arguments:
9093
@@ -131,9 +134,10 @@ def main():
131134
global verbose, maxpage, roundsize
132135
dumpfile = DUMPFILE
133136
restart = 0
137+
checkext = 0
134138

135139
try:
136-
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v')
140+
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:vx')
137141
except getopt.error, msg:
138142
sys.stdout = sys.stderr
139143
print msg
@@ -151,6 +155,8 @@ def main():
151155
roundsize = string.atoi(a)
152156
if o == '-v':
153157
verbose = verbose + 1
158+
if o == '-x':
159+
checkext = 1
154160

155161
if verbose:
156162
print AGENTNAME, "version", __version__
@@ -180,8 +186,12 @@ def main():
180186
c.run()
181187
except KeyboardInterrupt:
182188
if verbose > 0:
183-
print "[interrupted]"
184-
c.report()
189+
print "[run interrupted]"
190+
try:
191+
c.report(checkext)
192+
except KeyboardInterrupt:
193+
if verbose > 0:
194+
print "[report interrupted]"
185195
if not needsave:
186196
if verbose > 0:
187197
print
@@ -266,30 +276,42 @@ def run(self):
266276
self.done[url] = self.todo[url]
267277
del self.todo[url]
268278

269-
def report(self):
279+
def report(self, checkext=0):
270280
print
271281
if not self.todo: print "Final",
272282
else: print "Interim",
273283
print "Report (%d to do, %d done, %d external, %d bad)" % (
274284
len(self.todo), len(self.done),
275285
len(self.ext), len(self.bad))
276286
if verbose > 0:
277-
self.report_extrefs()
287+
self.report_extrefs(checkext)
278288
# Report errors last because the output may get truncated
279289
self.report_errors()
280290

281-
def report_extrefs(self):
291+
def report_extrefs(self, checkext=0):
282292
if not self.ext:
283293
print
284294
print "No external URLs"
285295
return
286296
print
287-
print "External URLs:"
297+
if checkext:
298+
print "External URLs (checking validity):"
299+
else:
300+
print "External URLs (not checked):"
288301
print
289302
urls = self.ext.keys()
290303
urls.sort()
291304
for url in urls:
292305
show("HREF ", url, " from", self.ext[url])
306+
if not checkext:
307+
continue
308+
if verbose > 2: print "Checking", url, "..."
309+
try:
310+
f = self.urlopener.open(url)
311+
f.close()
312+
if verbose > 3: print "OK"
313+
except IOError, msg:
314+
print "Error:", msg
293315

294316
def report_errors(self):
295317
if not self.bad:

0 commit comments

Comments
 (0)