Commit 3edbb35

Added robots.txt support, using Skip Montanaro's parser.
Fixed occasional inclusion of unpicklable objects (Message in errors). Changed indent of a few messages.
1 parent bbf8c2f commit 3edbb35

1 file changed: Tools/webchecker/webchecker.py (38 additions & 3 deletions)
@@ -50,8 +50,13 @@
 
 Miscellaneous:
 
+- Webchecker honors the "robots.txt" convention.  Thanks to Skip
+Montanaro for his robotparser.py module (included in this directory)!
+The agent name is hardwired to "webchecker".  URLs that are disallowed
+by the robots.txt file are reported as external URLs.
+
 - Because the HTML parser is a bit slow, very large HTML files are
-skipped.  The size limit can be set with the -m option.
+skipped.  The size limit can be set with the -m option.
 
 - Before fetching a page, it guesses its type based on its extension.
 If it is a known extension and the type is not text/http, the page is
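The paragraph added in this hunk describes the behavior at the robots.txt level. As a quick illustration, here is a minimal sketch of the robotparser API the diff relies on; the rules are fed inline via parse() rather than fetched with set_url()/read(), and the sample rules and URLs are hypothetical:

    import robotparser

    rp = robotparser.RobotFileParser()
    # Inline rules stand in for a fetched /robots.txt file:
    rp.parse(["User-agent: *", "Disallow: /private/"])

    print rp.can_fetch("webchecker", "http://example.com/index.html")  # allowed
    print rp.can_fetch("webchecker", "http://example.com/private/x")   # disallowed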
@@ -103,13 +108,15 @@
 import formatter
 
 import mimetypes
+import robotparser
 
 
 # Tunable parameters
 DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
 MAXPAGE = 50000                                 # Ignore files bigger than this
 ROUNDSIZE = 50                                  # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"                 # Pickled checkpoint
+AGENTNAME = "webchecker"                        # Agent name for robots.txt parser
 
 
 # Global variables
@@ -208,11 +215,32 @@ def __init__(self):
         self.bad = {}
         self.urlopener = MyURLopener()
         self.round = 0
+        self.robots = {}
+
+    def __getstate__(self):
+        return (self.roots, self.todo, self.done,
+                self.ext, self.bad, self.round)
+
+    def __setstate__(self, state):
+        (self.roots, self.todo, self.done,
+         self.ext, self.bad, self.round) = state
+        for root in self.roots:
+            self.addrobot(root)
 
     def addroot(self, root):
         if root not in self.roots:
             self.roots.append(root)
             self.todo[root] = []
+            self.addrobot(root)
+
+    def addrobot(self, root):
+        self.robots[root] = rp = robotparser.RobotFileParser()
+        if verbose > 3:
+            print "Parsing robots.txt file"
+            rp.debug = 1
+        url = urlparse.urljoin(root, "/robots.txt")
+        rp.set_url(url)
+        rp.read()
 
     def run(self):
         while self.todo:
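The __getstate__/__setstate__ pair added here keeps self.robots out of the pickled checkpoint and rebuilds the parsers on load by re-running addrobot() for each root. A minimal standalone sketch of that pattern (the Toy class below is hypothetical, not webchecker's actual class):

    import pickle

    class Toy:
        def __init__(self):
            self.roots = []
            self.robots = {}            # rebuilt on load, never pickled
        def addrobot(self, root):
            self.robots[root] = "parser for %s" % root
        def __getstate__(self):
            return (self.roots,)        # robots deliberately omitted
        def __setstate__(self, state):
            (self.roots,) = state
            for root in self.roots:
                self.addrobot(root)     # reconstruct parser state

    t = Toy()
    t.roots.append("http://example.com/")
    t.addrobot("http://example.com/")
    t2 = pickle.loads(pickle.dumps(t))
    print t2.robots                     # parsers rebuilt after unpickling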
@@ -332,7 +360,7 @@ def dopage(self, url):
     def inroots(self, url):
         for root in self.roots:
             if url[:len(root)] == root:
-                return 1
+                return self.robots[root].can_fetch(AGENTNAME, url)
         return 0
 
     def getpage(self, url):
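With this one-line change, a URL under a known root is followed only when robots.txt permits it; a disallowed URL falls through to the external-URL handling described in the documentation hunk above. A self-contained sketch of the revised check, with hypothetical roots and rules:

    import robotparser

    AGENTNAME = "webchecker"
    roots = ["http://example.com/"]
    robots = {}
    rp = robotparser.RobotFileParser()
    rp.parse(["User-agent: *", "Disallow: /private/"])
    robots[roots[0]] = rp

    def inroots(url):
        for root in roots:
            if url[:len(root)] == root:
                return robots[root].can_fetch(AGENTNAME, url)
        return 0

    print inroots("http://example.com/a.html")     # in root and allowed
    print inroots("http://example.com/private/b")  # disallowed, so external
    print inroots("http://elsewhere.org/c.html")   # outside all roots: 0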
@@ -348,6 +376,13 @@ def getpage(self, url):
         try:
             f = self.urlopener.open(url)
         except IOError, msg:
+            if (type(msg) == TupleType and
+                len(msg) >= 4 and
+                msg[0] == 'http error' and
+                type(msg[3]) == InstanceType):
+                # Remove the Message instance -- it may contain
+                # a file object which prevents pickling.
+                msg = msg[:3] + msg[4:]
             if verbose > 0:
                 print "Error ", msg
             if verbose > 0:
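The new guard strips the fourth element of urllib's ('http error', errcode, errmsg, headers) error tuple, since the headers object (a mimetools.Message in old urllib) may hold an open file object, and open files cannot be pickled into the checkpoint. A small self-contained sketch of the same cleanup (the Headers class is a hypothetical stand-in for mimetools.Message):

    from types import TupleType, InstanceType

    class Headers:                      # stand-in for mimetools.Message
        pass

    msg = ('http error', 404, 'Not Found', Headers())
    if (type(msg) == TupleType and len(msg) >= 4 and
            msg[0] == 'http error' and type(msg[3]) == InstanceType):
        msg = msg[:3] + msg[4:]         # drop the unpicklable instance
    print msg                           # ('http error', 404, 'Not Found')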
@@ -360,7 +395,7 @@ def getpage(self, url):
         ctype = string.lower(info['content-type'])
         if nurl != url:
             if verbose > 1:
-                print "Redirected to", nurl
+                print "  Redirected to", nurl
         if not ctype:
             ctype, encoding = mimetypes.guess_type(nurl)
         if ctype != 'text/html':
