Commit 663f6c2

Skip Montanaro committed
rewrite of robotparser.py by Bastian Kleineidam. Closes patch 102229.
1 parent a5d23a1 commit 663f6c2

1 file changed: 179 additions & 60 deletions

Lib/robotparser.py

@@ -1,17 +1,28 @@
-"""
+""" robotparser.py
+
+    Copyright (C) 2000  Bastian Kleineidam
 
-Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
-input, builds a set of rules from that list, then answers questions about
-fetchability of other URLs.
+    You can choose between two licenses when using this package:
+    1) GNU GPLv2
+    2) PYTHON 2.0 OPEN SOURCE LICENSE
 
+    The robots.txt Exclusion Protocol is implemented as specified in
+    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
+import re,string,urlparse,urllib
 
-class RobotFileParser:
+debug = 0
 
-    def __init__(self):
-        self.rules = {}
-        self.debug = 0
-        self.url = ''
+def _debug(msg):
+    if debug: print msg
+
+
+class RobotFileParser:
+    def __init__(self, url=''):
+        self.entries = []
+        self.disallow_all = 0
+        self.allow_all = 0
+        self.set_url(url)
         self.last_checked = 0
 
     def mtime(self):
@@ -23,75 +34,183 @@ def modified(self):
 
     def set_url(self, url):
         self.url = url
+        self.host, self.path = urlparse.urlparse(url)[1:3]
 
     def read(self):
-        import urllib
-        self.parse(urllib.urlopen(self.url).readlines())
+        import httplib
+        tries = 0
+        while tries<5:
+            connection = httplib.HTTP(self.host)
+            connection.putrequest("GET", self.path)
+            connection.putheader("Host", self.host)
+            connection.endheaders()
+            status, text, mime = connection.getreply()
+            if status in [301,302] and mime:
+                tries = tries + 1
+                newurl = mime.get("Location", mime.get("Uri", ""))
+                newurl = urlparse.urljoin(self.url, newurl)
+                self.set_url(newurl)
+            else:
+                break
+        if status==401 or status==403:
+            self.disallow_all = 1
+        elif status>=400:
+            self.allow_all = 1
+        else:
+            # status < 400
+            self.parse(connection.getfile().readlines())
 
     def parse(self, lines):
-        """parse the input lines from a robot.txt file"""
-        import string, re
-        active = []
+        """parse the input lines from a robot.txt file.
+           We allow that a user-agent: line is not preceded by
+           one or more blank lines."""
+        state = 0
+        linenumber = 0
+        entry = Entry()
+
         for line in lines:
-            if self.debug: print '>', line,
-            # blank line terminates current record
-            if not line[:-1]:
-                active = []
-                continue
+            line = string.strip(line)
+            linenumber = linenumber + 1
+            if not line:
+                if state==1:
+                    _debug("line %d: warning: you should insert"
+                           " allow: or disallow: directives below any"
+                           " user-agent: line" % linenumber)
+                    entry = Entry()
+                    state = 0
+                elif state==2:
+                    self.entries.append(entry)
+                    entry = Entry()
+                    state = 0
             # remove optional comment and strip line
-            line = string.strip(line[:string.find(line, '#')])
+            i = string.find(line, '#')
+            if i>=0:
+                line = line[:i]
+            line = string.strip(line)
             if not line:
                 continue
-            line = re.split(' *: *', line)
+            line = string.split(line, ':', 1)
             if len(line) == 2:
-                line[0] = string.lower(line[0])
-                if line[0] == 'user-agent':
-                    # this record applies to this user agent
-                    if self.debug: print '>> user-agent:', line[1]
-                    active.append(line[1])
-                    if not self.rules.has_key(line[1]):
-                        self.rules[line[1]] = []
-                elif line[0] == 'disallow':
-                    if line[1]:
-                        if self.debug: print '>> disallow:', line[1]
-                        for agent in active:
-                            self.rules[agent].append(re.compile(line[1]))
+                line[0] = string.lower(string.strip(line[0]))
+                line[1] = string.strip(line[1])
+                if line[0] == "user-agent":
+                    if state==2:
+                        _debug("line %d: warning: you should insert a blank"
+                               " line before any user-agent"
+                               " directive" % linenumber)
+                        self.entries.append(entry)
+                        entry = Entry()
+                    entry.useragents.append(line[1])
+                    state = 1
+                elif line[0] == "disallow":
+                    if state==0:
+                        _debug("line %d: error: you must insert a user-agent:"
+                               " directive before this line" % linenumber)
+                    else:
+                        entry.rulelines.append(RuleLine(line[1], 0))
+                        state = 2
+                elif line[0] == "allow":
+                    if state==0:
+                        _debug("line %d: error: you must insert a user-agent:"
+                               " directive before this line" % linenumber)
                     else:
-                        pass
-                        for agent in active:
-                            if self.debug: print '>> allow', agent
-                            self.rules[agent] = []
+                        entry.rulelines.append(RuleLine(line[1], 1))
                 else:
-                    if self.debug: print '>> unknown:', line
+                    _debug("line %d: warning: unknown key %s" % (linenumber,
+                               line[0]))
+            else:
+                _debug("line %d: error: malformed line %s"%(linenumber, line))
+        if state==2:
+            self.entries.append(entry)
+        _debug("Parsed rules:\n%s" % str(self))
 
-        self.modified()
 
-    # returns true if agent is allowed to fetch url
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        import urlparse
-        ag = useragent
-        if not self.rules.has_key(ag): ag = '*'
-        if not self.rules.has_key(ag):
-            if self.debug: print '>> allowing', url, 'fetch by', useragent
+        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
+        if self.disallow_all:
+            return 0
+        if self.allow_all:
             return 1
-        path = urlparse.urlparse(url)[2]
-        for rule in self.rules[ag]:
-            if rule.match(path) is not None:
-                if self.debug: print '>> disallowing', url, 'fetch by', useragent
-                return 0
-        if self.debug: print '>> allowing', url, 'fetch by', useragent
+        # search for given user agent matches
+        # the first match counts
+        useragent = string.lower(useragent)
+        url = urllib.quote(urlparse.urlparse(url)[2])
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.allowance(url)
+        # agent not found ==> access granted
        return 1
 
+
+    def __str__(self):
+        ret = ""
+        for entry in self.entries:
+            ret = ret + str(entry) + "\n"
+        return ret
+
+
+class RuleLine:
+    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
+       (allowance==0) followed by a path."""
+    def __init__(self, path, allowance):
+        self.path = urllib.quote(path)
+        self.allowance = allowance
+
+    def applies_to(self, filename):
+        return self.path=="*" or re.match(self.path, filename)
+
+    def __str__(self):
+        return (self.allowance and "Allow" or "Disallow")+": "+self.path
+
+
+class Entry:
+    """An entry has one or more user-agents and zero or more rulelines"""
+    def __init__(self):
+        self.useragents = []
+        self.rulelines = []
+
+    def __str__(self):
+        ret = ""
+        for agent in self.useragents:
+            ret = ret + "User-agent: "+agent+"\n"
+        for line in self.rulelines:
+            ret = ret + str(line) + "\n"
+        return ret
+
+    def applies_to(self, useragent):
+        "check if this entry applies to the specified agent"
+        for agent in self.useragents:
+            if agent=="*":
+                return 1
+            if re.match(agent, useragent):
+                return 1
+        return 0
+
+    def allowance(self, filename):
+        """Preconditions:
+        - our agent applies to this entry
+        - filename is URL decoded"""
+        for line in self.rulelines:
+            if line.applies_to(filename):
+                return line.allowance
+        return 1
+
+
 def _test():
+    global debug
+    import sys
     rp = RobotFileParser()
-    rp.debug = 1
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
-    print rp.rules
-    print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
-    print rp.can_fetch('Musi-Cal-Robot',
-               'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
-
-if __name__ == "__main__":
+    debug = 1
+    if len(sys.argv) <= 1:
+        rp.set_url('http://www.musi-cal.com/robots.txt')
+        rp.read()
+    else:
+        rp.parse(open(sys.argv[1]).readlines())
+    print rp.can_fetch('*', 'http://www.musi-cal.com/')
+    print rp.can_fetch('Musi-Cal-Robot/1.0',
+                       'http://www.musi-cal.com/cgi-bin/event-search'
+                       '?city=San+Francisco')
+
+if __name__ == '__main__':
     _test()
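
For context, here is the record format the rewritten parser consumes: '#' starts a comment, blank lines separate entries, and each entry is one or more User-agent: lines followed by Allow: or Disallow: rules. This sample robots.txt is illustrative only (hypothetical host and agent names), not part of the commit:

    # robots.txt for http://www.example.com/ (hypothetical)

    User-agent: *
    Disallow: /cgi-bin
    Disallow: /tmp

    User-agent: ExampleBot
    Allow: /public
    Disallow: /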

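A minimal usage sketch of the new API, mirroring the _test() driver above (assuming the module is importable as robotparser; the agent name and URLs below are made up):

    import robotparser

    rp = robotparser.RobotFileParser()
    # parse() takes the raw lines of a robots.txt file
    rp.parse([
        "User-agent: *",
        "Disallow: /cgi-bin",
    ])
    # prints 0: /cgi-bin/search matches the Disallow rule for '*'
    print rp.can_fetch("SomeBot/1.0", "http://www.example.com/cgi-bin/search")
    # prints 1: no rule line matches /index.html
    print rp.can_fetch("SomeBot/1.0", "http://www.example.com/index.html")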
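One semantic detail worth noting in the code above (my reading, demonstrated with made-up paths): agent names and rule paths are tested with re.match, so a Disallow: path acts as a regular-expression prefix of the quoted URL path rather than as a literal directory prefix:

    import robotparser

    line = robotparser.RuleLine("/tmp", 0)   # equivalent to "Disallow: /tmp"
    print line.applies_to("/tmp/scratch") is not None   # prints 1: prefix match
    print line.applies_to("/tmp2/junk") is not None     # prints 1: regex prefix, not a directory test
    print line.applies_to("/home") is not None          # prints 0: no match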