1- """
1+ """ robotparser.py
2+
3+ Copyright (C) 2000 Bastian Kleineidam
24
3- Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
4- input, builds a set of rules from that list, then answers questions about
5- fetchability of other URLs.
5+ You can choose between two licenses when using this package:
6+ 1) GNU GPLv2
7+ 2) PYTHON 2.0 OPEN SOURCE LICENSE
68
9+ The robots.txt Exclusion Protocol is implemented as specified in
10+ http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
711"""
12+ import re ,string ,urlparse ,urllib
813
9- class RobotFileParser :
14+ debug = 0
1015
11- def __init__ (self ):
12- self .rules = {}
13- self .debug = 0
14- self .url = ''
def _debug(msg):
    """Write msg to standard output, but only while the module-level
    debug flag holds a true value."""
    if not debug:
        return
    print(msg)
18+
19+
20+ class RobotFileParser :
21+ def __init__ (self , url = '' ):
22+ self .entries = []
23+ self .disallow_all = 0
24+ self .allow_all = 0
25+ self .set_url (url )
1526 self .last_checked = 0
1627
1728 def mtime (self ):
@@ -23,75 +34,183 @@ def modified(self):
2334
def set_url(self, url):
    """Remember url as the robots.txt location and keep its network
    location and path components in self.host and self.path."""
    self.url = url
    pieces = urlparse.urlparse(url)
    self.host = pieces[1]
    self.path = pieces[2]
2638
def read(self):
    """Fetch the robots.txt file over HTTP and feed it to parse().

    At most five requests are sent; a 301 or 302 reply that carries
    headers moves self.url (via set_url) to the redirect target and
    triggers another request.  Once the loop ends, a status of 401 or
    403 sets disallow_all, any other status of 400 or above sets
    allow_all, and every remaining status has its body parsed.
    """
    import httplib
    for _attempt in range(5):
        connection = httplib.HTTP(self.host)
        connection.putrequest("GET", self.path)
        connection.putheader("Host", self.host)
        connection.endheaders()
        status, text, mime = connection.getreply()
        if status not in [301, 302] or not mime:
            break
        # redirected: resolve the target against the current URL, then
        # go round again with the new host and path
        target = mime.get("Location", mime.get("Uri", ""))
        self.set_url(urlparse.urljoin(self.url, target))
    # NOTE(review): when the fifth reply is still a redirect, control
    # arrives here with that 3xx status and its body is what gets parsed.
    if status in (401, 403):
        self.disallow_all = 1
    elif status >= 400:
        self.allow_all = 1
    else:
        # status < 400
        self.parse(connection.getfile().readlines())
3062
def parse(self, lines):
    """parse the input lines from a robot.txt file.

    lines is a sequence of strings, one per line of the file.  Every
    completed record is appended to self.entries.  A record is a run of
    user-agent: lines followed by allow:/disallow: lines; it ends at a
    blank line, at the next user-agent: line, or at the end of input.
    We allow that a user-agent: line is not preceded by one or more
    blank lines.  Problems are reported through _debug() and the
    offending line is skipped.
    """
    # state 0: outside any record
    # state 1: user-agent line(s) seen, no rule line yet
    # state 2: at least one allow/disallow line seen
    state = 0
    linenumber = 0
    entry = Entry()

    for line in lines:
        line = line.strip()
        linenumber = linenumber + 1
        if not line:
            # a blank line terminates the current record
            if state == 1:
                _debug("line %d: warning: you should insert"
                       " allow: or disallow: directives below any"
                       " user-agent: line" % linenumber)
                entry = Entry()
                state = 0
            elif state == 2:
                self.entries.append(entry)
                entry = Entry()
                state = 0
            continue
        # remove optional comment and strip line
        i = line.find('#')
        if i >= 0:
            line = line[:i]
        line = line.strip()
        if not line:
            continue
        # Split on the bare colon: the value may be empty ("Disallow:")
        # or follow the colon directly ("Disallow:/tmp").  Both halves
        # are stripped below, so surrounding blanks do not matter.
        fields = line.split(':', 1)
        if len(fields) != 2:
            _debug("line %d: error: malformed line %s" % (linenumber, fields))
            continue
        key = fields[0].strip().lower()
        value = fields[1].strip()
        if key == "user-agent":
            if state == 2:
                _debug("line %d: warning: you should insert a blank"
                       " line before any user-agent"
                       " directive" % linenumber)
                self.entries.append(entry)
                entry = Entry()
            entry.useragents.append(value)
            state = 1
        elif key == "disallow":
            if state == 0:
                _debug("line %d: error: you must insert a user-agent:"
                       " directive before this line" % linenumber)
            else:
                entry.rulelines.append(RuleLine(value, 0))
                state = 2
        elif key == "allow":
            if state == 0:
                _debug("line %d: error: you must insert a user-agent:"
                       " directive before this line" % linenumber)
            else:
                entry.rulelines.append(RuleLine(value, 1))
                # an allow line is a rule too; without this a record
                # made only of allow lines would never be stored
                state = 2
        else:
            _debug("line %d: warning: unknown key %s" % (linenumber, key))
    # input may end without a trailing blank line
    if state == 2:
        self.entries.append(entry)
    _debug("Parsed rules:\n %s" % str(self))
66126
67- self .modified ()
68127
69- # returns true if agent is allowed to fetch url
def can_fetch(self, useragent, url):
    """using the parsed robots.txt decide if useragent can fetch url

    Returns 0 when fetching is refused and a true value when it is
    permitted.  The disallow_all / allow_all flags are honoured first;
    otherwise the first entry that applies to useragent decides.  When
    no entry applies, access is granted.
    """
    _debug("Checking robot.txt allowance for\n %s\n %s" % (useragent, url))
    if self.disallow_all:
        return 0
    if self.allow_all:
        return 1
    # search for given user agent matches
    # the first match counts
    useragent = useragent.lower()
    # Only the path component is matched against the rules.  A URL with
    # no path (e.g. "http://host") names the site root, so it is treated
    # as "/" to let a "Disallow: /" rule cover it.
    url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
    for entry in self.entries:
        if entry.applies_to(useragent):
            return entry.allowance(url)
    # agent not found ==> access granted
    return 1
85144
145+
146+ def __str__ (self ):
147+ ret = ""
148+ for entry in self .entries :
149+ ret = ret + str (entry ) + "\n "
150+ return ret
151+
152+
class RuleLine:
    """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
       (allowance==0) followed by a path."""

    def __init__(self, path, allowance):
        """Store the URL-quoted path together with its allowance flag.

        path is the value written after the directive; allowance is 1
        for "Allow:" and 0 for "Disallow:".
        """
        if path == '' and not allowance:
            # an empty Disallow value means nothing is disallowed
            allowance = 1
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return a true value when this rule covers filename.

        The stored path is compared as a literal prefix of filename, so
        characters such as "." carry no special meaning.
        """
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        """Return the rule in robots.txt notation."""
        if self.allowance:
            directive = "Allow"
        else:
            directive = "Disallow"
        return directive + ": " + self.path
165+
166+
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        # agent names exactly as they were given
        self.useragents = []
        # rule objects, kept in order; the first one that applies wins
        self.rulelines = []

    def __str__(self):
        """Return the entry in robots.txt notation: the user-agent
        lines first, then the rule lines."""
        parts = ["User-agent: " + agent + "\n " for agent in self.useragents]
        parts.extend([str(line) + "\n " for line in self.rulelines])
        return "".join(parts)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent

        Returns 1 when one of the entry's agent names is "*" or occurs,
        ignoring case, inside the product token of useragent (the part
        before the first "/"); returns 0 otherwise.
        """
        # drop the version information and normalise the case
        token = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == "*":
                # the catch-all agent
                return 1
            # Plain substring test: names from the file are literal
            # text, not patterns.  find() is used because a multi-
            # character "in" test is not available on old interpreters.
            if token.find(agent.lower()) != -1:
                return 1
        return 0

    def allowance(self, filename):
        """Return the allowance of the first rule line that applies to
        filename, or 1 when none does.

        Preconditions:
        - our agent applies to this entry
        - filename is the URL path in the same form the rule lines
          store theirs (can_fetch() passes it through urllib.quote())
        """
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return 1
198+
199+
def _test():
    """Small command-line self test.

    With no argument the robots.txt of www.musi-cal.com is fetched over
    the network; otherwise the file named by the first argument is
    parsed.  Debug output is switched on and the answers to two sample
    can_fetch() queries are printed.
    """
    global debug
    import sys
    rp = RobotFileParser()
    debug = 1
    if len(sys.argv) <= 1:
        rp.set_url('http://www.musi-cal.com/robots.txt')
        rp.read()
    else:
        rules = open(sys.argv[1])
        try:
            rp.parse(rules.readlines())
        finally:
            # release the file handle even when parsing fails
            rules.close()
    print(rp.can_fetch('*', 'http://www.musi-cal.com/'))
    print(rp.can_fetch('Musi-Cal-Robot/1.0',
                       'http://www.musi-cal.com/cgi-bin/event-search'
                       '?city=San+Francisco'))


if __name__ == '__main__':
    _test()
0 commit comments