@@ -1,22 +1,28 @@
 #!/usr/bin/env python
-"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
+"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree.
 
 usage:
 import spider
 s = spider.Spider()
 s.spider("http://www.google.com", maxURLs=100)
 """
+from __future__ import absolute_import, division, unicode_literals, print_function
 
-import urllib.request
-import urllib.error
-import urllib.parse
-import urllib.robotparser
-import md5
+import sys
 
-import httplib2
+try:
+    import urllib.parse as urllib_parse
+except ImportError:
+    import urlparse as urllib_parse
+try:
+    import urllib.robotparser as robotparser
+except ImportError:
+    import robotparser
+
+from hashlib import md5
 
+import httplib2
 import html5lib
-from html5lib.treebuilders import etree
 
 
 class Spider(object):
@@ -25,7 +31,7 @@ def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
         self.buggyURLs = set()
-        self.robotParser = urllib.robotparser.RobotFileParser()
+        self.robotParser = robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")
 
@@ -40,31 +46,39 @@ def run(self, initialURL, maxURLs=1000):
             if not self.unvisitedURLs:
                 break
             content = self.loadURL(self.unvisitedURLs.pop())
+        return urlNumber
 
     def parse(self, content):
         failed = False
-        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
+        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
         try:
             tree = p.parse(content)
-        except:
+        except Exception as e:
             self.buggyURLs.add(self.currentURL)
             failed = True
-            print("BUGGY:", self.currentURL)
+            print("BUGGY: {0}: {1}".format(self.currentURL, e), file=sys.stderr)
         self.visitedURLs.add(self.currentURL)
         if not failed:
             self.updateURLs(tree)
 
     def loadURL(self, url):
-        resp, content = self.http.request(url, "GET")
+        print('Processing {0}'.format(url), file=sys.stderr)
+        try:
+            resp, content = self.http.request(url, "GET")
+        except Exception as e:
+            print("Failed to fetch {0}: {1}".format(url, e), file=sys.stderr)
+            return None
+
         self.currentURL = url
-        digest = md5.md5(content).hexdigest()
+        digest = md5(content).hexdigest()
         if digest in self.contentDigest:
             content = None
             self.visitedURLs.add(url)
         else:
             self.contentDigest[digest] = url
 
-        if resp['status'] != "200":
+        if resp['status'] not in ('200', '304'):
+            print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
             content = None
 
         return content
@@ -75,9 +89,11 @@ def updateURLs(self, tree):
         have seen them before or not"""
         urls = set()
         # Remove all links we have already visited
-        for link in tree.findall(".//a"):
+        namespace = tree.tag[1:].split('}')[0]
+        links = list(tree.findall('.//{%s}a' % namespace))
+        for link in links:
             try:
-                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                url = urllib_parse.urldefrag(link.attrib['href'])[0]
                 if (url and url not in self.unvisitedURLs and url
                     not in self.visitedURLs):
                     urls.add(url)
@@ -88,38 +104,62 @@ def updateURLs(self, tree):
         # missing
         newUrls = set()
         for url in urls:
-            splitURL = list(urllib.parse.urlsplit(url))
+            splitURL = list(urllib_parse.urlsplit(url))
             if splitURL[0] != "http":
                 continue
             if splitURL[1] == "":
-                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
-            newUrls.add(urllib.parse.urlunsplit(splitURL))
+                splitURL[1] = urllib_parse.urlsplit(self.currentURL)[1]
+            newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls
 
         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
+            print('Checking {0}'.format(url), file=sys.stderr)
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError:
-                # Don't know why this happens
-                pass
+            except Exception as e:
+                print('Error fetching HEAD of {0}: {1}'.format(url, e), file=sys.stderr)
 
         # Remove links not of content-type html or pages not found
         # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                       " html" in responseHeaders[url]['content-type'] and
+                       ' html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])
 
         # Now check we are allowed to spider the page
-        for url in toVisit:
-            robotURL = list(urllib.parse.urlsplit(url)[:2])
+        for url in list(toVisit):
+            robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
-            robotURL = urllib.parse.urlunsplit(robotURL)
+            robotURL = urllib_parse.urlunsplit(robotURL)
             self.robotParser.set_url(robotURL)
+            try:
+                self.robotParser.read()
+            except Exception as e:
+                print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
+                toVisit.remove(url)
+                continue
+
             if not self.robotParser.can_fetch("*", url):
+                print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
                 toVisit.remove(url)
 
         self.visitedURLs.update(urls)
         self.unvisitedURLs.update(toVisit)
+
+
+def main():
+    max_urls = 100
+    s = Spider()
+    count = s.run("http://yahoo.com/", maxURLs=max_urls)
+    if s.buggyURLs:
+        print('Buggy URLs:')
+        print(' ' + '\n '.join(s.buggyURLs))
+        print('')
+    if count != max_urls:
+        print('{0} of {1} processed'.format(count, max_urls))
+    sys.exit(count == max_urls and len(s.buggyURLs) == 0)
+
+if __name__ == '__main__':
+    main()
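One change in this diff that may not be self-explanatory is the namespace handling in `updateURLs()`: html5lib's etree treebuilder puts elements in the XHTML namespace, so a bare `.//a` query no longer matches anything and the namespace has to be recovered from the root element's tag. A minimal standalone sketch of that technique follows; the sample markup and the variable names `doc`, `namespace`, and `link` are illustrative, not part of the patch.

```python
import html5lib

# Parse a tiny document with the default (etree) treebuilder.
doc = html5lib.parse('<p><a href="http://example.com/">link</a></p>')

# The root tag looks like '{http://www.w3.org/1999/xhtml}html'; strip the
# leading brace and split on the closing one to get the namespace URI.
namespace = doc.tag[1:].split('}')[0]

# Qualify the element name with the namespace when searching the tree.
for link in doc.findall('.//{%s}a' % namespace):
    print(link.attrib['href'])  # -> http://example.com/
```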