Commit 7ee9026

Unbreak utils/spider.py
spider.py used both Python 2-only (md5) and Python 3-only (urllib) imports. It also didn't use a namespace when searching for links to spider, and never read robots.txt, which prevented any spidering from occurring. Fix an exception that occurred when robots.txt processing removed items from toVisit while iterating over it. Add more output on stderr, and a main() which spiders yahoo.com.
1 parent a3022dc commit 7ee9026
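For context on the namespace fix, here is a minimal sketch, not part of the commit, assuming html5lib is installed and using a made-up snippet of markup: html5lib puts parsed elements in the XHTML namespace by default, so the old un-namespaced .//a query returned no links at all, while a namespaced query finds them.

import html5lib

# html5lib namespaces HTML elements by default, so a parsed <a> ends up with
# a tag like '{http://www.w3.org/1999/xhtml}a' in the etree tree.
p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
tree = p.parse('<p><a href="/one">one</a></p>')  # hypothetical document

print(tree.findall('.//a'))                  # [] -- the old query matches nothing
namespace = tree.tag[1:].split('}')[0]       # namespace URI taken from the root <html> tag
print(tree.findall('.//{%s}a' % namespace))  # the namespaced query finds the link

The same namespace extraction is what updateURLs() now does in the diff below.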


utils/spider.py

Lines changed: 67 additions & 27 deletions
@@ -1,22 +1,28 @@
 #!/usr/bin/env python
-"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree
+"""Spider to try and find bugs in the parser. Requires httplib2 and elementtree.

 usage:
 import spider
 s = spider.Spider()
 s.spider("http://www.google.com", maxURLs=100)
 """
+from __future__ import absolute_import, division, unicode_literals, print_function

-import urllib.request
-import urllib.error
-import urllib.parse
-import urllib.robotparser
-import md5
+import sys

-import httplib2
+try:
+    import urllib.parse as urllib_parse
+except ImportError:
+    import urlparse as urllib_parse
+try:
+    import urllib.robotparser as robotparser
+except ImportError:
+    import robotparser
+
+from hashlib import md5

+import httplib2
 import html5lib
-from html5lib.treebuilders import etree


 class Spider(object):
@@ -25,7 +31,7 @@ def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
         self.buggyURLs = set()
-        self.robotParser = urllib.robotparser.RobotFileParser()
+        self.robotParser = robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")

@@ -40,31 +46,39 @@ def run(self, initialURL, maxURLs=1000):
             if not self.unvisitedURLs:
                 break
             content = self.loadURL(self.unvisitedURLs.pop())
+        return urlNumber

     def parse(self, content):
         failed = False
-        p = html5lib.HTMLParser(tree=etree.TreeBuilder)
+        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder('etree'))
         try:
             tree = p.parse(content)
-        except:
+        except Exception as e:
             self.buggyURLs.add(self.currentURL)
             failed = True
-            print("BUGGY:", self.currentURL)
+            print("BUGGY: {0}: {1}".format(self.currentURL, e), file=sys.stderr)
         self.visitedURLs.add(self.currentURL)
         if not failed:
             self.updateURLs(tree)

     def loadURL(self, url):
-        resp, content = self.http.request(url, "GET")
+        print('Processing {0}'.format(url), file=sys.stderr)
+        try:
+            resp, content = self.http.request(url, "GET")
+        except Exception as e:
+            print("Failed to fetch {0}: {1}".format(url, e), file=sys.stderr)
+            return None
+
         self.currentURL = url
-        digest = md5.md5(content).hexdigest()
+        digest = md5(content).hexdigest()
         if digest in self.contentDigest:
             content = None
             self.visitedURLs.add(url)
         else:
             self.contentDigest[digest] = url

-        if resp['status'] != "200":
+        if resp['status'] not in ('200', '304'):
+            print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
             content = None

         return content
@@ -75,9 +89,11 @@ def updateURLs(self, tree):
         have seen them before or not"""
         urls = set()
         # Remove all links we have already visited
-        for link in tree.findall(".//a"):
+        namespace = tree.tag[1:].split('}')[0]
+        links = list(tree.findall('.//{%s}a' % namespace))
+        for link in links:
             try:
-                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                url = urllib_parse.urldefrag(link.attrib['href'])[0]
                 if (url and url not in self.unvisitedURLs and url
                         not in self.visitedURLs):
                     urls.add(url)
@@ -88,38 +104,62 @@ def updateURLs(self, tree):
         # missing
         newUrls = set()
         for url in urls:
-            splitURL = list(urllib.parse.urlsplit(url))
+            splitURL = list(urllib_parse.urlsplit(url))
             if splitURL[0] != "http":
                 continue
             if splitURL[1] == "":
-                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
-            newUrls.add(urllib.parse.urlunsplit(splitURL))
+                splitURL[1] = urllib_parse.urlsplit(self.currentURL)[1]
+            newUrls.add(urllib_parse.urlunsplit(splitURL))
         urls = newUrls

         responseHeaders = {}
         # Now we want to find the content types of the links we haven't visited
         for url in urls:
+            print('Checking {0}'.format(url), file=sys.stderr)
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError:
-                # Don't know why this happens
-                pass
+            except Exception as e:
+                print('Error fetching HEAD of {0}: {1}'.format(url, e), file=sys.stderr)

         # Remove links not of content-type html or pages not found
         # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                       "html" in responseHeaders[url]['content-type'] and
+                       'html' in responseHeaders[url].get('content-type', '') and
                        responseHeaders[url]['status'] == "200"])

         # Now check we are allowed to spider the page
-        for url in toVisit:
-            robotURL = list(urllib.parse.urlsplit(url)[:2])
+        for url in list(toVisit):
+            robotURL = list(urllib_parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
-            robotURL = urllib.parse.urlunsplit(robotURL)
+            robotURL = urllib_parse.urlunsplit(robotURL)
             self.robotParser.set_url(robotURL)
+            try:
+                self.robotParser.read()
+            except Exception as e:
+                print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
+                toVisit.remove(url)
+                continue
+
             if not self.robotParser.can_fetch("*", url):
+                print('{0} rejects {1}'.format(robotURL, url), file=sys.stderr)
                 toVisit.remove(url)

         self.visitedURLs.update(urls)
         self.unvisitedURLs.update(toVisit)
+
+
+def main():
+    max_urls = 100
+    s = Spider()
+    count = s.run("http://yahoo.com/", maxURLs=max_urls)
+    if s.buggyURLs:
+        print('Buggy URLs:')
+        print(' ' + '\n '.join(s.buggyURLs))
+        print('')
+    if count != max_urls:
+        print('{0} of {1} processed'.format(count, max_urls))
+    sys.exit(count == max_urls and len(s.buggyURLs) == 0)
+
+
+if __name__ == '__main__':
+    main()
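A second sketch, also not part of the commit and using made-up URLs, shows why the robots.txt loop above now iterates over list(toVisit): removing entries from a set while iterating over that same set raises a RuntimeError, whereas iterating over a snapshot leaves the set safe to mutate.

toVisit = {"http://a.example/", "http://b.example/"}  # hypothetical URLs
try:
    for url in toVisit:
        toVisit.remove(url)        # mutating the set that is being iterated
except RuntimeError as e:
    print(e)                       # "Set changed size during iteration"

toVisit = {"http://a.example/", "http://b.example/"}
for url in list(toVisit):          # iterate over a snapshot instead
    if url.startswith("http://b"):
        toVisit.remove(url)        # safe: the snapshot, not the set, is being iterated
print(toVisit)                     # {'http://a.example/'}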
