Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1a7eae9

Browse files
committed
Adapt to new webchecker structure. Due to better structure of
getpage(), much less duplicate code is needed -- we only need to override readhtml().
1 parent 00756bd commit 1a7eae9

1 file changed

Lines changed: 33 additions & 59 deletions

File tree

Tools/webchecker/websucker.py

Lines changed: 33 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
"""A variant on webchecker that creates a mirror copy of a remote site."""
44

5-
__version__ = "0.1"
5+
__version__ = "$Revision$"
66

77
import os
88
import sys
@@ -11,22 +11,28 @@
1111
import getopt
1212

1313
import webchecker
14-
verbose = webchecker.verbose
14+
15+
# Extract real version number if necessary
16+
if __version__[0] == '$':
17+
_v = string.split(__version__)
18+
if len(_v) == 3:
19+
__version__ = _v[1]
1520

1621
def main():
17-
global verbose
22+
verbose = webchecker.VERBOSE
1823
try:
1924
opts, args = getopt.getopt(sys.argv[1:], "qv")
2025
except getopt.error, msg:
2126
print msg
22-
print "usage:", sys.argv[0], "[-v] ... [rooturl] ..."
27+
print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
2328
return 2
2429
for o, a in opts:
2530
if o == "-q":
26-
webchecker.verbose = verbose = 0
31+
verbose = 0
2732
if o == "-v":
28-
webchecker.verbose = verbose = verbose + 1
29-
c = Sucker(0)
33+
verbose = verbose + 1
34+
c = Sucker()
35+
c.setflags(verbose=verbose)
3036
c.urlopener.addheaders = [
3137
('User-agent', 'websucker/%s' % __version__),
3238
]
@@ -38,63 +44,31 @@ def main():
3844

3945
class Sucker(webchecker.Checker):
4046

41-
# Alas, had to copy this to make one change...
42-
def getpage(self, url):
43-
if url[:7] == 'mailto:' or url[:5] == 'news:':
44-
if verbose > 1: print " Not checking mailto/news URL"
45-
return None
46-
isint = self.inroots(url)
47-
if not isint and not self.checkext:
48-
if verbose > 1: print " Not checking ext link"
49-
return None
47+
checkext = 0
48+
49+
def readhtml(self, url):
50+
text = None
5051
path = self.savefilename(url)
51-
saved = 0
5252
try:
5353
f = open(path, "rb")
5454
except IOError:
55-
try:
56-
f = self.urlopener.open(url)
57-
except IOError, msg:
58-
msg = webchecker.sanitize(msg)
59-
if verbose > 0:
60-
print "Error ", msg
61-
if verbose > 0:
62-
webchecker.show(" HREF ", url, " from", self.todo[url])
63-
self.setbad(url, msg)
64-
return None
65-
if not isint:
66-
if verbose > 1: print " Not gathering links from ext URL"
67-
safeclose(f)
68-
return None
69-
nurl = f.geturl()
70-
if nurl != url:
71-
path = self.savefilename(nurl)
72-
info = f.info()
55+
f = self.openpage(url)
56+
if f:
57+
info = f.info()
58+
nurl = f.geturl()
59+
if nurl != url:
60+
url = nurl
61+
path = self.savefilename(url)
62+
text = f.read()
63+
f.close()
64+
self.savefile(text, path)
65+
if not self.checkforhtml(info, url):
66+
text = None
7367
else:
74-
if verbose: print "Loading cached URL", url
75-
saved = 1
76-
nurl = url
77-
info = {}
78-
if url[-1:] == "/":
79-
info["content-type"] = "text/html"
80-
text = f.read()
81-
if not saved: self.savefile(text, path)
82-
if info.has_key('content-type'):
83-
ctype = string.lower(info['content-type'])
84-
else:
85-
ctype = None
86-
if nurl != url:
87-
if verbose > 1:
88-
print " Redirected to", nurl
89-
if not ctype:
90-
ctype, encoding = webchecker.mimetypes.guess_type(nurl)
91-
if ctype != 'text/html':
92-
webchecker.safeclose(f)
93-
if verbose > 1:
94-
print " Not HTML, mime type", ctype
95-
return None
96-
f.close()
97-
return webchecker.Page(text, nurl)
68+
if self.checkforhtml({}, url):
69+
text = f.read()
70+
f.close()
71+
return text, url
9872

9973
def savefile(self, text, path):
10074
dir, base = os.path.split(path)

0 commit comments

Comments (0)