Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit d575480

Browse files
committed
A variant on webchecker that creates a mirror copy of a remote site.
1 parent 2237b73 commit d575480

1 file changed

Lines changed: 131 additions & 0 deletions

File tree

Tools/webchecker/websucker.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#! /usr/bin/env python
2+
3+
"""A variant on webchecker that creates a mirror copy of a remote site."""
4+
5+
__version__ = "0.1"
6+
7+
import os
8+
import sys
9+
import string
10+
import urllib
11+
import getopt
12+
13+
import webchecker
14+
verbose = webchecker.verbose
15+
16+
def main():
17+
global verbose
18+
try:
19+
opts, args = getopt.getopt(sys.argv[1:], "qv")
20+
except getopt.error, msg:
21+
print msg
22+
print "usage:", sys.argv[0], "[-v] ... [rooturl] ..."
23+
return 2
24+
for o, a in opts:
25+
if o == "-q":
26+
webchecker.verbose = verbose = 0
27+
if o == "-v":
28+
webchecker.verbose = verbose = verbose + 1
29+
c = Sucker(0)
30+
c.urlopener.addheaders = [
31+
('User-agent', 'websucker/%s' % __version__),
32+
]
33+
for arg in args:
34+
print "Adding root", arg
35+
c.addroot(arg)
36+
print "Run..."
37+
c.run()
38+
39+
class Sucker(webchecker.Checker):
40+
41+
# Alas, had to copy this to make one change...
42+
def getpage(self, url):
43+
if url[:7] == 'mailto:' or url[:5] == 'news:':
44+
if verbose > 1: print " Not checking mailto/news URL"
45+
return None
46+
isint = self.inroots(url)
47+
if not isint and not self.checkext:
48+
if verbose > 1: print " Not checking ext link"
49+
return None
50+
path = self.savefilename(url)
51+
saved = 0
52+
try:
53+
f = open(path, "rb")
54+
except IOError:
55+
try:
56+
f = self.urlopener.open(url)
57+
except IOError, msg:
58+
msg = webchecker.sanitize(msg)
59+
if verbose > 0:
60+
print "Error ", msg
61+
if verbose > 0:
62+
webchecker.show(" HREF ", url, " from", self.todo[url])
63+
self.setbad(url, msg)
64+
return None
65+
if not isint:
66+
if verbose > 1: print " Not gathering links from ext URL"
67+
safeclose(f)
68+
return None
69+
nurl = f.geturl()
70+
if nurl != url:
71+
path = self.savefilename(nurl)
72+
info = f.info()
73+
else:
74+
if verbose: print "Loading cached URL", url
75+
saved = 1
76+
nurl = url
77+
info = {}
78+
if url[-1:] == "/":
79+
info["content-type"] = "text/html"
80+
text = f.read()
81+
if not saved: self.savefile(text, path)
82+
if info.has_key('content-type'):
83+
ctype = string.lower(info['content-type'])
84+
else:
85+
ctype = None
86+
if nurl != url:
87+
if verbose > 1:
88+
print " Redirected to", nurl
89+
if not ctype:
90+
ctype, encoding = webchecker.mimetypes.guess_type(nurl)
91+
if ctype != 'text/html':
92+
webchecker.safeclose(f)
93+
if verbose > 1:
94+
print " Not HTML, mime type", ctype
95+
return None
96+
f.close()
97+
return webchecker.Page(text, nurl)
98+
99+
def savefile(self, text, path):
100+
dir, base = os.path.split(path)
101+
makedirs(dir)
102+
f = open(path, "wb")
103+
f.write(text)
104+
f.close()
105+
print "saved", path
106+
107+
def savefilename(self, url):
108+
type, rest = urllib.splittype(url)
109+
host, path = urllib.splithost(rest)
110+
while path[:1] == "/": path = path[1:]
111+
user, host = urllib.splituser(host)
112+
host, port = urllib.splitnport(host)
113+
host = string.lower(host)
114+
path = os.path.join(host, path)
115+
if path[-1] == "/": path = path + "index.html"
116+
if os.sep != "/":
117+
path = string.join(string.split(path, "/"), os.sep)
118+
return path
119+
120+
def makedirs(dir):
    """Create directory *dir*, creating intermediate directories as needed.

    A no-op if *dir* is empty or already exists.  Unlike the original
    version, this does not crash if another process creates the directory
    between our existence check and os.mkdir: the error is only re-raised
    when the directory still does not exist afterwards.
    """
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        # A root-like path ("/", "c:\\") that os.path.split can't reduce.
        print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    try:
        os.mkdir(dir, 0o777)
    except os.error:
        # Lost a race with a concurrent creator; ignore if the dir exists now.
        if not os.path.isdir(dir):
            raise
129+
130+
if __name__ == '__main__':
    # Exit with main()'s return code (2 on usage errors); a falsy
    # return (None) maps to a success status of 0.
    status = main()
    sys.exit(status or 0)

0 commit comments

Comments
 (0)