"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "$Revision$ "

import os
import sys
import getopt

import webchecker

# Extract real version number if necessary.
# RCS/CVS expands $Revision$ to e.g. "$Revision: 1.23 $"; pull out the
# middle field so __version__ is just the number.  Use the str method
# split() rather than string.split(), which would require an "import
# string" that this file does not have.
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
1621def main ():
17- global verbose
22+ verbose = webchecker . VERBOSE
1823 try :
1924 opts , args = getopt .getopt (sys .argv [1 :], "qv" )
2025 except getopt .error , msg :
2126 print msg
22- print "usage:" , sys .argv [0 ], "[-v ] ... [rooturl] ..."
27+ print "usage:" , sys .argv [0 ], "[-qv ] ... [rooturl] ..."
2328 return 2
2429 for o , a in opts :
2530 if o == "-q" :
26- webchecker . verbose = verbose = 0
31+ verbose = 0
2732 if o == "-v" :
28- webchecker .verbose = verbose = verbose + 1
29- c = Sucker (0 )
33+ verbose = verbose + 1
34+ c = Sucker ()
35+ c .setflags (verbose = verbose )
3036 c .urlopener .addheaders = [
3137 ('User-agent' , 'websucker/%s' % __version__ ),
3238 ]
@@ -38,63 +44,31 @@ def main():
3844
3945class Sucker (webchecker .Checker ):
4046
41- # Alas, had to copy this to make one change...
42- def getpage (self , url ):
43- if url [:7 ] == 'mailto:' or url [:5 ] == 'news:' :
44- if verbose > 1 : print " Not checking mailto/news URL"
45- return None
46- isint = self .inroots (url )
47- if not isint and not self .checkext :
48- if verbose > 1 : print " Not checking ext link"
49- return None
47+ checkext = 0
48+
49+ def readhtml (self , url ):
50+ text = None
5051 path = self .savefilename (url )
51- saved = 0
5252 try :
5353 f = open (path , "rb" )
5454 except IOError :
55- try :
56- f = self .urlopener .open (url )
57- except IOError , msg :
58- msg = webchecker .sanitize (msg )
59- if verbose > 0 :
60- print "Error " , msg
61- if verbose > 0 :
62- webchecker .show (" HREF " , url , " from" , self .todo [url ])
63- self .setbad (url , msg )
64- return None
65- if not isint :
66- if verbose > 1 : print " Not gathering links from ext URL"
67- safeclose (f )
68- return None
69- nurl = f .geturl ()
70- if nurl != url :
71- path = self .savefilename (nurl )
72- info = f .info ()
55+ f = self .openpage (url )
56+ if f :
57+ info = f .info ()
58+ nurl = f .geturl ()
59+ if nurl != url :
60+ url = nurl
61+ path = self .savefilename (url )
62+ text = f .read ()
63+ f .close ()
64+ self .savefile (text , path )
65+ if not self .checkforhtml (info , url ):
66+ text = None
7367 else :
74- if verbose : print "Loading cached URL" , url
75- saved = 1
76- nurl = url
77- info = {}
78- if url [- 1 :] == "/" :
79- info ["content-type" ] = "text/html"
80- text = f .read ()
81- if not saved : self .savefile (text , path )
82- if info .has_key ('content-type' ):
83- ctype = string .lower (info ['content-type' ])
84- else :
85- ctype = None
86- if nurl != url :
87- if verbose > 1 :
88- print " Redirected to" , nurl
89- if not ctype :
90- ctype , encoding = webchecker .mimetypes .guess_type (nurl )
91- if ctype != 'text/html' :
92- webchecker .safeclose (f )
93- if verbose > 1 :
94- print " Not HTML, mime type" , ctype
95- return None
96- f .close ()
97- return webchecker .Page (text , nurl )
68+ if self .checkforhtml ({}, url ):
69+ text = f .read ()
70+ f .close ()
71+ return text , url
9872
9973 def savefile (self , text , path ):
10074 dir , base = os .path .split (path )
# 0 commit comments  (stray web-page scrape artifact, not part of the source)