99from StringIO import StringIO
1010from BeautifulSoup import BeautifulSoup
1111
12- def spider (url , args , new_link ):
12+ def spider (url , args , flag_get_new_link ):
1313
1414 # 分析页面,获取链接
def get_link(html):
    """Parse an HTML page and return every absolute http:// link target.

    Parameters:
        html: the raw HTML document (str) to scan.

    Returns:
        list of href strings, one for each <a> tag whose href attribute
        starts with "http://".  Relative links — and, as in the original
        code, https:// links — are deliberately excluded by the regex.
    """
    soup = BeautifulSoup(html)
    # findAll with a compiled-regex attrs filter does the matching in one
    # pass; the comprehension replaces the original append loop verbatim.
    return [anchor.get('href')
            for anchor in soup.findAll('a',
                                       attrs={'href': re.compile("^http://")})]
2525
26- def get_html (url , args , new_link ):
26+ def get_html (url , args , flag_get_new_link ):
2727 try :
2828 response = urllib2 .urlopen (url , timeout = 20 )
2929 if response .info ().get ('Content-Encoding' ) == 'gzip' :
@@ -42,27 +42,27 @@ def get_html(url, args, new_link):
4242 print 'exception'
4343 #self.logging.error("Unexpected:{0} {1}".format(url[1].encode("utf8"), str(e)))
4444 else :
45- new_task = []
45+ new_link = []
4646
4747 if args .key == "" :
48- if new_link :
49- new_task = get_link (html )
48+ if flag_get_new_link :
49+ new_link = get_link (html )
5050 else :
5151 # 下载匹配关键字的页面
5252 if not self .encoding :
5353 charset = chardet .detect (html )
5454 self .encoding = charset ['encoding' ]
5555
5656 match = re .search (re .compile (self .key ), html .decode (self .encoding , "ignore" ))
57- if match and new_link :
58- new_task = get_link (html )
57+ if match and flag_get_new_link :
58+ new_link = get_link (html )
5959 else :
6060 print 'not match'
6161 #self.logging.debug("{0} ignore {1} key not match".format(self.getName(), url[1].encode("utf8")))
6262
63- return new_task
63+ return new_link
6464
65- return get_html (url , args , new_link )
65+ return get_html (url , args , flag_get_new_link )
6666
6767
6868
0 commit comments