Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1f6b4c9

Browse files
author
dqw
committed
命名调整
1 parent ab73357 commit 1f6b4c9

File tree

2 files changed

+18
-17
lines changed

2 files changed

+18
-17
lines changed

utils/pool.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,14 @@ def run(self):
6565
self.thread_pool.increase_running()
6666
print "{0} downloaded {1} \n".format(threading.current_thread(), url)
6767

68-
new_link = True
68+
# 判断deep,是否获取新的链接
69+
flag_get_new_link = True
6970
if deep >= self.thread_pool.args.deep:
70-
new_link = False
71+
flag_get_new_link = False
7172

72-
new_task = do(url, self.thread_pool.args, new_link)
73-
if new_task:
74-
for url in new_task:
73+
new_link = do(url, self.thread_pool.args, flag_get_new_link)
74+
if new_link:
75+
for url in new_link:
7576
self.thread_pool.add_task(do, url, deep + 1)
7677

7778
self.thread_pool.decrease_running()

utils/spider.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,21 @@
99
from StringIO import StringIO
1010
from BeautifulSoup import BeautifulSoup
1111

12-
def spider(url, args, new_link):
12+
def spider(url, args, flag_get_new_link):
1313

1414
# 分析页面,获取链接
1515
def get_link(html):
16-
new_task = []
16+
new_link = []
1717

1818
soup = BeautifulSoup(html)
1919
for link in soup.findAll('a',
2020
attrs={'href': re.compile("^http://")}):
2121
href = link.get('href')
22-
new_task.append(href)
22+
new_link.append(href)
2323

24-
return new_task
24+
return new_link
2525

26-
def get_html(url, args, new_link):
26+
def get_html(url, args, flag_get_new_link):
2727
try:
2828
response = urllib2.urlopen(url, timeout=20)
2929
if response.info().get('Content-Encoding') == 'gzip':
@@ -42,27 +42,27 @@ def get_html(url, args, new_link):
4242
print 'exception'
4343
#self.logging.error("Unexpected:{0} {1}".format(url[1].encode("utf8"), str(e)))
4444
else:
45-
new_task = []
45+
new_link = []
4646

4747
if args.key == "":
48-
if new_link:
49-
new_task = get_link(html)
48+
if flag_get_new_link:
49+
new_link = get_link(html)
5050
else:
5151
# 下载匹配关键字的页面
5252
if not self.encoding:
5353
charset = chardet.detect(html)
5454
self.encoding = charset['encoding']
5555

5656
match = re.search(re.compile(self.key), html.decode(self.encoding, "ignore"))
57-
if match and new_link:
58-
new_task = get_link(html)
57+
if match and flag_get_new_link:
58+
new_link = get_link(html)
5959
else:
6060
print 'not match'
6161
#self.logging.debug("{0} ignore {1} key not match".format(self.getName(), url[1].encode("utf8")))
6262

63-
return new_task
63+
return new_link
6464

65-
return get_html(url, args, new_link)
65+
return get_html(url, args, flag_get_new_link)
6666

6767

6868

0 commit comments

Comments (0)