-
Notifications
You must be signed in to change notification settings - Fork 179
Expand file tree
/
Copy pathparser.py
More file actions
131 lines (115 loc) · 5.59 KB
/
parser.py
File metadata and controls
131 lines (115 loc) · 5.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import logging
import queue
import time
from threading import current_thread
from urllib.parse import urlsplit
from .utils import ThreadPool
class Parser(ThreadPool):
    """Base class for parser.

    A thread pool of parser threads, in charge of downloading and parsing
    pages, extracting file urls and putting them into the input queue of
    the downloader.

    Attributes:
        signal: A Signal object for cross-module communication. This class
            reads the ``reach_max_num``, ``exceed_storage_space`` and
            ``feeder_exited`` flags from it.
        session: A requests.Session object used to fetch pages.
        logger: A logging.Logger object used for logging (provided by the
            ``ThreadPool`` base class, which is defined outside this file).
        threads: A list storing all the threading.Thread objects of the
            parser (provided by the base class).
        thread_num: An integer indicating the number of threads.
        lock: A threading.Lock object (provided by the base class).
    """

    def __init__(self, thread_num, signal, session, in_queue=None, out_queue=None, name="parser"):
        """Init Parser with some shared variables.

        Args:
            thread_num (int): Number of parser threads.
            signal: Signal object shared between crawler components.
            session: Session object used to issue HTTP requests.
            in_queue: Queue of page urls to parse (forwarded to the base class).
            out_queue: Queue receiving the tasks (forwarded to the base class).
            name (str): Name of the thread pool.
        """
        super().__init__(thread_num, in_queue=in_queue, out_queue=out_queue, name=name)
        self.signal = signal
        self.session = session

    def parse(self, response, **kwargs):
        """Parse a page and extract image urls.

        This method should be overridden by users. :func:`worker_exec`
        iterates over the returned value: ``dict`` items are passed to
        ``self.output`` and ``str`` items are fed back through
        ``self.input``. A falsy return value (e.g. ``None`` or ``[]``) is
        treated as "no tasks".

        Args:
            response: The response object returned by ``self.session.get``.
            **kwargs: Extra arguments forwarded from :func:`worker_exec`.

        :Example:

        >>> task = {}
        >>> self.output(task) # doctest: +SKIP
        """
        raise NotImplementedError

    def worker_exec(self, queue_timeout=2, req_timeout=5, max_retry=3, **kwargs):
        """Target method of workers.

        Firstly download the page and then call the :func:`parse` method.
        A parser thread will exit in either of the following cases:

        1. All feeder threads have exited and the ``url_queue`` is empty.
        2. Downloaded image number has reached required number.
        3. The storage space has been exceeded.

        Args:
            queue_timeout (int): Timeout of getting urls from ``url_queue``.
            req_timeout (int): Timeout of making requests for downloading pages.
            max_retry (int): Max retry times if the request fails.
            **kwargs: Arguments to be passed to the :func:`parse` method.
        """
        thread_name = current_thread().name
        while True:
            if self.signal.get("reach_max_num"):
                self.logger.info("downloaded image reached max num, thread %s is ready to exit", thread_name)
                break
            if self.signal.get("exceed_storage_space"):
                self.logger.info("no more storage space, thread %s is ready to exit", thread_name)
                break

            # get the page url
            try:
                url = self.in_queue.get(timeout=queue_timeout)
            except queue.Empty:
                if self.signal.get("feeder_exited"):
                    self.logger.info("no more page urls for thread %s to parse", thread_name)
                    break
                self.logger.info("%s is waiting for new page urls", thread_name)
                continue
            except Exception:
                # Not a bare ``except``: SystemExit/KeyboardInterrupt must
                # still be able to terminate the worker.
                self.logger.exception("exception in thread %s", thread_name)
                continue

            self.logger.debug("start fetching page %s", url)
            try:
                # kwargs are passed as a dict so that user keys such as
                # ``url`` cannot clash with the helper's own parameters.
                self._process_page(url, req_timeout, max_retry, kwargs)
            finally:
                # Every successful get() is balanced by exactly one
                # task_done(), even when all retries fail or parse() raises,
                # so that in_queue.join() cannot block forever.
                self.in_queue.task_done()
        self.logger.info("thread %s exit", thread_name)

    def _stop_requested(self):
        """Return a truthy value once the crawl has been told to stop."""
        return self.signal.get("reach_max_num") or self.signal.get("exceed_storage_space")

    def _process_page(self, url, req_timeout, max_retry, parse_kwargs):
        """Fetch ``url`` with up to ``max_retry`` attempts, then parse and dispatch it."""
        remaining = max_retry
        while remaining > 0:
            remaining -= 1
            try:
                base_url = "{0.scheme}://{0.netloc}".format(urlsplit(url))
                response = self.session.get(url, timeout=req_timeout, headers={"Referer": base_url})
            except Exception as e:
                self.logger.error(
                    "Exception caught when fetching page %s, error: %s, remaining retry times: %d",
                    url,
                    e,
                    remaining,
                )
                continue

            self.logger.info("parsing result page %s", url)
            task_list = self.parse(response, **parse_kwargs)
            if not task_list:
                self.logger.debug("self.parse() returned no tasks")
                # Keep the raw page around for post-mortem inspection.
                with open("task_list_error.log", "ab") as f:
                    f.write(response.content)
                # Nothing to dispatch; also guards against parse() returning None.
                return
            self._dispatch_tasks(task_list)
            return

    def _dispatch_tasks(self, task_list):
        """Enqueue every task of ``task_list`` until a stop signal is raised."""
        for task in task_list:
            self._put_task(task)
            if self._stop_requested():
                break

    def _put_task(self, task):
        """Enqueue a single task, waiting while the target queue is full."""
        while not self._stop_requested():
            try:
                if isinstance(task, dict):
                    self.output(task, timeout=1)
                elif isinstance(task, str):
                    # this case only work for GreedyCrawler,
                    # which need to feed the url back to
                    # url_queue, dirty implementation
                    self.input(task, timeout=1)
            except queue.Full:
                time.sleep(1)
            except Exception as e:
                # An unexpected error is unlikely to go away on an immediate
                # retry; log the real error and drop the task instead of
                # spinning forever.
                self.logger.error("Exception caught when put task %s into queue, error: %s", task, e)
                return
            else:
                return

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Log that the parser pool is done; exceptions are not suppressed."""
        # Use the pool's own logger: a module-level logging.info() call would
        # implicitly configure the root logger when it has no handlers.
        self.logger.info("all parser threads exited")