Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit dd69649

Browse files
committed
Issue #9205: concurrent.futures.ProcessPoolExecutor now detects killed
children and raises BrokenProcessPool in such a situation. Previously it would reliably freeze/deadlock.
1 parent 4a5e5de commit dd69649

8 files changed

Lines changed: 587 additions & 107 deletions

File tree

Doc/library/concurrent.futures.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,12 @@ to a :class:`ProcessPoolExecutor` will result in deadlock.
169169
of at most *max_workers* processes. If *max_workers* is ``None`` or not
170170
given, it will default to the number of processors on the machine.
171171

172+
.. versionchanged:: 3.3
173+
When one of the worker processes terminates abruptly, a
174+
:exc:`BrokenProcessPool` error is now raised. Previously, behaviour
175+
was undefined but operations on the executor or its futures would often
176+
freeze or deadlock.
177+
172178

173179
.. _processpoolexecutor-example:
174180

@@ -369,3 +375,16 @@ Module Functions
369375
:pep:`3148` -- futures - execute computations asynchronously
370376
The proposal which described this feature for inclusion in the Python
371377
standard library.
378+
379+
380+
Exception classes
381+
-----------------
382+
383+
.. exception:: BrokenProcessPool
384+
385+
Derived from :exc:`RuntimeError`, this exception class is raised when
386+
one of the workers of a :class:`ProcessPoolExecutor` has terminated
387+
in a non-clean fashion (for example, if it was killed from the outside).
388+
389+
.. versionadded:: 3.3
390+

Lib/concurrent/futures/process.py

Lines changed: 74 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,11 @@
4646
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
4747

4848
import atexit
49+
import os
4950
from concurrent.futures import _base
5051
import queue
5152
import multiprocessing
52-
from multiprocessing.queues import SimpleQueue
53+
from multiprocessing.queues import SimpleQueue, SentinelReady
5354
import threading
5455
import weakref
5556

@@ -122,7 +123,7 @@ def _process_worker(call_queue, result_queue):
122123
call_item = call_queue.get(block=True)
123124
if call_item is None:
124125
# Wake up queue management thread
125-
result_queue.put(None)
126+
result_queue.put(os.getpid())
126127
return
127128
try:
128129
r = call_item.fn(*call_item.args, **call_item.kwargs)
@@ -194,29 +195,63 @@ def _queue_management_worker(executor_reference,
194195
result_queue: A multiprocessing.Queue of _ResultItems generated by the
195196
process workers.
196197
"""
197-
nb_shutdown_processes = 0
198-
def shutdown_one_process():
199-
"""Tell a worker to terminate, which will in turn wake us again"""
200-
nonlocal nb_shutdown_processes
201-
call_queue.put(None)
202-
nb_shutdown_processes += 1
198+
199+
def shutdown_worker():
200+
# This is an upper bound
201+
nb_children_alive = sum(p.is_alive() for p in processes.values())
202+
for i in range(0, nb_children_alive):
203+
call_queue.put(None)
204+
# If .join() is not called on the created processes then
205+
# some multiprocessing.Queue methods may deadlock on Mac OS
206+
# X.
207+
for p in processes.values():
208+
p.join()
209+
203210
while True:
204211
_add_call_item_to_queue(pending_work_items,
205212
work_ids_queue,
206213
call_queue)
207214

208-
result_item = result_queue.get()
209-
if result_item is not None:
210-
work_item = pending_work_items[result_item.work_id]
211-
del pending_work_items[result_item.work_id]
212-
213-
if result_item.exception:
214-
work_item.future.set_exception(result_item.exception)
215-
else:
216-
work_item.future.set_result(result_item.result)
217-
continue
218-
# If we come here, we either got a timeout or were explicitly woken up.
219-
# In either case, check whether we should start shutting down.
215+
sentinels = [p.sentinel for p in processes.values()]
216+
assert sentinels
217+
try:
218+
result_item = result_queue.get(sentinels=sentinels)
219+
except SentinelReady as e:
220+
# Mark the process pool broken so that submits fail right now.
221+
executor = executor_reference()
222+
if executor is not None:
223+
executor._broken = True
224+
executor._shutdown_thread = True
225+
del executor
226+
# All futures in flight must be marked failed
227+
for work_id, work_item in pending_work_items.items():
228+
work_item.future.set_exception(
229+
BrokenProcessPool(
230+
"A process in the process pool was "
231+
"terminated abruptly while the future was "
232+
"running or pending."
233+
))
234+
pending_work_items.clear()
235+
# Terminate remaining workers forcibly: the queues or their
236+
# locks may be in a dirty state and block forever.
237+
for p in processes.values():
238+
p.terminate()
239+
for p in processes.values():
240+
p.join()
241+
return
242+
if isinstance(result_item, int):
243+
# Clean shutdown of a worker using its PID
244+
# (avoids marking the executor broken)
245+
del processes[result_item]
246+
elif result_item is not None:
247+
work_item = pending_work_items.pop(result_item.work_id, None)
248+
# work_item can be None if another process terminated (see above)
249+
if work_item is not None:
250+
if result_item.exception:
251+
work_item.future.set_exception(result_item.exception)
252+
else:
253+
work_item.future.set_result(result_item.result)
254+
# Check whether we should start shutting down.
220255
executor = executor_reference()
221256
# No more work items can be added if:
222257
# - The interpreter is shutting down OR
@@ -226,17 +261,11 @@ def shutdown_one_process():
226261
# Since no new work items can be added, it is safe to shutdown
227262
# this thread if there are no pending work items.
228263
if not pending_work_items:
229-
while nb_shutdown_processes < len(processes):
230-
shutdown_one_process()
231-
# If .join() is not called on the created processes then
232-
# some multiprocessing.Queue methods may deadlock on Mac OS
233-
# X.
234-
for p in processes:
235-
p.join()
264+
shutdown_worker()
236265
return
237266
else:
238267
# Start shutting down by telling a process it can exit.
239-
shutdown_one_process()
268+
call_queue.put(None)
240269
del executor
241270

242271
_system_limits_checked = False
@@ -264,6 +293,14 @@ def _check_system_limits():
264293
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
265294
raise NotImplementedError(_system_limited)
266295

296+
297+
class BrokenProcessPool(RuntimeError):
298+
"""
299+
Raised when a process in a ProcessPoolExecutor terminated abruptly
300+
while a future was in the running state.
301+
"""
302+
303+
267304
class ProcessPoolExecutor(_base.Executor):
268305
def __init__(self, max_workers=None):
269306
"""Initializes a new ProcessPoolExecutor instance.
@@ -288,11 +325,13 @@ def __init__(self, max_workers=None):
288325
self._result_queue = SimpleQueue()
289326
self._work_ids = queue.Queue()
290327
self._queue_management_thread = None
291-
self._processes = set()
328+
# Map of pids to processes
329+
self._processes = {}
292330

293331
# Shutdown is a two-step process.
294332
self._shutdown_thread = False
295333
self._shutdown_lock = threading.Lock()
334+
self._broken = False
296335
self._queue_count = 0
297336
self._pending_work_items = {}
298337

@@ -302,6 +341,8 @@ def _start_queue_management_thread(self):
302341
def weakref_cb(_, q=self._result_queue):
303342
q.put(None)
304343
if self._queue_management_thread is None:
344+
# Start the processes so that their sentinels are known.
345+
self._adjust_process_count()
305346
self._queue_management_thread = threading.Thread(
306347
target=_queue_management_worker,
307348
args=(weakref.ref(self, weakref_cb),
@@ -321,10 +362,13 @@ def _adjust_process_count(self):
321362
args=(self._call_queue,
322363
self._result_queue))
323364
p.start()
324-
self._processes.add(p)
365+
self._processes[p.pid] = p
325366

326367
def submit(self, fn, *args, **kwargs):
327368
with self._shutdown_lock:
369+
if self._broken:
370+
raise BrokenProcessPool('A child process terminated '
371+
'abruptly, the process pool is not usable anymore')
328372
if self._shutdown_thread:
329373
raise RuntimeError('cannot schedule new futures after shutdown')
330374

@@ -338,7 +382,6 @@ def submit(self, fn, *args, **kwargs):
338382
self._result_queue.put(None)
339383

340384
self._start_queue_management_thread()
341-
self._adjust_process_count()
342385
return f
343386
submit.__doc__ = _base.Executor.submit.__doc__
344387

0 commit comments

Comments
 (0)