From c8f48966426a7979728370724dc868a176f88115 Mon Sep 17 00:00:00 2001 From: Davin Potts Date: Fri, 13 Sep 2019 08:23:26 -0500 Subject: [PATCH 1/5] Adds tracking of which process in the pool takes which job from the queue; adds test for issue22393/issue38084. --- Lib/multiprocessing/pool.py | 62 +++++++++++++++++++++---------- Lib/test/_test_multiprocessing.py | 21 +++++++++++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/Lib/multiprocessing/pool.py b/Lib/multiprocessing/pool.py index b223d6aa724bb6..ccd79e317a4289 100644 --- a/Lib/multiprocessing/pool.py +++ b/Lib/multiprocessing/pool.py @@ -121,6 +121,7 @@ def worker(inqueue, outqueue, initializer=None, initargs=(), maxtasks=None, break job, i, func, args, kwds = task + put((job, i, (None, os.getpid()))) # Provide info on who took job try: result = (True, func(*args, **kwds)) except Exception as e: @@ -220,12 +221,14 @@ def __init__(self, processes=None, initializer=None, initargs=(), sentinels = self._get_sentinels() + self._job_assignments = {} self._worker_handler = threading.Thread( target=Pool._handle_workers, args=(self._cache, self._taskqueue, self._ctx, self.Process, self._processes, self._pool, self._inqueue, self._outqueue, self._initializer, self._initargs, self._maxtasksperchild, - self._wrap_exception, sentinels, self._change_notifier) + self._wrap_exception, sentinels, self._change_notifier, + self._job_assignments) ) self._worker_handler.daemon = True self._worker_handler._state = RUN @@ -243,7 +246,8 @@ def __init__(self, processes=None, initializer=None, initargs=(), self._result_handler = threading.Thread( target=Pool._handle_results, - args=(self._outqueue, self._quick_get, self._cache) + args=(self._outqueue, self._quick_get, self._cache, + self._job_assignments) ) self._result_handler.daemon = True self._result_handler._state = RUN @@ -284,7 +288,7 @@ def _get_worker_sentinels(workers): workers if hasattr(worker, "sentinel")] @staticmethod - def _join_exited_workers(pool): + def _join_exited_workers(pool, outqueue, job_assignments): """Cleanup after any worker processes which have exited due to reaching their specified lifetime. Returns True if any workers were cleaned up. """ @@ -294,8 +298,15 @@ def _join_exited_workers(pool): if worker.exitcode is not None: # worker exited util.debug('cleaning up worker %d' % i) + pid = worker.ident worker.join() cleaned = True + if pid in job_assignments: + # If the worker process died without communicating back + # while running a job, add a default result for it. + job = job_assignments[pid] + outqueue.put((job, i, (False, RuntimeError("Worker died")))) + del job_assignments[pid] del pool[i] return cleaned @@ -330,10 +341,10 @@ def _repopulate_pool_static(ctx, Process, processes, pool, inqueue, @staticmethod def _maintain_pool(ctx, Process, processes, pool, inqueue, outqueue, initializer, initargs, maxtasksperchild, - wrap_exception): + wrap_exception, job_assignments): """Clean up any exited workers and start replacements for them. """ - if Pool._join_exited_workers(pool): + if Pool._join_exited_workers(pool, outqueue, job_assignments): Pool._repopulate_pool_static(ctx, Process, processes, pool, inqueue, outqueue, initializer, initargs, maxtasksperchild, @@ -504,7 +515,7 @@ def _wait_for_updates(sentinels, change_notifier, timeout=None): def _handle_workers(cls, cache, taskqueue, ctx, Process, processes, pool, inqueue, outqueue, initializer, initargs, maxtasksperchild, wrap_exception, sentinels, - change_notifier): + change_notifier, job_assignments): thread = threading.current_thread() # Keep maintaining workers until the cache gets drained, unless the pool @@ -512,7 +523,8 @@ def _handle_workers(cls, cache, taskqueue, ctx, Process, processes, while thread._state == RUN or (cache and thread._state != TERMINATE): cls._maintain_pool(ctx, Process, processes, pool, inqueue, outqueue, initializer, initargs, - maxtasksperchild, wrap_exception) + maxtasksperchild, wrap_exception, + job_assignments) current_sentinels = [*cls._get_worker_sentinels(pool), *sentinels] @@ -568,7 +580,7 @@ def _handle_tasks(taskqueue, put, outqueue, pool, cache): util.debug('task handler exiting') @staticmethod - def _handle_results(outqueue, get, cache): + def _handle_results(outqueue, get, cache, job_assignments): thread = threading.current_thread() while 1: @@ -587,12 +599,18 @@ def _handle_results(outqueue, get, cache): util.debug('result handler got sentinel') break - job, i, obj = task - try: - cache[job]._set(i, obj) - except KeyError: - pass - task = job = obj = None + job, i, (task_info, value) = task + if task_info is None: + # task_info is True or False when a task has completed but + # None indicates information about which process has + # accepted a job from the queue. + job_assignments[value] = job + else: + try: + cache[job]._set(i, (task_info, value)) + except KeyError: + pass + task = job = task_info = value = None while cache and thread._state != TERMINATE: try: @@ -604,12 +622,16 @@ def _handle_results(outqueue, get, cache): if task is None: util.debug('result handler ignoring extra sentinel') continue - job, i, obj = task - try: - cache[job]._set(i, obj) - except KeyError: - pass - task = job = obj = None + + job, i, (task_info, value) = task + if task_info is None: + job_assignments[value] = job + else: + try: + cache[job]._set(i, (task_info, value)) + except KeyError: + pass + task = job = task_info = value = None if hasattr(outqueue, '_reader'): util.debug('ensuring that outqueue is not full') diff --git a/Lib/test/_test_multiprocessing.py b/Lib/test/_test_multiprocessing.py index c717d0aad2874b..1da3715eec7bea 100644 --- a/Lib/test/_test_multiprocessing.py +++ b/Lib/test/_test_multiprocessing.py @@ -2712,6 +2712,27 @@ def errback(exc): p.close() p.join() + def test_pool_worker_died_without_communicating(self): + # Issue22393: test fix of indefinite hang caused by worker processes + # exiting abruptly (such as via os._exit()) without communicating + # back to the pool at all. + prog = ( + "import os, multiprocessing as mp; " + "is_main = (__name__ == '__main__'); " + "p = mp.Pool(1) if is_main else print('worker'); " + "p.map(os._exit, [1]) if is_main else None; " + "(p.close() or p.join()) if is_main else None" + ) + # Only if there is a regression will this ever trigger a + # subprocess.TimeoutExpired. + completed_process = subprocess.run( + [sys.executable, '-E', '-S', '-O', '-c', prog], + check=False, + timeout=100, + capture_output=True + ) + self.assertNotEqual(0, completed_process.returncode) + class _TestPoolWorkerLifetime(BaseTestCase): ALLOWED_TYPES = ('processes', ) From 315ec3d9682bcb04b26d92a275a6fce90d553211 Mon Sep 17 00:00:00 2001 From: Davin Potts Date: Fri, 13 Sep 2019 10:42:09 -0500 Subject: [PATCH 2/5] Added blurb. --- .../NEWS.d/next/Library/2019-09-13-10-39-28.bpo-22393.SZ3LeI.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2019-09-13-10-39-28.bpo-22393.SZ3LeI.rst diff --git a/Misc/NEWS.d/next/Library/2019-09-13-10-39-28.bpo-22393.SZ3LeI.rst b/Misc/NEWS.d/next/Library/2019-09-13-10-39-28.bpo-22393.SZ3LeI.rst new file mode 100644 index 00000000000000..aeb65b639d1ce0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-09-13-10-39-28.bpo-22393.SZ3LeI.rst @@ -0,0 +1 @@ +Fix deadlock from multiprocessing.Pool worker death without communication. From bcbd7d3ebf596f197c127fea3e90469f803e968d Mon Sep 17 00:00:00 2001 From: Davin Potts Date: Sun, 15 Sep 2019 14:29:55 -0500 Subject: [PATCH 3/5] Fix for missing checks on resources still being available during teardown. --- Lib/multiprocessing/pool.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/multiprocessing/pool.py b/Lib/multiprocessing/pool.py index ccd79e317a4289..6242f631255738 100644 --- a/Lib/multiprocessing/pool.py +++ b/Lib/multiprocessing/pool.py @@ -268,8 +268,6 @@ def __del__(self, _warn=warnings.warn, RUN=RUN): if self._state == RUN: _warn(f"unclosed running multiprocessing pool {self!r}", ResourceWarning, source=self) - if getattr(self, '_change_notifier', None) is not None: - self._change_notifier.put(None) def __repr__(self): cls = self.__class__ @@ -693,7 +691,8 @@ def join(self): def _help_stuff_finish(inqueue, task_handler, size): # task_handler may be blocked trying to put items on inqueue util.debug('removing tasks from inqueue until task handler finished') - inqueue._rlock.acquire() + if inqueue._reader.poll(): + inqueue._rlock.acquire() while task_handler.is_alive() and inqueue._reader.poll(): inqueue._reader.recv() time.sleep(0) From e1a9eb5d43c79fe5b23017bd1c27516e3decca56 Mon Sep 17 00:00:00 2001 From: Davin Potts Date: Sun, 22 Sep 2019 16:37:03 -0500 Subject: [PATCH 4/5] Remove spurious space. Co-Authored-By: Steve Dower --- Lib/multiprocessing/pool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/multiprocessing/pool.py b/Lib/multiprocessing/pool.py index 6242f631255738..4114320d1e9892 100644 --- a/Lib/multiprocessing/pool.py +++ b/Lib/multiprocessing/pool.py @@ -296,7 +296,7 @@ def _join_exited_workers(pool, outqueue, job_assignments): if worker.exitcode is not None: # worker exited util.debug('cleaning up worker %d' % i) - pid = worker.ident + pid = worker.ident worker.join() cleaned = True if pid in job_assignments: From 64592844258585099b517b2f3ef8caa73d8b4464 Mon Sep 17 00:00:00 2001 From: Davin Potts Date: Mon, 23 Sep 2019 11:47:29 -0500 Subject: [PATCH 5/5] Fix result position for killed workers, add Steve suggested change to use dict.pop(). --- Lib/multiprocessing/pool.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Lib/multiprocessing/pool.py b/Lib/multiprocessing/pool.py index 4114320d1e9892..dc1b3c7ce4aadd 100644 --- a/Lib/multiprocessing/pool.py +++ b/Lib/multiprocessing/pool.py @@ -299,12 +299,13 @@ def _join_exited_workers(pool, outqueue, job_assignments): pid = worker.ident worker.join() cleaned = True - if pid in job_assignments: + job_info = job_assignments.pop(pid, None) + if job_info is not None: # If the worker process died without communicating back # while running a job, add a default result for it. - job = job_assignments[pid] - outqueue.put((job, i, (False, RuntimeError("Worker died")))) - del job_assignments[pid] + outqueue.put( + (*job_info, (False, RuntimeError("Worker died"))) + ) del pool[i] return cleaned @@ -602,7 +603,7 @@ def _handle_results(outqueue, get, cache, job_assignments): # task_info is True or False when a task has completed but # None indicates information about which process has # accepted a job from the queue. - job_assignments[value] = job + job_assignments[value] = (job, i) else: try: cache[job]._set(i, (task_info, value)) @@ -623,7 +624,7 @@ def _handle_results(outqueue, get, cache, job_assignments): job, i, (task_info, value) = task if task_info is None: - job_assignments[value] = job + job_assignments[value] = (job, i) else: try: cache[job]._set(i, (task_info, value))