From a1408a4f60826ba9800a61662959d8d48eff259d Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 19 Mar 2025 13:45:52 +1300 Subject: [PATCH 1/2] gh-130895: fix multiprocessing.Process join/wait/poll races This bug is caused by race conditions in the poll implementations (which are called by join/wait) where if multiple threads try to reap the dead process only one "wins" and gets the exit code, while the others get an error. In the forkserver implementation the losing thread(s) set the code to an error, possibly overwriting the correct code set by the winning thread. This is relatively easy to fix: we can just take a lock before waiting for the process, since at that point we know the call should not block. In the fork and spawn implementations the losers of the race return before the exit code is set, meaning the process may still report itself as alive after join returns. Fixing this is trickier as we have to support a mixture of blocking and non-blocking calls to poll, and we cannot have the latter waiting to take a lock held by the former. The approach taken is to split the blocking and non-blocking call variants. The non-blocking variant does its work with the lock held: since it won't block this should be safe. The blocking variant releases the lock before making the blocking operating system call. It then retakes the lock and either sets the code if it wins or waits for a potentially racing thread to do so otherwise. If a non-blocking call is racing with the unlocked part of a blocking call it may still "lose" the race, and return None instead of the exit code, even though the process is dead. However, as the process could be alive at the time the call is made but die immediately afterwards, this situation should already be handled by correctly written code. To verify the behaviour a test is added which reliably triggers failures for all three implementations. A work-around for this bug in a test added for gh-128041 is also reverted. 
--- Lib/multiprocessing/popen_fork.py | 65 ++++++++++++++++--- Lib/multiprocessing/popen_forkserver.py | 17 +++-- Lib/test/_test_multiprocessing.py | 38 +++++++++++ .../test_process_pool.py | 10 +-- 4 files changed, 107 insertions(+), 23 deletions(-) diff --git a/Lib/multiprocessing/popen_fork.py b/Lib/multiprocessing/popen_fork.py index a57ef6bdad5ccc..9ec0b1705abb99 100644 --- a/Lib/multiprocessing/popen_fork.py +++ b/Lib/multiprocessing/popen_fork.py @@ -1,6 +1,7 @@ import atexit import os import signal +import threading from . import util @@ -17,23 +18,71 @@ def __init__(self, process_obj): util._flush_std_streams() self.returncode = None self.finalizer = None + self._exit_condition = threading.Condition() + self._exit_blockers = 0 self._launch(process_obj) def duplicate_for_child(self, fd): return fd def poll(self, flag=os.WNOHANG): - if self.returncode is None: - try: - pid, sts = os.waitpid(self.pid, flag) - except OSError: - # Child process not yet created. See #1731717 - # e.errno == errno.ECHILD == 10 - return None + with self._exit_condition: + if self.returncode is not None: + return self.returncode + elif flag & os.WNOHANG == os.WNOHANG: + return self._nonblocking_poll(flag) + else: + self._exit_blockers += 1 + + # We have released the lock, so may be racing with blocking & + # non-blocking calls at this point... 
+ pid = None + try: + pid, sts = os.waitpid(self.pid, flag) + except OSError: + # Child process doesn't exist because it hasn't started yet (see + # bpo-1731717) or has already been awaited on a racing thread (see + # gh-130895) + pass + + with self._exit_condition: + self._exit_blockers -= 1 if pid == self.pid: - self.returncode = os.waitstatus_to_exitcode(sts) + self._set_returncode(sts) + elif self._exit_blockers == 0: + self._exit_condition.notify_all() + + # Wait until we get a definitive result, or we know there are no + # racing calls that might be about to set it + while self.returncode is None and self._exit_blockers > 0: + self._exit_condition.wait() + + return self.returncode + + def _nonblocking_poll(self, flag): + assert self._exit_condition._is_owned() + assert self.returncode is None + assert flag & os.WNOHANG == os.WNOHANG + try: + pid, sts = os.waitpid(self.pid, flag) + if pid == self.pid: + self._set_returncode(sts) + except OSError: + # See comments in the poll(...) except clause above + pass + + # We may be racing with a blocking wait call, in which case (if we lose + # the race) it is arbitrary whether this returns None or the exit code + # (if there is one): calling code must always be prepared to handle a + # situation where this method returns None but the process has ended. 
return self.returncode + def _set_returncode(self, sts): + assert self._exit_condition._is_owned() + assert self.returncode is None + self.returncode = os.waitstatus_to_exitcode(sts) + self._exit_condition.notify_all() + def wait(self, timeout=None): if self.returncode is None: if timeout is not None: diff --git a/Lib/multiprocessing/popen_forkserver.py b/Lib/multiprocessing/popen_forkserver.py index a56eb9bf11080b..0e086a17da7e98 100644 --- a/Lib/multiprocessing/popen_forkserver.py +++ b/Lib/multiprocessing/popen_forkserver.py @@ -1,5 +1,6 @@ import io import os +import threading from .context import reduction, set_spawning_popen if not reduction.HAVE_SEND_HANDLE: @@ -32,6 +33,7 @@ class Popen(popen_fork.Popen): def __init__(self, process_obj): self._fds = [] + self._lock = threading.Lock() super().__init__(process_obj) def duplicate_for_child(self, fd): @@ -64,11 +66,14 @@ def poll(self, flag=os.WNOHANG): timeout = 0 if flag == os.WNOHANG else None if not wait([self.sentinel], timeout): return None - try: - self.returncode = forkserver.read_signed(self.sentinel) - except (OSError, EOFError): - # This should not happen usually, but perhaps the forkserver - # process itself got killed - self.returncode = 255 + + with self._lock: + if self.returncode is None: + try: + self.returncode = forkserver.read_signed(self.sentinel) + except (OSError, EOFError): + # This should not happen usually, but perhaps the + # forkserver process itself got killed + self.returncode = 255 return self.returncode diff --git a/Lib/test/_test_multiprocessing.py b/Lib/test/_test_multiprocessing.py index dcce57629efe5b..7b456c920a986e 100644 --- a/Lib/test/_test_multiprocessing.py +++ b/Lib/test/_test_multiprocessing.py @@ -953,6 +953,44 @@ def test_forkserver_without_auth_fails(self): proc.start() proc.join() + @staticmethod + def _wait_for_barrier(barrier): + barrier.wait() + + def _wait_on_proc(self, barrier, proc, errs): + barrier.wait() + proc.join() + if proc.is_alive(): + 
errs.append("process alive after join") + if proc.exitcode != 0: + errs.append("process reported non-zero exit code") + + def test_racing_joins(self): + if self.TYPE == "threads": + self.skipTest(f"test not appropriate for {self.TYPE}") + + N = 5 + ITERATIONS = 10 + for _ in range(ITERATIONS): + barrier = self.Barrier(N+1) + proc = self.Process(target=self._wait_for_barrier, args=(barrier,)) + + errs = [] + threads = [threading.Thread(target=self._wait_on_proc, + args=(barrier, proc, errs)) + for _ in range(N)] + for t in threads: + t.start() + + proc.start() + for t in threads: + t.join() + + # On failure(s), report the first since they are likely the same + # error reported from multiple threads + if errs: + raise AssertionError(errs[0]) + # # # diff --git a/Lib/test/test_concurrent_futures/test_process_pool.py b/Lib/test/test_concurrent_futures/test_process_pool.py index 3f13a1900a4ca4..11784b96d6ed47 100644 --- a/Lib/test/test_concurrent_futures/test_process_pool.py +++ b/Lib/test/test_concurrent_futures/test_process_pool.py @@ -327,15 +327,7 @@ def test_force_shutdown_workers_stops_pool(self, function_name): # error since the process would be alive immediately after the # test run.. and die a moment later. worker_process.join(support.SHORT_TIMEOUT) - - # Oddly enough, even though join completes, sometimes it takes a - # moment for the process to actually be marked as dead. - # ... that seems a bit buggy. - # We need it dead before ending the test to ensure it doesn't - # get marked as an ENV CHANGE due to living child process. 
- for _ in support.sleeping_retry(support.SHORT_TIMEOUT): - if not worker_process.is_alive(): - break + self.assertFalse(worker_process.is_alive()) create_executor_tests(globals(), ProcessPoolExecutorTest, From ad102f47c731e5fee061a1a20e4e6bf726cac6c8 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Thu, 20 Mar 2025 23:30:43 +1300 Subject: [PATCH 2/2] Add blurb --- .../next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst diff --git a/Misc/NEWS.d/next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst b/Misc/NEWS.d/next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst new file mode 100644 index 00000000000000..3b336f9257c658 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst @@ -0,0 +1 @@ +Fix race with ``poll``/``wait``/``join`` in :class:`multiprocessing.Process`.