diff --git a/Lib/multiprocessing/popen_fork.py b/Lib/multiprocessing/popen_fork.py index a57ef6bdad5ccc..9ec0b1705abb99 100644 --- a/Lib/multiprocessing/popen_fork.py +++ b/Lib/multiprocessing/popen_fork.py @@ -1,6 +1,7 @@ import atexit import os import signal +import threading from . import util @@ -17,23 +18,71 @@ def __init__(self, process_obj): util._flush_std_streams() self.returncode = None self.finalizer = None + self._exit_condition = threading.Condition() + self._exit_blockers = 0 self._launch(process_obj) def duplicate_for_child(self, fd): return fd def poll(self, flag=os.WNOHANG): - if self.returncode is None: - try: - pid, sts = os.waitpid(self.pid, flag) - except OSError: - # Child process not yet created. See #1731717 - # e.errno == errno.ECHILD == 10 - return None + with self._exit_condition: + if self.returncode is not None: + return self.returncode + elif flag & os.WNOHANG == os.WNOHANG: + return self._nonblocking_poll(flag) + else: + self._exit_blockers += 1 + + # We have released the lock, so may be racing with blocking & + # non-blocking calls at this point... + pid = None + try: + pid, sts = os.waitpid(self.pid, flag) + except OSError: + # Child process doesn't exist because it hasn't started yet (see + # bpo-1731717) or has already been awaited on a racing thread (see + # gh-130895) + pass + + with self._exit_condition: + self._exit_blockers -= 1 if pid == self.pid: - self.returncode = os.waitstatus_to_exitcode(sts) + self._set_returncode(sts) + elif self._exit_blockers == 0: + self._exit_condition.notify_all() + + # Wait until we get a definitive result, or we know there are no + # racing calls that might be about to set it + while self.returncode is None and self._exit_blockers > 0: + self._exit_condition.wait() + + return self.returncode + + def _nonblocking_poll(self, flag): + assert self._exit_condition._is_owned() + assert self.returncode is None + assert flag & os.WNOHANG == os.WNOHANG + try: + pid, sts = os.waitpid(self.pid, flag) + if pid == self.pid: + self._set_returncode(sts) + except OSError: + # See comments in the poll(...) except clause above + pass + + # We may be racing with a blocking wait call, in which case (if we lose + # the race) it is arbitrary whether this returns None or the exit code + # (if there is one): calling code must always be prepared to handle a + # situation where this method returns None but the process has ended. return self.returncode + def _set_returncode(self, sts): + assert self._exit_condition._is_owned() + assert self.returncode is None + self.returncode = os.waitstatus_to_exitcode(sts) + self._exit_condition.notify_all() + def wait(self, timeout=None): if self.returncode is None: if timeout is not None: diff --git a/Lib/multiprocessing/popen_forkserver.py b/Lib/multiprocessing/popen_forkserver.py index a56eb9bf11080b..0e086a17da7e98 100644 --- a/Lib/multiprocessing/popen_forkserver.py +++ b/Lib/multiprocessing/popen_forkserver.py @@ -1,5 +1,6 @@ import io import os +import threading from .context import reduction, set_spawning_popen if not reduction.HAVE_SEND_HANDLE: @@ -32,6 +33,7 @@ class Popen(popen_fork.Popen): def __init__(self, process_obj): self._fds = [] + self._lock = threading.Lock() super().__init__(process_obj) def duplicate_for_child(self, fd): @@ -64,11 +66,14 @@ def poll(self, flag=os.WNOHANG): timeout = 0 if flag == os.WNOHANG else None if not wait([self.sentinel], timeout): return None - try: - self.returncode = forkserver.read_signed(self.sentinel) - except (OSError, EOFError): - # This should not happen usually, but perhaps the forkserver - # process itself got killed - self.returncode = 255 + + with self._lock: + if self.returncode is None: + try: + self.returncode = forkserver.read_signed(self.sentinel) + except (OSError, EOFError): + # This should not happen usually, but perhaps the + # forkserver process itself got killed + self.returncode = 255 return self.returncode diff --git a/Lib/test/_test_multiprocessing.py b/Lib/test/_test_multiprocessing.py index dcce57629efe5b..7b456c920a986e 100644 --- a/Lib/test/_test_multiprocessing.py +++ b/Lib/test/_test_multiprocessing.py @@ -953,6 +953,44 @@ def test_forkserver_without_auth_fails(self): proc.start() proc.join() + @staticmethod + def _wait_for_barrier(barrier): + barrier.wait() + + def _wait_on_proc(self, barrier, proc, errs): + barrier.wait() + proc.join() + if proc.is_alive(): + errs.append("process alive after join") + if proc.exitcode != 0: + errs.append("process reported non-zero exit code") + + def test_racing_joins(self): + if self.TYPE == "threads": + self.skipTest(f"test not appropriate for {self.TYPE}") + + N = 5 + ITERATIONS = 10 + for _ in range(ITERATIONS): + barrier = self.Barrier(N+1) + proc = self.Process(target=self._wait_for_barrier, args=(barrier,)) + + errs = [] + threads = [threading.Thread(target=self._wait_on_proc, + args=(barrier, proc, errs)) + for _ in range(N)] + for t in threads: + t.start() + + proc.start() + for t in threads: + t.join() + + # On failure(s), report the first since they are likely the same + # error reported from multiple threads + if errs: + raise AssertionError(errs[0]) + # # # diff --git a/Lib/test/test_concurrent_futures/test_process_pool.py b/Lib/test/test_concurrent_futures/test_process_pool.py index 3f13a1900a4ca4..11784b96d6ed47 100644 --- a/Lib/test/test_concurrent_futures/test_process_pool.py +++ b/Lib/test/test_concurrent_futures/test_process_pool.py @@ -327,15 +327,7 @@ def test_force_shutdown_workers_stops_pool(self, function_name): # error since the process would be alive immediately after the # test run.. and die a moment later. worker_process.join(support.SHORT_TIMEOUT) - - # Oddly enough, even though join completes, sometimes it takes a - # moment for the process to actually be marked as dead. - # ... that seems a bit buggy. - # We need it dead before ending the test to ensure it doesn't - # get marked as an ENV CHANGE due to living child process. - for _ in support.sleeping_retry(support.SHORT_TIMEOUT): - if not worker_process.is_alive(): - break + self.assertFalse(worker_process.is_alive()) create_executor_tests(globals(), ProcessPoolExecutorTest, diff --git a/Misc/NEWS.d/next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst b/Misc/NEWS.d/next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst new file mode 100644 index 00000000000000..3b336f9257c658 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-19-16-09-19.gh-issue-130895.6ILa73.rst @@ -0,0 +1 @@ +Fix race with ``poll``/``wait``/``join`` in :mod:`multiprocessing`.``Process``.