Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

gh-91048: Fix external inspection multi-threaded performance #136005

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Include/internal/pycore_global_objects_fini_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Include/internal/pycore_global_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(offset_src)
STRUCT_FOR_ID(on_type_read)
STRUCT_FOR_ID(onceregistry)
STRUCT_FOR_ID(only_active_thread)
STRUCT_FOR_ID(only_keys)
STRUCT_FOR_ID(oparg)
STRUCT_FOR_ID(opcode)
Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_runtime_init_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Include/internal/pycore_unicodeobject_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

122 changes: 121 additions & 1 deletion Lib/test/test_external_inspection.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import threading
from asyncio import staggered, taskgroups, base_events, tasks
from unittest.mock import ANY
from test.support import os_helper, SHORT_TIMEOUT, busy_retry
from test.support import os_helper, SHORT_TIMEOUT, busy_retry, requires_gil_enabled
from test.support.script_helper import make_script
from test.support.socket_helper import find_unused_port

Expand Down Expand Up @@ -876,6 +876,126 @@ def test_self_trace(self):
],
)

@skip_if_not_supported
@unittest.skipIf(
    sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
    "Test only runs on Linux with process_vm_readv support",
)
@requires_gil_enabled("Free threaded builds don't have an 'active thread'")
def test_only_active_thread(self):
    """Check that ``only_active_thread=True`` reports exactly one thread.

    A child process starts three sleeping worker threads and then spins in
    its main thread.  While it spins, the test samples the child twice:
    once with ``all_threads=True`` and once with ``only_active_thread=True``.
    The second sample must contain a single entry whose thread id also
    appears in the first sample.

    The test is skipped when the stack cannot be read because of
    insufficient permissions.
    """
    port = find_unused_port()
    script = textwrap.dedent(
        f"""\
        import time, sys, socket, threading
        # Connect to the test process
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect(('localhost', {port}))
        def worker_thread(name, barrier, ready_event):
            barrier.wait() # Synchronize thread start
            ready_event.wait() # Wait for main thread signal
            # Sleep to keep thread alive
            time.sleep(10_000)
        def main_work():
            # Do busy work to hold the GIL
            sock.sendall(b"working\\n")
            count = 0
            while count < 100000000:
                count += 1
                if count % 10000000 == 0:
                    pass # Keep main thread busy
            sock.sendall(b"done\\n")
        # Create synchronization primitives
        num_threads = 3
        barrier = threading.Barrier(num_threads + 1) # +1 for main thread
        ready_event = threading.Event()
        # Start worker threads
        threads = []
        for i in range(num_threads):
            t = threading.Thread(target=worker_thread, args=(f"Worker-{{i}}", barrier, ready_event))
            t.start()
            threads.append(t)
        # Wait for all threads to be ready
        barrier.wait()
        # Signal ready to parent process
        sock.sendall(b"ready\\n")
        # Signal threads to start waiting
        ready_event.set()
        # Give threads time to start sleeping
        time.sleep(0.1)
        # Now do busy work to hold the GIL
        main_work()
        """
    )

    with os_helper.temp_dir() as work_dir:
        script_dir = os.path.join(work_dir, "script_pkg")
        os.mkdir(script_dir)

        # Create a socket server to communicate with the target process
        server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server_socket.bind(("localhost", port))
        server_socket.settimeout(SHORT_TIMEOUT)
        server_socket.listen(1)

        script_name = _make_test_script(script_dir, "script", script)
        client_socket = None
        # Bound before the ``try`` so the ``finally`` clause never hits an
        # unbound name when Popen() itself raises.
        p = None
        try:
            p = subprocess.Popen([sys.executable, script_name])
            client_socket, _ = server_socket.accept()
            server_socket.close()
            # The accepted socket does not inherit the listener's timeout;
            # set one so a wedged child cannot hang the test run.
            client_socket.settimeout(SHORT_TIMEOUT)

            def wait_for(marker, received):
                # recv() returns b"" once the peer has closed the
                # connection; without this check the loop would spin
                # forever if the child died before sending the marker.
                while marker not in received:
                    chunk = client_socket.recv(1024)
                    if not chunk:
                        self.fail(
                            f"Child closed the connection before sending {marker!r}"
                        )
                    received += chunk
                return received

            # Wait for ready signal
            response = wait_for(b"ready", b"")

            # Wait for the main thread to start its busy work
            response = wait_for(b"working", response)

            # Get stack trace with all threads
            unwinder_all = RemoteUnwinder(p.pid, all_threads=True)
            all_traces = unwinder_all.get_stack_trace()

            # Get stack trace with only GIL holder
            unwinder_gil = RemoteUnwinder(p.pid, only_active_thread=True)
            gil_traces = unwinder_gil.get_stack_trace()

        except PermissionError:
            self.skipTest(
                "Insufficient permissions to read the stack trace"
            )
        finally:
            if client_socket is not None:
                client_socket.close()
            # Closing an already-closed socket is a no-op, so this also
            # covers the path where accept() failed before the close above.
            server_socket.close()
            if p is not None:
                p.kill()
                p.wait(timeout=SHORT_TIMEOUT)

    # Verify we got multiple threads in all_traces
    self.assertGreater(len(all_traces), 1, "Should have multiple threads")

    # Verify we got exactly one thread in gil_traces
    self.assertEqual(len(gil_traces), 1, "Should have exactly one GIL holder")

    # The GIL holder should be in the all_traces list
    gil_thread_id = gil_traces[0][0]
    all_thread_ids = [trace[0] for trace in all_traces]
    self.assertIn(gil_thread_id, all_thread_ids,
                  "GIL holder should be among all threads")


# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
83 changes: 68 additions & 15 deletions Modules/_remote_debugging_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,14 @@
#endif

/*
 * INTERP_STATE_MIN_SIZE: smallest prefix of PyInterpreterState that covers
 * every field listed below.  Each term is "offset of the field + size of the
 * field"; the macro takes the maximum of those end offsets.
 *
 * Exactly one definition exists per build flavour.  Defining the macro twice
 * with different replacement lists is an invalid redefinition.
 *
 * NOTE(review): get_stack_trace also reads the GIL "locked" flag from the same
 * buffer (via debug_offsets.interpreter_state.gil_runtime_state_locked).  That
 * field is not listed here; confirm it lies within the computed size.
 */
#ifdef Py_GIL_DISABLED
#define INTERP_STATE_MIN_SIZE MAX(MAX(MAX(offsetof(PyInterpreterState, _code_object_generation) + sizeof(uint64_t), \
                                          offsetof(PyInterpreterState, tlbc_indices.tlbc_generation) + sizeof(uint32_t)), \
                                      offsetof(PyInterpreterState, threads.head) + sizeof(void*)), \
                                  offsetof(PyInterpreterState, _gil.last_holder) + sizeof(PyThreadState*))
#else
#define INTERP_STATE_MIN_SIZE MAX(MAX(offsetof(PyInterpreterState, _code_object_generation) + sizeof(uint64_t), \
                                      offsetof(PyInterpreterState, threads.head) + sizeof(void*)), \
                                  offsetof(PyInterpreterState, _gil.last_holder) + sizeof(PyThreadState*))
#endif
#define INTERP_STATE_BUFFER_SIZE MAX(INTERP_STATE_MIN_SIZE, 256)

Expand Down Expand Up @@ -206,6 +208,7 @@ typedef struct {
uint64_t code_object_generation;
_Py_hashtable_t *code_object_cache;
int debug;
int only_active_thread;
RemoteDebuggingState *cached_state; // Cached module state
#ifdef Py_GIL_DISABLED
// TLBC cache invalidation tracking
Expand Down Expand Up @@ -2496,6 +2499,7 @@ _remote_debugging.RemoteUnwinder.__init__
pid: int
*
all_threads: bool = False
only_active_thread: bool = False
debug: bool = False
Initialize a new RemoteUnwinder object for debugging a remote Python process.
Expand All @@ -2504,6 +2508,8 @@ Initialize a new RemoteUnwinder object for debugging a remote Python process.
pid: Process ID of the target Python process to debug
all_threads: If True, initialize state for all threads in the process.
If False, only initialize for the main thread.
only_active_thread: If True, only sample the thread holding the GIL.
Cannot be used together with all_threads=True.
debug: If True, chain exceptions to explain the sequence of events that
lead to the exception.
Expand All @@ -2514,15 +2520,33 @@ process, including examining thread states, stack frames and other runtime data.
PermissionError: If access to the target process is denied
OSError: If unable to attach to the target process or access its memory
RuntimeError: If unable to read debug information from the target process
ValueError: If both all_threads and only_active_thread are True
[clinic start generated code]*/

static int
_remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
int pid, int all_threads,
int only_active_thread,
int debug)
/*[clinic end generated code: output=3982f2a7eba49334 input=48a762566b828e91]*/
/*[clinic end generated code: output=13ba77598ecdcbe1 input=8f8f12504e17da04]*/
{
// Validate that all_threads and only_active_thread are not both True
if (all_threads && only_active_thread) {
PyErr_SetString(PyExc_ValueError,
"all_threads and only_active_thread cannot both be True");
return -1;
}

#ifdef Py_GIL_DISABLED
    // This branch is compiled only when Py_GIL_DISABLED is defined, i.e. on
    // free-threaded builds.  Reject the option up front with a message that
    // names the actual condition; the type stays ValueError.
    if (only_active_thread) {
        PyErr_SetString(PyExc_ValueError,
                        "only_active_thread is not supported in free-threaded builds");
        return -1;
    }
#endif

self->debug = debug;
self->only_active_thread = only_active_thread;
self->cached_state = NULL;
if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) {
set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle");
Expand Down Expand Up @@ -2602,13 +2626,18 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
@critical_section
_remote_debugging.RemoteUnwinder.get_stack_trace
Returns a list of stack traces for all threads in the target process.
Returns a list of stack traces for threads in the target process.
Each element in the returned list is a tuple of (thread_id, frame_list), where:
- thread_id is the OS thread identifier
- frame_list is a list of tuples (function_name, filename, line_number) representing
the Python stack frames for that thread, ordered from most recent to oldest
The threads returned depend on the initialization parameters:
- If only_active_thread was True: returns only the thread holding the GIL
- If all_threads was True: returns all threads
- Otherwise: returns only the main thread
Example:
[
(1234, [
Expand All @@ -2632,7 +2661,7 @@ Each element in the returned list is a tuple of (thread_id, frame_list), where:

static PyObject *
_remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self)
/*[clinic end generated code: output=666192b90c69d567 input=331dbe370578badf]*/
/*[clinic end generated code: output=666192b90c69d567 input=f756f341206f9116]*/
{
PyObject* result = NULL;
// Read interpreter state into opaque buffer
Expand All @@ -2655,6 +2684,28 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
_Py_hashtable_clear(self->code_object_cache);
}

// If only_active_thread is true, we need to determine which thread holds the GIL.
// gil_holder stays NULL when the option is off; the thread-selection code that
// follows tests it for NULL before using it.
PyThreadState* gil_holder = NULL;
if (self->only_active_thread) {
// The GIL state is already in interp_state_buffer, just read from there
// (both fields below are fetched with GET_MEMBER from that buffer).
// Check if GIL is locked
int gil_locked = GET_MEMBER(int, interp_state_buffer,
self->debug_offsets.interpreter_state.gil_runtime_state_locked);

if (gil_locked) {
// Get the last holder (current holder when GIL is locked).
// NOTE(review): presumably an address in the target process rather than a
// locally dereferenceable pointer -- it is later cast to uintptr_t and used
// as the thread-state address to process; confirm.
gil_holder = GET_MEMBER(PyThreadState*, interp_state_buffer,
self->debug_offsets.interpreter_state.gil_runtime_state_holder);
} else {
// GIL is not locked, return empty list.
// If the allocation fails, result is NULL and the exception cause is set
// before jumping to the common exit label.
result = PyList_New(0);
if (!result) {
set_exception_cause(self, PyExc_MemoryError, "Failed to create empty result list");
}
goto exit;
}
}

#ifdef Py_GIL_DISABLED
// Check TLBC generation and invalidate cache if needed
uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
Expand All @@ -2666,7 +2717,10 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
#endif

uintptr_t current_tstate;
if (self->tstate_addr == 0) {
if (self->only_active_thread && gil_holder != NULL) {
// We have the GIL holder, process only that thread
current_tstate = (uintptr_t)gil_holder;
} else if (self->tstate_addr == 0) {
// Get threads head from buffer
current_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
self->debug_offsets.interpreter_state.threads_head);
Expand Down Expand Up @@ -2700,10 +2754,14 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
if (self->tstate_addr) {
break;
}

// If we're only processing the GIL holder, we're done after one iteration
if (self->only_active_thread && gil_holder != NULL) {
break;
}
}

exit:
_Py_RemoteDebug_ClearCache(&self->handle);
return result;
}

Expand Down Expand Up @@ -2827,11 +2885,9 @@ _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *s
goto result_err;
}

_Py_RemoteDebug_ClearCache(&self->handle);
return result;

result_err:
_Py_RemoteDebug_ClearCache(&self->handle);
Py_XDECREF(result);
return NULL;
}
Expand Down Expand Up @@ -2898,11 +2954,9 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
goto cleanup;
}

_Py_RemoteDebug_ClearCache(&self->handle);
return result;

cleanup:
_Py_RemoteDebug_ClearCache(&self->handle);
Py_XDECREF(result);
return NULL;
}
Expand All @@ -2928,7 +2982,6 @@ RemoteUnwinder_dealloc(PyObject *op)
}
#endif
if (self->handle.pid != 0) {
_Py_RemoteDebug_ClearCache(&self->handle);
_Py_RemoteDebug_CleanupProcHandle(&self->handle);
}
PyObject_Del(self);
Expand Down
Loading
Loading