diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h
index 4ff34bf8ead7d0..b1806df2706097 100644
--- a/Include/internal/pycore_gc.h
+++ b/Include/internal/pycore_gc.h
@@ -45,12 +45,13 @@ static inline PyObject* _Py_FROM_GC(PyGC_Head *gc) {
  * the per-object lock. */
 #ifdef Py_GIL_DISABLED
-# define _PyGC_BITS_TRACKED (1) // Tracked by the GC
-# define _PyGC_BITS_FINALIZED (2) // tp_finalize was called
-# define _PyGC_BITS_UNREACHABLE (4)
-# define _PyGC_BITS_FROZEN (8)
-# define _PyGC_BITS_SHARED (16)
-# define _PyGC_BITS_DEFERRED (64) // Use deferred reference counting
+# define _PyGC_BITS_TRACKED (1<<0) // Tracked by the GC
+# define _PyGC_BITS_FINALIZED (1<<1) // tp_finalize was called
+# define _PyGC_BITS_UNREACHABLE (1<<2)
+# define _PyGC_BITS_FROZEN (1<<3)
+# define _PyGC_BITS_SHARED (1<<4)
+# define _PyGC_BITS_ALIVE (1<<5) // Reachable from a known root.
+# define _PyGC_BITS_DEFERRED (1<<6) // Use deferred reference counting
 #endif
 
 #ifdef Py_GIL_DISABLED
@@ -330,6 +331,9 @@ struct _gc_runtime_state {
        collections, and are awaiting to undergo a full collection for
        the first time. */
     Py_ssize_t long_lived_pending;
+
+    /* True if gc.freeze() has been used. */
+    int freeze_active;
 #endif
 };
 
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst
new file mode 100644
index 00000000000000..34952e9abb66e5
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst
@@ -0,0 +1,6 @@
+Add a marking phase to the free-threaded GC. This is similar to what was
+done in GH-126491. Since the free-threaded GC does not have generations and
+is not incremental, the marking phase looks for all objects reachable from
+known roots. The roots are objects known to not be garbage, like the module
+dictionary for :mod:`sys`. For most programs, this marking phase should
+make the GC a bit faster since typically less work is done per object.
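Illustrative sketch, not part of the patch: writing the flag values as shifts makes it explicit that each _PyGC_BITS_* constant occupies its own bit of the 8-bit ob_gc_bits field, and that the new _PyGC_BITS_ALIVE flag takes the previously unused bit 5 (the old literal 64 for DEFERRED is 1<<6). The standalone C program below demonstrates the same set/test/clear pattern; the BITS_* and has_bit/set_bit/clear_bit names are local stand-ins, not CPython identifiers (the real helpers, gc_has_bit and friends, are added later in this diff).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Local copies of the bit layout above; each flag is a distinct bit. */
#define BITS_TRACKED     (1 << 0)
#define BITS_FINALIZED   (1 << 1)
#define BITS_UNREACHABLE (1 << 2)
#define BITS_FROZEN      (1 << 3)
#define BITS_SHARED      (1 << 4)
#define BITS_ALIVE       (1 << 5)   /* new: reachable from a known root */
#define BITS_DEFERRED    (1 << 6)

static int  has_bit(uint8_t bits, uint8_t bit)    { return (bits & bit) != 0; }
static void set_bit(uint8_t *bits, uint8_t bit)   { *bits |= bit; }
static void clear_bit(uint8_t *bits, uint8_t bit) { *bits &= (uint8_t)~bit; }

int main(void)
{
    uint8_t ob_gc_bits = 0;   /* stands in for the per-object bit field */
    set_bit(&ob_gc_bits, BITS_TRACKED);
    set_bit(&ob_gc_bits, BITS_ALIVE);
    assert(has_bit(ob_gc_bits, BITS_ALIVE));
    assert(!has_bit(ob_gc_bits, BITS_UNREACHABLE));
    clear_bit(&ob_gc_bits, BITS_ALIVE);
    printf("bits after clearing ALIVE: 0x%02x\n", ob_gc_bits);  /* prints 0x01 */
    return 0;
}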
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index f7f44407494e51..d1023d9351086f 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -17,6 +17,17 @@
 #include "pydtrace.h"
 #include "pycore_uniqueid.h"      // _PyObject_MergeThreadLocalRefcounts()
 
+
+// enable the "mark alive" pass of GC
+#define GC_ENABLE_MARK_ALIVE 1
+
+// include additional roots in "mark alive" pass
+#define GC_MARK_ALIVE_EXTRA_ROOTS 1
+
+// include Python stacks as set of known roots
+#define GC_MARK_ALIVE_STACKS 1
+
+
 #ifdef Py_GIL_DISABLED
 
 typedef struct _gc_runtime_state GCState;
@@ -113,28 +124,66 @@ worklist_remove(struct worklist_iter *iter)
     iter->next = iter->ptr;
 }
 
+static inline int
+gc_has_bit(PyObject *op, uint8_t bit)
+{
+    return (op->ob_gc_bits & bit) != 0;
+}
+
+static inline void
+gc_set_bit(PyObject *op, uint8_t bit)
+{
+    op->ob_gc_bits |= bit;
+}
+
+static inline void
+gc_clear_bit(PyObject *op, uint8_t bit)
+{
+    op->ob_gc_bits &= ~bit;
+}
+
 static inline int
 gc_is_frozen(PyObject *op)
 {
-    return (op->ob_gc_bits & _PyGC_BITS_FROZEN) != 0;
+    return gc_has_bit(op, _PyGC_BITS_FROZEN);
 }
 
 static inline int
 gc_is_unreachable(PyObject *op)
 {
-    return (op->ob_gc_bits & _PyGC_BITS_UNREACHABLE) != 0;
+    return gc_has_bit(op, _PyGC_BITS_UNREACHABLE);
 }
 
-static void
+static inline void
 gc_set_unreachable(PyObject *op)
 {
-    op->ob_gc_bits |= _PyGC_BITS_UNREACHABLE;
+    gc_set_bit(op, _PyGC_BITS_UNREACHABLE);
 }
 
-static void
+static inline void
 gc_clear_unreachable(PyObject *op)
 {
-    op->ob_gc_bits &= ~_PyGC_BITS_UNREACHABLE;
+    gc_clear_bit(op, _PyGC_BITS_UNREACHABLE);
+}
+
+static inline int
+gc_is_alive(PyObject *op)
+{
+    return gc_has_bit(op, _PyGC_BITS_ALIVE);
+}
+
+#ifdef GC_ENABLE_MARK_ALIVE
+static inline void
+gc_set_alive(PyObject *op)
+{
+    gc_set_bit(op, _PyGC_BITS_ALIVE);
+}
+#endif
+
+static inline void
+gc_clear_alive(PyObject *op)
+{
+    gc_clear_bit(op, _PyGC_BITS_ALIVE);
 }
 
 // Initialize the `ob_tid` field to zero if the object is not already
@@ -143,6 +192,7 @@ static void
 gc_maybe_init_refs(PyObject *op)
 {
     if (!gc_is_unreachable(op)) {
+        assert(!gc_is_alive(op));
         gc_set_unreachable(op);
         op->ob_tid = 0;
     }
@@ -264,9 +314,13 @@ static void
 gc_restore_refs(PyObject *op)
 {
     if (gc_is_unreachable(op)) {
+        assert(!gc_is_alive(op));
         gc_restore_tid(op);
         gc_clear_unreachable(op);
     }
+    else {
+        gc_clear_alive(op);
+    }
 }
 
 // Given a mimalloc memory block return the PyObject stored in it or NULL if
@@ -392,6 +446,119 @@ gc_visit_thread_stacks(PyInterpreterState *interp)
     _Py_FOR_EACH_TSTATE_END(interp);
 }
 
+// Untrack objects that can never create reference cycles.
+// Return true if the object was untracked.
+static bool
+gc_maybe_untrack(PyObject *op)
+{
+    // Currently we only check for tuples containing only non-GC objects.  In
+    // theory we could check other immutable objects that contain references
+    // to non-GC objects.
+    if (PyTuple_CheckExact(op)) {
+        _PyTuple_MaybeUntrack(op);
+        if (!_PyObject_GC_IS_TRACKED(op)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+#ifdef GC_ENABLE_MARK_ALIVE
+static int
+mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+{
+    if (op == NULL) {
+        return 0;
+    }
+    if (!_PyObject_GC_IS_TRACKED(op)) {
+        return 0;
+    }
+    if (gc_is_alive(op)) {
+        return 0; // already visited this object
+    }
+    if (gc_maybe_untrack(op)) {
+        return 0; // was untracked, don't visit it
+    }
+
+    // Need to call tp_traverse on this object.  Add to stack and mark it
+    // alive so we don't traverse it a second time.
+    gc_set_alive(op);
+    if (_PyObjectStack_Push(stack, op) < 0) {
+        return -1;
+    }
+    return 0;
+}
+
+static bool
+gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
+                    void *block, size_t block_size, void *args)
+{
+    PyObject *op = op_from_block(block, args, false);
+    if (op == NULL) {
+        return true;
+    }
+    if (gc_is_alive(op)) {
+        gc_clear_alive(op);
+    }
+    return true;
+}
+
+static void
+gc_abort_mark_alive(PyInterpreterState *interp,
+                    struct collection_state *state,
+                    _PyObjectStack *stack)
+{
+    // We failed to allocate memory for "stack" while doing the "mark
+    // alive" phase.  In that case, free the object stack and make sure
+    // that no objects have the alive bit set.
+    _PyObjectStack_Clear(stack);
+    gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
+}
+
+#ifdef GC_MARK_ALIVE_STACKS
+static int
+gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+{
+    // Note: we MUST check that it is deferred before checking the rest.
+    // Otherwise we might read into invalid memory due to non-deferred references
+    // being dead already.
+    if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) {
+        PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
+        if (mark_alive_stack_push(op, stack) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+static int
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+{
+    _Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
+        for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) {
+            PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable);
+            if (executable == NULL || !PyCode_Check(executable)) {
+                continue;
+            }
+
+            PyCodeObject *co = (PyCodeObject *)executable;
+            int max_stack = co->co_nlocalsplus + co->co_stacksize;
+            if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+                return -1;
+            }
+            for (int i = 0; i < max_stack; i++) {
+                if (gc_visit_stackref_mark_alive(stack, f->localsplus[i]) < 0) {
+                    return -1;
+                }
+            }
+        }
+    }
+    _Py_FOR_EACH_TSTATE_END(interp);
+    return 0;
+}
+#endif // GC_MARK_ALIVE_STACKS
+#endif // GC_ENABLE_MARK_ALIVE
+
 static void
 queue_untracked_obj_decref(PyObject *op, struct collection_state *state)
 {
@@ -460,7 +627,8 @@ visit_decref(PyObject *op, void *arg)
 {
     if (_PyObject_GC_IS_TRACKED(op)
         && !_Py_IsImmortal(op)
-        && !gc_is_frozen(op))
+        && !gc_is_frozen(op)
+        && !gc_is_alive(op))
     {
         // If update_refs hasn't reached this object yet, mark it
         // as (tentatively) unreachable and initialize ob_tid to zero.
@@ -482,6 +650,10 @@ update_refs(const mi_heap_t *heap, const mi_heap_area_t *area,
         return true;
     }
 
+    if (gc_is_alive(op)) {
+        return true;
+    }
+
     // Exclude immortal objects from garbage collection
     if (_Py_IsImmortal(op)) {
         op->ob_tid = 0;
@@ -497,14 +669,9 @@ update_refs(const mi_heap_t *heap, const mi_heap_area_t *area,
     _PyObject_ASSERT(op, refcount >= 0);
 
     if (refcount > 0 && !_PyObject_HasDeferredRefcount(op)) {
-        // Untrack tuples and dicts as necessary in this pass, but not objects
-        // with zero refcount, which we will want to collect.
-        if (PyTuple_CheckExact(op)) {
-            _PyTuple_MaybeUntrack(op);
-            if (!_PyObject_GC_IS_TRACKED(op)) {
-                gc_restore_refs(op);
-                return true;
-            }
+        if (gc_maybe_untrack(op)) {
+            gc_restore_refs(op);
+            return true;
         }
     }
 
@@ -553,6 +720,21 @@ mark_reachable(PyObject *op)
 }
 
 #ifdef GC_DEBUG
+static bool
+validate_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
+                    void *block, size_t block_size, void *args)
+{
+    PyObject *op = op_from_block(block, args, false);
+    if (op == NULL) {
+        return true;
+    }
+
+    _PyObject_ASSERT_WITH_MSG(op, !gc_is_alive(op),
+                              "object should not be marked as alive yet");
+
+    return true;
+}
+
 static bool
 validate_refcounts(const mi_heap_t *heap, const mi_heap_area_t *area,
                    void *block, size_t block_size, void *args)
@@ -586,6 +768,11 @@ validate_gc_objects(const mi_heap_t *heap, const mi_heap_area_t *area,
         return true;
     }
 
+    if (gc_is_alive(op)) {
+        _PyObject_ASSERT(op, !gc_is_unreachable(op));
+        return true;
+    }
+
     _PyObject_ASSERT(op, gc_is_unreachable(op));
     _PyObject_ASSERT_WITH_MSG(op, gc_get_refs(op) >= 0,
                               "refcount is too small");
@@ -602,6 +789,10 @@ mark_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area,
         return true;
     }
 
+    if (gc_is_alive(op)) {
+        return true;
+    }
+
     _PyObject_ASSERT_WITH_MSG(op, gc_get_refs(op) >= 0,
                               "refcount is too small");
 
@@ -630,6 +821,7 @@ restore_refs(const mi_heap_t *heap, const mi_heap_area_t *area,
     }
     gc_restore_tid(op);
     gc_clear_unreachable(op);
+    gc_clear_alive(op);
     return true;
 }
 
@@ -679,6 +871,7 @@ scan_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area,
 
         // object is reachable, restore `ob_tid`; we're done with these objects
         gc_restore_tid(op);
+        gc_clear_alive(op);
         state->long_lived_total++;
         return true;
     }
@@ -686,6 +879,89 @@ scan_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area,
 static int
 move_legacy_finalizer_reachable(struct collection_state *state);
 
+#ifdef GC_ENABLE_MARK_ALIVE
+static int
+propagate_alive_bits(_PyObjectStack *stack)
+{
+    for (;;) {
+        PyObject *op = _PyObjectStack_Pop(stack);
+        if (op == NULL) {
+            break;
+        }
+        assert(_PyObject_GC_IS_TRACKED(op));
+        assert(gc_is_alive(op));
+        traverseproc traverse = Py_TYPE(op)->tp_traverse;
+        if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+// Using tp_traverse, mark everything reachable from known root objects
+// (which must be non-garbage) as alive (_PyGC_BITS_ALIVE is set).  In
+// most programs, this marks nearly all objects that are not actually
+// unreachable.
+//
+// Actually alive objects can still be missed in this pass: if they are
+// reachable only from an unknown root (e.g. an extension module global),
+// if some tp_traverse methods are missing or not accurate, or if they
+// have been untracked.  Objects that are only reachable from such
+// objects are also missed.
+//
+// If gc.freeze() is used, this pass is disabled since it is unlikely to
+// help much.  The next stages of cyclic GC will ignore objects with the
+// alive bit set.
+//
+// Returns -1 on failure (out of memory).
+static int
+mark_alive_from_roots(PyInterpreterState *interp,
+                      struct collection_state *state)
+{
+#ifdef GC_DEBUG
+    // Check that no objects have the alive bit set yet.
+    gc_visit_heaps(interp, &validate_alive_bits, &state->base);
+#endif
+    _PyObjectStack stack = { NULL };
+
+    #define STACK_PUSH(op) \
+        if (mark_alive_stack_push(op, &stack) < 0) { \
+            gc_abort_mark_alive(interp, state, &stack); \
+            return -1; \
+        }
+    STACK_PUSH(interp->sysdict);
+#ifdef GC_MARK_ALIVE_EXTRA_ROOTS
+    STACK_PUSH(interp->builtins);
+    STACK_PUSH(interp->dict);
+    struct types_state *types = &interp->types;
+    for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
+        STACK_PUSH(types->builtins.initialized[i].tp_dict);
+        STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+    }
+    for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
+        STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
+        STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+    }
+#endif
+#ifdef GC_MARK_ALIVE_STACKS
+    if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
+        gc_abort_mark_alive(interp, state, &stack);
+        return -1;
+    }
+#endif
+    #undef STACK_PUSH
+
+    // Use tp_traverse to find everything reachable from the roots.
+    if (propagate_alive_bits(&stack) < 0) {
+        gc_abort_mark_alive(interp, state, &stack);
+        return -1;
+    }
+
+    return 0;
+}
+#endif // GC_ENABLE_MARK_ALIVE
+
+
 static int
 deduce_unreachable_heap(PyInterpreterState *interp,
                         struct collection_state *state)
@@ -1245,6 +1521,25 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
 
     process_delayed_frees(interp, state);
 
+    #ifdef GC_ENABLE_MARK_ALIVE
+    // If gc.freeze() was used, it seems likely that doing this "mark alive"
+    // pass will not be a performance win.  Typically the majority of alive
+    // objects will be marked as frozen and will be skipped anyhow, without
+    // doing this extra work.  Doing this pass also defeats one of the
+    // purposes of using freeze: avoiding writes to objects that are frozen.
+    // So, we just skip this pass if gc.freeze() was used.
+    if (!state->gcstate->freeze_active) {
+        // Mark objects reachable from known roots as "alive".  These will
+        // be ignored for the rest of the GC pass.
+        int err = mark_alive_from_roots(interp, state);
+        if (err < 0) {
+            _PyEval_StartTheWorld(interp);
+            PyErr_NoMemory();
+            return;
+        }
+    }
+    #endif
+
     // Find unreachable objects
     int err = deduce_unreachable_heap(interp, state);
     if (err < 0) {
@@ -1253,6 +1548,11 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
         return;
     }
 
+#ifdef GC_DEBUG
+    // At this point, no object should have the alive bit set.
+    gc_visit_heaps(interp, &validate_alive_bits, &state->base);
+#endif
+
     // Print debugging information.
     if (interp->gc.debug & _PyGC_DEBUG_COLLECTABLE) {
         PyObject *op;
@@ -1564,6 +1864,8 @@ _PyGC_Freeze(PyInterpreterState *interp)
 {
     struct visitor_args args;
     _PyEval_StopTheWorld(interp);
+    GCState *gcstate = get_gc_state();
+    gcstate->freeze_active = true;
     gc_visit_heaps(interp, &visit_freeze, &args);
     _PyEval_StartTheWorld(interp);
 }
@@ -1574,7 +1876,7 @@ visit_unfreeze(const mi_heap_t *heap, const mi_heap_area_t *area,
 {
     PyObject *op = op_from_block(block, args, true);
     if (op != NULL) {
-        op->ob_gc_bits &= ~_PyGC_BITS_FROZEN;
+        gc_clear_bit(op, _PyGC_BITS_FROZEN);
     }
     return true;
 }
@@ -1584,6 +1886,8 @@ _PyGC_Unfreeze(PyInterpreterState *interp)
 {
     struct visitor_args args;
     _PyEval_StopTheWorld(interp);
+    GCState *gcstate = get_gc_state();
+    gcstate->freeze_active = false;
    gc_visit_heaps(interp, &visit_unfreeze, &args);
     _PyEval_StartTheWorld(interp);
 }
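Illustrative sketch, not part of the patch: the shape of the new pass is a depth-first traversal over an explicit stack, mirroring mark_alive_from_roots() and propagate_alive_bits() above. Push the known roots, then repeatedly pop an object and push each not-yet-marked child, marking objects alive before they are pushed so nothing is traversed twice and cycles terminate. The toy model below uses hypothetical Obj/push_if_new/mark_alive names and a fixed-size array instead of _PyObjectStack; it demonstrates only the algorithm, not the CPython implementation.

#include <stdio.h>

#define MAX_CHILDREN 4

typedef struct Obj {
    int alive;                          /* stands in for _PyGC_BITS_ALIVE */
    struct Obj *children[MAX_CHILDREN]; /* stands in for tp_traverse edges */
    int n_children;
} Obj;

/* Mark-before-push, like mark_alive_stack_push(): an object already marked
 * alive is never pushed again, so reference cycles do not loop forever. */
static void
push_if_new(Obj *op, Obj **stack, int *top)
{
    if (op == NULL || op->alive) {
        return;
    }
    op->alive = 1;
    stack[(*top)++] = op;
}

/* Mark everything reachable from the roots, analogous to the combination of
 * mark_alive_from_roots() and propagate_alive_bits() in the patch. */
static void
mark_alive(Obj **roots, int n_roots, Obj **stack)
{
    int top = 0;
    for (int i = 0; i < n_roots; i++) {
        push_if_new(roots[i], stack, &top);
    }
    while (top > 0) {
        Obj *op = stack[--top];
        for (int i = 0; i < op->n_children; i++) {
            push_if_new(op->children[i], stack, &top);
        }
    }
}

int main(void)
{
    /* a <-> b form a reachable cycle; c is not reachable from the root. */
    Obj a = {0}, b = {0}, c = {0};
    a.children[a.n_children++] = &b;
    b.children[b.n_children++] = &a;
    Obj *roots[] = { &a };
    Obj *stack[16];
    mark_alive(roots, 1, stack);
    printf("a=%d b=%d c=%d\n", a.alive, b.alive, c.alive);   /* prints 1 1 0 */
    return 0;
}

Anything left unmarked after this traversal (c in the toy example) is exactly the set the later unreachable-detection phases still have to examine, which is why the pass tends to reduce per-collection work.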