diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h
index 6b5113714dbeb2..56832960af6a14 100644
--- a/Include/internal/pycore_pymem.h
+++ b/Include/internal/pycore_pymem.h
@@ -42,6 +42,11 @@ struct _pymem_allocators {
         debug_alloc_api_t obj;
     } debug;
     PyObjectArenaAllocator obj_arena;
+    unsigned int num_gils;
+    struct {
+        PyMemAllocatorEx mem;
+        PyMemAllocatorEx obj;
+    } wrapped_with_lock;
 };
 
 
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 7d552ff57c8f1e..432c804559c9a1 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -1,6 +1,7 @@
 /* Python's malloc wrappers (see pymem.h) */
 
 #include "Python.h"
+#include "pycore_ceval.h"         // _PyEval_AcquireLock()
 #include "pycore_code.h"          // stats
 #include "pycore_object.h"        // _PyDebugAllocatorStats() definition
 #include "pycore_obmalloc.h"
@@ -210,6 +211,8 @@ _PyMem_ArenaFree(void *Py_UNUSED(ctx), void *ptr,
 #define _PyObject (_PyRuntime.allocators.standard.obj)
 #define _PyMem_Debug (_PyRuntime.allocators.debug)
 #define _PyObject_Arena (_PyRuntime.allocators.obj_arena)
+#define _PyMem_Wrapped (_PyRuntime.allocators.wrapped_with_lock.mem)
+#define _PyObject_Wrapped (_PyRuntime.allocators.wrapped_with_lock.obj)
 
 
 /***************************/
@@ -531,14 +534,32 @@ PyMem_SetupDebugHooks(void)
     PyThread_release_lock(ALLOCATORS_MUTEX);
 }
 
+static int has_locking_wrapper(PyMemAllocatorEx *allocator);
+
 static void
 get_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
 {
     switch(domain)
     {
-    case PYMEM_DOMAIN_RAW: *allocator = _PyMem_Raw; break;
-    case PYMEM_DOMAIN_MEM: *allocator = _PyMem; break;
-    case PYMEM_DOMAIN_OBJ: *allocator = _PyObject; break;
+    case PYMEM_DOMAIN_RAW:
+        *allocator = _PyMem_Raw;
+        break;
+    case PYMEM_DOMAIN_MEM:
+        if (has_locking_wrapper(&_PyMem)) {
+            *allocator = *(PyMemAllocatorEx *)_PyMem.ctx;
+        }
+        else {
+            *allocator = _PyMem;
+        }
+        break;
+    case PYMEM_DOMAIN_OBJ:
+        if (has_locking_wrapper(&_PyObject)) {
+            *allocator = *(PyMemAllocatorEx *)_PyObject.ctx;
+        }
+        else {
+            *allocator = _PyObject;
+        }
+        break;
     default:
         /* unknown domain: set all attributes to NULL */
         allocator->ctx = NULL;
@@ -549,15 +570,28 @@ get_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
     }
 }
 
+static void maybe_add_locking_wrapper(
+        PyMemAllocatorDomain, PyMemAllocatorEx *, PyMemAllocatorEx *);
+
 static void
 set_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
 {
     switch(domain)
     {
-    case PYMEM_DOMAIN_RAW: _PyMem_Raw = *allocator; break;
-    case PYMEM_DOMAIN_MEM: _PyMem = *allocator; break;
-    case PYMEM_DOMAIN_OBJ: _PyObject = *allocator; break;
-    /* ignore unknown domain */
+    case PYMEM_DOMAIN_RAW:
+        _PyMem_Raw = *allocator;
+        break;
+    case PYMEM_DOMAIN_MEM:
+        _PyMem = *allocator;
+        maybe_add_locking_wrapper(domain, &_PyMem, &_PyMem_Wrapped);
+        break;
+    case PYMEM_DOMAIN_OBJ:
+        _PyObject = *allocator;
+        maybe_add_locking_wrapper(domain, &_PyObject, &_PyObject_Wrapped);
+        break;
+    default:
+        /* ignore unknown domain */
+        return;
     }
 }
 
@@ -628,6 +662,192 @@ PyObject_SetArenaAllocator(PyObjectArenaAllocator *allocator)
  * around these uses of the runtime-global allocators state.
  */
 
+/************************************/
+/* locking around custom allocators */
+/************************************/
+
+static inline int
+should_lock(PyInterpreterState *interp)
+{
+    return interp->ceval.gil != _PyInterpreterState_Main()->ceval.gil;
+}
+
+static PyThreadState *
+get_tstate_for_main_gil(void)
+{
+    PyThreadState *tstate = _PyRuntimeState_GetFinalizing(&_PyRuntime);
+    if (tstate == NULL) {
+        /* To use its GIL, we only need the pointer and one field. */
+        static const PyThreadState _main_tstate = {
+            .interp = &_PyRuntime._main_interpreter,
+        };
+        tstate = (PyThreadState *)&_main_tstate;
+    }
+    return tstate;
+}
+
+static inline void
+acquire_custom_allocator_lock(PyThreadState *tstate)
+{
+    _PyEval_AcquireLock(tstate);
+}
+
+static inline void
+release_custom_allocator_lock(PyThreadState *tstate)
+{
+    _PyEval_ReleaseLock(tstate->interp, tstate);
+}
+
+static void *
+_PyMem_MallocLocked(void *ctx, size_t size)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            void *ptr = wrapped->malloc(wrapped->ctx, size);
+            release_custom_allocator_lock(tstate);
+            return ptr;
+        }
+    }
+    return wrapped->malloc(wrapped->ctx, size);
+}
+
+static void *
+_PyMem_CallocLocked(void *ctx, size_t nelem, size_t elsize)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            void *ptr = wrapped->calloc(wrapped->ctx, nelem, elsize);
+            release_custom_allocator_lock(tstate);
+            return ptr;
+        }
+    }
+    return wrapped->calloc(wrapped->ctx, nelem, elsize);
+}
+
+static void *
+_PyMem_ReallocLocked(void *ctx, void *ptr, size_t new_size)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            ptr = wrapped->realloc(wrapped->ctx, ptr, new_size);
+            release_custom_allocator_lock(tstate);
+            return ptr;
+        }
+    }
+    return wrapped->realloc(wrapped->ctx, ptr, new_size);
+}
+
+static void
+_PyMem_FreeLocked(void *ctx, void *ptr)
+{
+    PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
+    if (_PyRuntime.allocators.num_gils > 1) {
+        PyThreadState *tstate = _PyThreadState_GET();
+        if (should_lock(tstate->interp)) {
+            tstate = get_tstate_for_main_gil();
+            acquire_custom_allocator_lock(tstate);
+            wrapped->free(wrapped->ctx, ptr);
+            release_custom_allocator_lock(tstate);
+            return;
+        }
+    }
+    wrapped->free(wrapped->ctx, ptr);
+}
+
+static int
+has_locking_wrapper(PyMemAllocatorEx *allocator)
+{
+    if (allocator->ctx == NULL) {
+        return 0;
+    }
+    return (allocator->malloc == _PyMem_MallocLocked
+            || allocator->calloc == _PyMem_CallocLocked
+            || allocator->realloc == _PyMem_ReallocLocked
+            || allocator->free == _PyMem_FreeLocked);
+}
+
+static void
+maybe_add_locking_wrapper(PyMemAllocatorDomain domain,
+                          PyMemAllocatorEx *allocator,
+                          PyMemAllocatorEx *wrapped)
+{
+    assert(domain == PYMEM_DOMAIN_MEM || domain == PYMEM_DOMAIN_OBJ);
+
+    *wrapped = (PyMemAllocatorEx){0};
+
+    if (allocator->malloc == _PyMem_DebugMalloc) {
+        /* The debug allocator only wraps an already set allocator,
+         * which would have gone through this function already. */
+        return;
+    }
+
+    void *ctx = allocator->ctx;
+
+    /* What is the likelihood of reentrancy with the wrapper funcs?
+     * For now we assume it is effectively zero. */
+
+    if (allocator->malloc != _PyMem_RawMalloc
+#ifdef WITH_PYMALLOC
+        && allocator->malloc != _PyObject_Malloc
+#endif
+        && allocator->malloc != _PyMem_MallocLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->malloc = allocator->malloc;
+        allocator->ctx = wrapped;
+        allocator->malloc = _PyMem_MallocLocked;
+    }
+
+    if (allocator->calloc != _PyMem_RawCalloc
+#ifdef WITH_PYMALLOC
+        && allocator->calloc != _PyObject_Calloc
+#endif
+        && allocator->calloc != _PyMem_CallocLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->calloc = allocator->calloc;
+        allocator->ctx = wrapped;
+        allocator->calloc = _PyMem_CallocLocked;
+    }
+
+    if (allocator->realloc != _PyMem_RawRealloc
+#ifdef WITH_PYMALLOC
+        && allocator->realloc != _PyObject_Realloc
+#endif
+        && allocator->realloc != _PyMem_ReallocLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->realloc = allocator->realloc;
+        allocator->ctx = wrapped;
+        allocator->realloc = _PyMem_ReallocLocked;
+    }
+
+    if (allocator->free != _PyMem_RawFree
+#ifdef WITH_PYMALLOC
+        && allocator->free != _PyObject_Free
+#endif
+        && allocator->free != _PyMem_FreeLocked)
+    {
+        wrapped->ctx = ctx;
+        wrapped->free = allocator->free;
+        allocator->ctx = wrapped;
+        allocator->free = _PyMem_FreeLocked;
+    }
+}
+
+
 /*************************/
 /* the "arena" allocator */
 /*************************/
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index ceca7776a276ed..bca533dd44fc86 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -592,6 +592,7 @@ static PyStatus
 init_interp_create_gil(PyThreadState *tstate, int gil)
 {
     PyStatus status;
+    _PyRuntimeState *runtime = tstate->interp->runtime;
 
     /* finalize_interp_delete() comment explains why _PyEval_FiniGIL() is
        only called here. */
@@ -619,7 +620,19 @@ init_interp_create_gil(PyThreadState *tstate, int gil)
         return status;
     }
 
-    return _PyStatus_OK();
+    if (own_gil) {
+        HEAD_LOCK(runtime);
+        if (runtime->allocators.num_gils == INT_MAX) {
+            status = _PyStatus_ERR("allocators GIL overflow");
+        }
+        else {
+            status = _PyStatus_OK();
+            runtime->allocators.num_gils++;
+        }
+        HEAD_UNLOCK(runtime);
+    }
+
+    return status;
 }
 
 
@@ -1742,6 +1755,14 @@ finalize_interp_delete(PyInterpreterState *interp)
     /* Cleanup auto-thread-state */
     _PyGILState_Fini(interp);
 
+    _PyRuntimeState *runtime = interp->runtime;
+    if (interp->ceval.own_gil) {
+        HEAD_LOCK(runtime);
+        assert(runtime->allocators.num_gils > 0);
+        runtime->allocators.num_gils--;
+        HEAD_UNLOCK(runtime);
+    }
+
     /* We can't call _PyEval_FiniGIL() here because destroying the GIL lock
        can fail when it is being awaited by another running daemon thread (see
        bpo-9901).  Instead pycore_create_interpreter() destroys the previously
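
For reviewers, here is a minimal sketch (not part of the patch) of the kind of
embedder-side allocator hook that maybe_add_locking_wrapper() targets. Only
PyMem_GetAllocator(), PyMem_SetAllocator(), PyMemAllocatorEx, and
PYMEM_DOMAIN_MEM are the documented C API; alloc_count and the counting_*
functions are hypothetical embedder code whose shared state is not
thread-safe on its own.

#include <Python.h>

/* Hypothetical embedder state: a counter with no locking of its own.
 * Under this patch it stays consistent because, once num_gils > 1, any
 * call into these hooks from an interpreter that owns its own GIL goes
 * through the _PyMem_*Locked wrappers, which first take the main
 * interpreter's GIL. */
static size_t alloc_count = 0;
static PyMemAllocatorEx prev_mem;   /* the allocator we delegate to */

static void *
counting_malloc(void *ctx, size_t size)
{
    alloc_count++;
    return prev_mem.malloc(prev_mem.ctx, size);
}

static void *
counting_calloc(void *ctx, size_t nelem, size_t elsize)
{
    alloc_count++;
    return prev_mem.calloc(prev_mem.ctx, nelem, elsize);
}

static void *
counting_realloc(void *ctx, void *ptr, size_t new_size)
{
    alloc_count++;
    return prev_mem.realloc(prev_mem.ctx, ptr, new_size);
}

static void
counting_free(void *ctx, void *ptr)
{
    prev_mem.free(prev_mem.ctx, ptr);
}

/* Typically called before Py_Initialize().  PyMem_SetAllocator() lands
 * in set_allocator_unlocked(), which (with this patch) sees that these
 * functions are not the builtin ones and interposes the locking
 * wrappers around them. */
void
install_counting_allocator(void)
{
    PyMemAllocatorEx hook = {
        .ctx = NULL,
        .malloc = counting_malloc,
        .calloc = counting_calloc,
        .realloc = counting_realloc,
        .free = counting_free,
    };
    PyMem_GetAllocator(PYMEM_DOMAIN_MEM, &prev_mem);
    PyMem_SetAllocator(PYMEM_DOMAIN_MEM, &hook);
}

Note that get_allocator_unlocked() returns the unwrapped hook (it unwraps via
ctx when has_locking_wrapper() matches), so a later
PyMem_GetAllocator()/PyMem_SetAllocator() round-trip does not stack wrappers.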