diff --git a/Misc/NEWS.d/next/Library/2024-10-17-01-12-22.gh-issue-119109.u4hcvb.rst b/Misc/NEWS.d/next/Library/2024-10-17-01-12-22.gh-issue-119109.u4hcvb.rst new file mode 100644 index 00000000000000..782b2eb8d74c8e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-17-01-12-22.gh-issue-119109.u4hcvb.rst @@ -0,0 +1 @@ +:func:`functools.partial` calls are now faster when keyword arguments are used. diff --git a/Modules/_functoolsmodule.c b/Modules/_functoolsmodule.c index 354dbad84b5099..3f828ccc5f90f8 100644 --- a/Modules/_functoolsmodule.c +++ b/Modules/_functoolsmodule.c @@ -368,19 +368,6 @@ partial_descr_get(PyObject *self, PyObject *obj, PyObject *type) return PyMethod_New(self, obj); } -/* Merging keyword arguments using the vectorcall convention is messy, so - * if we would need to do that, we stop using vectorcall and fall back - * to using partial_call() instead. */ -Py_NO_INLINE static PyObject * -partial_vectorcall_fallback(PyThreadState *tstate, partialobject *pto, - PyObject *const *args, size_t nargsf, - PyObject *kwnames) -{ - pto->vectorcall = NULL; - Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - return _PyObject_MakeTpCall(tstate, (PyObject *)pto, args, nargs, kwnames); -} - static PyObject * partial_vectorcall(PyObject *self, PyObject *const *args, size_t nargsf, PyObject *kwnames) @@ -389,10 +376,7 @@ partial_vectorcall(PyObject *self, PyObject *const *args, PyThreadState *tstate = _PyThreadState_GET(); Py_ssize_t nargs = PyVectorcall_NARGS(nargsf); - /* pto->kw is mutable, so need to check every time */ - if (PyDict_GET_SIZE(pto->kw)) { - return partial_vectorcall_fallback(tstate, pto, args, nargsf, kwnames); - } + /* Placeholder check */ Py_ssize_t pto_phcount = pto->phcount; if (nargs < pto_phcount) { PyErr_Format(PyExc_TypeError, @@ -401,53 +385,149 @@ partial_vectorcall(PyObject *self, PyObject *const *args, return NULL; } - Py_ssize_t nargskw = nargs; - if (kwnames != NULL) { - nargskw += PyTuple_GET_SIZE(kwnames); - } - PyObject **pto_args = _PyTuple_ITEMS(pto->args); Py_ssize_t pto_nargs = PyTuple_GET_SIZE(pto->args); + Py_ssize_t pto_nkwds = PyDict_GET_SIZE(pto->kw); + Py_ssize_t nkwds = kwnames == NULL ? 0 : PyTuple_GET_SIZE(kwnames); + Py_ssize_t nargskw = nargs + nkwds; + + /* Special cases */ + if (!pto_nkwds) { + /* Fast path if we're called without arguments */ + if (nargskw == 0) { + return _PyObject_VectorcallTstate(tstate, pto->fn, pto_args, + pto_nargs, NULL); + } - /* Fast path if we're called without arguments */ - if (nargskw == 0) { - return _PyObject_VectorcallTstate(tstate, pto->fn, - pto_args, pto_nargs, NULL); + /* Use PY_VECTORCALL_ARGUMENTS_OFFSET to prepend a single + * positional argument. */ + if (pto_nargs == 1 && (nargsf & PY_VECTORCALL_ARGUMENTS_OFFSET)) { + PyObject **newargs = (PyObject **)args - 1; + PyObject *tmp = newargs[0]; + newargs[0] = pto_args[0]; + PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn, newargs, + nargs + 1, kwnames); + newargs[0] = tmp; + return ret; + } } - /* Fast path using PY_VECTORCALL_ARGUMENTS_OFFSET to prepend a single - * positional argument */ - if (pto_nargs == 1 && (nargsf & PY_VECTORCALL_ARGUMENTS_OFFSET)) { - PyObject **newargs = (PyObject **)args - 1; - PyObject *tmp = newargs[0]; - newargs[0] = pto_args[0]; - PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn, - newargs, nargs + 1, kwnames); - newargs[0] = tmp; - return ret; - } + /* Total sizes */ + Py_ssize_t tot_nargs = pto_nargs + nargs - pto_phcount; + Py_ssize_t tot_nkwds = pto_nkwds + nkwds; + Py_ssize_t tot_nargskw = tot_nargs + tot_nkwds; - PyObject *small_stack[_PY_FASTCALL_SMALL_STACK]; - PyObject **stack; + PyObject *pto_kw_merged = NULL; // pto_kw with duplicates merged (if any) + PyObject *tot_kwnames; - Py_ssize_t tot_nargskw = pto_nargs + nargskw - pto_phcount; - if (tot_nargskw <= (Py_ssize_t)Py_ARRAY_LENGTH(small_stack)) { + /* Allocate Stack + * Note, _PY_FASTCALL_SMALL_STACK is optimal for positional only + * This case might have keyword arguments + * furthermore, it might use extra stack space for temporary key storage + * thus, double small_stack size is used, which is 10 * 8 = 80 bytes */ + PyObject *small_stack[_PY_FASTCALL_SMALL_STACK * 2]; + PyObject **tmp_stack, **stack; + Py_ssize_t init_stack_size = tot_nargskw; + if (pto_nkwds) { + // If pto_nkwds, allocate additional space for temporary new keys + init_stack_size += nkwds; + } + if (init_stack_size <= (Py_ssize_t)Py_ARRAY_LENGTH(small_stack)) { stack = small_stack; } else { - stack = PyMem_Malloc(tot_nargskw * sizeof(PyObject *)); + stack = PyMem_Malloc(init_stack_size * sizeof(PyObject *)); if (stack == NULL) { - PyErr_NoMemory(); + return PyErr_NoMemory(); + } + } + + /* Copy keywords to stack */ + if (!pto_nkwds) { + tot_kwnames = kwnames; + if (nkwds) { + /* if !pto_nkwds & nkwds, then simply append kw */ + memcpy(stack + tot_nargs, args + nargs, nkwds * sizeof(PyObject*)); + } + } + else { + /* stack is now [, , , ] + * Will resize later to [, ] */ + PyObject *key, *val; + + /* Merge kw to pto_kw or add to tail (if not duplicate) */ + Py_ssize_t n_tail = 0; + for (Py_ssize_t i = 0; i < nkwds; ++i) { + key = PyTuple_GET_ITEM(kwnames, i); + val = args[nargs + i]; + if (PyDict_Contains(pto->kw, key)) { + if (pto_kw_merged == NULL) { + pto_kw_merged = PyDict_Copy(pto->kw); + if (pto_kw_merged == NULL) { + if (stack != small_stack) { + PyMem_Free(stack); + } + return NULL; + } + } + if (PyDict_SetItem(pto_kw_merged, key, val) < 0) { + Py_DECREF(pto_kw_merged); + if (stack != small_stack) { + PyMem_Free(stack); + } + return NULL; + } + } + else { + /* Copy keyword tail to stack */ + stack[tot_nargs + pto_nkwds + n_tail] = val; + stack[tot_nargskw + n_tail] = key; + n_tail++; + } + } + Py_ssize_t n_merges = nkwds - n_tail; + + /* Create total kwnames */ + tot_kwnames = PyTuple_New(tot_nkwds - n_merges); + if (tot_kwnames == NULL) { + if (stack != small_stack) { + PyMem_Free(stack); + } return NULL; } + for (Py_ssize_t i = 0; i < n_tail; ++i) { + key = Py_NewRef(stack[tot_nargskw + i]); + PyTuple_SET_ITEM(tot_kwnames, pto_nkwds + i, key); + } + + /* Copy pto_keywords with overlapping call keywords merged */ + Py_ssize_t pos = 0, i = 0; + while (PyDict_Next(n_merges ? pto_kw_merged : pto->kw, &pos, &key, &val)) { + PyTuple_SET_ITEM(tot_kwnames, i, Py_NewRef(key)); + stack[tot_nargs + i] = val; + i++; + } + Py_XDECREF(pto_kw_merged); + + /* Resize Stack if the call has keywords */ + if (nkwds && stack != small_stack) { + tmp_stack = PyMem_Realloc(stack, (tot_nargskw - n_merges) * sizeof(PyObject *)); + if (tmp_stack == NULL) { + Py_DECREF(tot_kwnames); + if (stack != small_stack) { + PyMem_Free(stack); + } + return PyErr_NoMemory(); + } + stack = tmp_stack; + } } - Py_ssize_t tot_nargs; + /* Copy Positionals to stack */ if (pto_phcount) { - tot_nargs = pto_nargs + nargs - pto_phcount; Py_ssize_t j = 0; // New args index for (Py_ssize_t i = 0; i < pto_nargs; i++) { - if (pto_args[i] == pto->placeholder) { + if (pto_args[i] == pto->placeholder){ stack[i] = args[j]; j += 1; } @@ -456,21 +536,24 @@ partial_vectorcall(PyObject *self, PyObject *const *args, } } assert(j == pto_phcount); - if (nargskw > pto_phcount) { - memcpy(stack + pto_nargs, args + j, (nargskw - j) * sizeof(PyObject*)); + /* Add remaining args from new_args */ + if (nargs > pto_phcount) { + memcpy(stack + pto_nargs, args + j, (nargs - j) * sizeof(PyObject*)); } } else { - tot_nargs = pto_nargs + nargs; - /* Copy to new stack, using borrowed references */ memcpy(stack, pto_args, pto_nargs * sizeof(PyObject*)); - memcpy(stack + pto_nargs, args, nargskw * sizeof(PyObject*)); + memcpy(stack + pto_nargs, args, nargs * sizeof(PyObject*)); } - PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn, - stack, tot_nargs, kwnames); + + PyObject *ret = _PyObject_VectorcallTstate(tstate, pto->fn, stack, + tot_nargs, tot_kwnames); if (stack != small_stack) { PyMem_Free(stack); } + if (pto_nkwds) { + Py_DECREF(tot_kwnames); + } return ret; }