From ac9270f4143d91f252a682cdf31481af99610639 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Mon, 17 May 2021 21:46:46 +0100 Subject: [PATCH 01/25] Initial conversion of abarry's patch file. --- Include/unicodeobject.h | 6 +- Modules/_pickle.c | 2 +- Objects/bytearrayobject.c | 44 +++++++++++--- Objects/bytesobject.c | 47 +++++++++++--- Objects/clinic/bytearrayobject.c.h | 82 +++++++++++++++---------- Objects/clinic/bytesobject.c.h | 84 +++++++++++++++---------- Objects/clinic/unicodeobject.c.h | 90 +++++++++++++++++---------- Objects/stringlib/split.h | 83 ++++++++++++++++++------- Objects/unicodeobject.c | 98 +++++++++++++++++++++--------- 9 files changed, 368 insertions(+), 168 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index b0ac086a6be23d..f714c3c7307f42 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -851,7 +851,8 @@ PyAPI_FUNC(void) PyUnicode_AppendAndDel( PyAPI_FUNC(PyObject*) PyUnicode_Split( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ - Py_ssize_t maxsplit /* Maxsplit count */ + Py_ssize_t maxsplit, /* Maxsplit count */ + int prune /* Whether to remove empty strings */ ); /* Dito, but split at line breaks. 
@@ -895,7 +896,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_RPartition( PyAPI_FUNC(PyObject*) PyUnicode_RSplit( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ - Py_ssize_t maxsplit /* Maxsplit count */ + Py_ssize_t maxsplit, /* Maxsplit count */ + int prune /* Whether to remove empty strings */ ); /* Translate a string by applying a character mapping table to it and diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 3e74fafb384176..c15ffec658bd5a 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1810,7 +1810,7 @@ get_dotted_path(PyObject *obj, PyObject *name) PyObject *dotted_path; Py_ssize_t i, n; - dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1); + dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1, 0); if (dotted_path == NULL) return NULL; n = PyList_GET_SIZE(dotted_path); diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index 1ab9621b1f2656..ad02ac4f288c04 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -1404,25 +1404,40 @@ bytearray.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + prune: object = None + Determines whether or not to keep empty strings in the final list. Return a list of the sections in the bytearray, using sep as the delimiter. 
[clinic start generated code]*/ static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit) -/*[clinic end generated code: output=833e2cf385d9a04d input=24f82669f41bf523]*/ + Py_ssize_t maxsplit, PyObject *prune) +/*[clinic end generated code: output=62a007e24098bdb0 input=02fd88e2131c7cae]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; @@ -1430,7 +1445,7 @@ bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, n = vsub.len; list = stringlib_split( - (PyObject*) self, s, len, sub, n, maxsplit + (PyObject*) self, s, len, sub, n, maxsplit, prune_value ); PyBuffer_Release(&vsub); return list; @@ -1521,19 +1536,32 @@ Splitting is done starting at the end of the bytearray and working to the front. 
static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit) -/*[clinic end generated code: output=a55e0b5a03cb6190 input=a68286e4dd692ffe]*/ + Py_ssize_t maxsplit, PyObject *prune) +/*[clinic end generated code: output=ef399a20ad6c8b71 input=a68286e4dd692ffe]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; @@ -1541,7 +1569,7 @@ bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, n = vsub.len; list = stringlib_rsplit( - (PyObject*) self, s, len, sub, n, maxsplit + (PyObject*) self, s, len, sub, n, maxsplit, prune_value ); PyBuffer_Release(&vsub); return list; diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index eaedb0b5689b2a..3ced4647602d2c 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1701,29 +1701,45 @@ bytes.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + prune: object = None + Determines whether or not to keep empty strings in the final list Return a list of the sections in the bytes, using sep as the delimiter. 
[clinic start generated code]*/ static PyObject * -bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/ +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune) +/*[clinic end generated code: output=cc9c523f3392cbe0 input=06605e7d3430ff7e]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit); + list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune_value); PyBuffer_Release(&vsub); return list; } @@ -1791,24 +1807,39 @@ Splitting is done starting at the end of the bytes and working to the front. 
[clinic start generated code]*/ static PyObject * -bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/ +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune) +/*[clinic end generated code: output=372b333ea8e35927 input=0f86c9f28f7d7b7b]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit); + list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, + prune_value); PyBuffer_Release(&vsub); return list; } diff --git a/Objects/clinic/bytearrayobject.c.h b/Objects/clinic/bytearrayobject.c.h index 1e3f197561523f..a2a8fe7240a830 100644 --- a/Objects/clinic/bytearrayobject.c.h +++ b/Objects/clinic/bytearrayobject.c.h @@ -366,7 +366,7 @@ bytearray_replace(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nar } PyDoc_STRVAR(bytearray_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -377,27 +377,30 @@ PyDoc_STRVAR(bytearray_split__doc__, " (space, tab, return, newline, formfeed, vertical tab).\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" 
-1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list."); #define BYTEARRAY_SPLIT_METHODDEF \ {"split", (PyCFunction)(void(*)(void))bytearray_split, METH_FASTCALL|METH_KEYWORDS, bytearray_split__doc__}, static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit); + Py_ssize_t maxsplit, PyObject *prune); static PyObject * bytearray_split(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -410,20 +413,26 @@ bytearray_split(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + prune = args[2]; skip_optional_pos: - return_value = bytearray_split_impl(self, sep, maxsplit); + return_value = bytearray_split_impl(self, sep, maxsplit, prune); 
exit: return return_value; @@ -463,7 +472,7 @@ PyDoc_STRVAR(bytearray_rpartition__doc__, {"rpartition", (PyCFunction)bytearray_rpartition, METH_O, bytearray_rpartition__doc__}, PyDoc_STRVAR(bytearray_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -475,6 +484,8 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list.\n" "\n" "Splitting is done starting at the end of the bytearray and working to the front."); @@ -483,20 +494,21 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit); + Py_ssize_t maxsplit, PyObject *prune); static PyObject * bytearray_rsplit(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -509,20 +521,26 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t narg goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + prune = args[2]; skip_optional_pos: - return_value = bytearray_rsplit_impl(self, sep, maxsplit); + return_value = bytearray_rsplit_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -1120,4 +1138,4 @@ bytearray_sizeof(PyByteArrayObject *self, PyObject *Py_UNUSED(ignored)) { return bytearray_sizeof_impl(self); } -/*[clinic end generated code: output=a82659f581e55629 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=426d72f7a29be2a2 input=a9049054013a1b77]*/ diff --git a/Objects/clinic/bytesobject.c.h b/Objects/clinic/bytesobject.c.h index 9e365ce1a088ba..be458d4f49b1cd 100644 --- a/Objects/clinic/bytesobject.c.h +++ b/Objects/clinic/bytesobject.c.h @@ -3,7 +3,7 @@ preserve [clinic start generated code]*/ PyDoc_STRVAR(bytes_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -14,26 +14,30 @@ PyDoc_STRVAR(bytes_split__doc__, " (space, tab, 
return, newline, formfeed, vertical tab).\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list"); #define BYTES_SPLIT_METHODDEF \ {"split", (PyCFunction)(void(*)(void))bytes_split, METH_FASTCALL|METH_KEYWORDS, bytes_split__doc__}, static PyObject * -bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit); +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune); static PyObject * bytes_split(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -46,20 +50,26 @@ bytes_split(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObje goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + prune = args[2]; skip_optional_pos: - return_value = bytes_split_impl(self, sep, maxsplit); + return_value = bytes_split_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -152,7 +162,7 @@ bytes_rpartition(PyBytesObject *self, PyObject *arg) } PyDoc_STRVAR(bytes_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -164,6 +174,8 @@ PyDoc_STRVAR(bytes_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list\n" "\n" "Splitting is done starting at the end of the bytes and working to the front."); @@ -171,20 +183,22 @@ PyDoc_STRVAR(bytes_rsplit__doc__, {"rsplit", (PyCFunction)(void(*)(void))bytes_rsplit, METH_FASTCALL|METH_KEYWORDS, bytes_rsplit__doc__}, static PyObject * -bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, 
Py_ssize_t maxsplit); +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune); static PyObject * bytes_rsplit(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -197,20 +211,26 @@ bytes_rsplit(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObj goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + prune = args[2]; skip_optional_pos: - return_value = bytes_rsplit_impl(self, sep, maxsplit); + return_value = bytes_rsplit_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -878,4 +898,4 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=b3f0ec2753246b9c input=a9049054013a1b77]*/ +/*[clinic end generated code: output=71549ea5861cef16 input=a9049054013a1b77]*/ diff --git 
a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index 9ef8ce2e35364c..58ed98cf2d2e4e 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -855,7 +855,7 @@ unicode_rjust(PyObject *self, PyObject *const *args, Py_ssize_t nargs) } PyDoc_STRVAR(unicode_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the words in the string, using sep as the delimiter string.\n" @@ -866,26 +866,36 @@ PyDoc_STRVAR(unicode_split__doc__, " and discard empty strings from the result.\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list.\n" +"\n" +"If maxsplit is given, at most maxsplit splits are done.\n" +"If sep is not specified or is None, any whitespace string is a separator.\n" +"If prune is given and True, empty strings are removed from the result.\n" +"If it is not given or None, the default behaviour is used: it is set to True if\n" +"sep is None, False otherwise."); #define UNICODE_SPLIT_METHODDEF \ {"split", (PyCFunction)(void(*)(void))unicode_split, METH_FASTCALL|METH_KEYWORDS, unicode_split__doc__}, static PyObject * -unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit); +unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune); static PyObject * unicode_split(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -898,20 +908,26 @@ unicode_split(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + prune = args[2]; skip_optional_pos: - return_value = unicode_split_impl(self, sep, maxsplit); + return_value = unicode_split_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -950,7 +966,7 @@ PyDoc_STRVAR(unicode_rpartition__doc__, {"rpartition", (PyCFunction)unicode_rpartition, METH_O, unicode_rpartition__doc__}, PyDoc_STRVAR(unicode_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the words in the string, using sep as the delimiter string.\n" @@ -962,6 +978,8 @@ PyDoc_STRVAR(unicode_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list.\n" "\n" "Splits are done starting at the end of the string and working to the front."); @@ -969,20 +987,22 @@ PyDoc_STRVAR(unicode_rsplit__doc__, {"rsplit", (PyCFunction)(void(*)(void))unicode_rsplit, METH_FASTCALL|METH_KEYWORDS, 
unicode_rsplit__doc__}, static PyObject * -unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit); +unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune); static PyObject * unicode_rsplit(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -995,20 +1015,26 @@ unicode_rsplit(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + prune = args[2]; skip_optional_pos: - return_value = unicode_rsplit_impl(self, sep, maxsplit); + return_value = unicode_rsplit_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -1327,4 +1353,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=f10cf85d3935b3b7 
input=a9049054013a1b77]*/ +/*[clinic end generated code: output=7ebd449cc09bcc9b input=a9049054013a1b77]*/ diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 068047f9874a07..9e57b7ff75c31a 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -53,9 +53,9 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, count=0; + Py_ssize_t i, j, k, count=0; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; @@ -64,10 +64,16 @@ STRINGLIB(split_whitespace)(PyObject* str_obj, i = j = 0; while (maxcount-- > 0) { + k = i; while (i < str_len && STRINGLIB_ISSPACE(str[i])) i++; - if (i == str_len) break; - j = i; i++; + for (; prune == 0 && k < i-1; k++) { + SPLIT_ADD(str, k, k); + } + if (i == str_len) + break; + j = i; + i++; while (i < str_len && !STRINGLIB_ISSPACE(str[i])) i++; #ifndef STRINGLIB_MUTABLE @@ -102,11 +108,12 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split_char)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; + int pruned = 0; if (list == NULL) return NULL; @@ -116,21 +123,25 @@ STRINGLIB(split_char)(PyObject* str_obj, for(; j < str_len; j++) { /* I found that using memchr makes no difference */ if (str[j] == ch) { - SPLIT_ADD(str, i, j); + if (prune == 0 || i < j) { + SPLIT_ADD(str, i, j); + } else { + pruned = 1; + } i = j = j + 1; break; } } } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - if (i <= str_len) { + if (i < 
str_len || (prune == 0 && i == str_len)) { SPLIT_ADD(str, i, str_len); } FIX_PREALLOC_SIZE(list); @@ -145,17 +156,18 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, pos, count=0; PyObject *list, *sub; + int pruned = 0; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); return NULL; } else if (sep_len == 1) - return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); + return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount, prune); list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) @@ -166,12 +178,18 @@ STRINGLIB(split)(PyObject* str_obj, pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); if (pos < 0) break; + if (prune && pos == 0) { /* Empty string; ignore */ + i += sep_len; + pruned = 1; + maxcount++; /* Don't count pruned strings in the max count */ + continue; + } j = i + pos; SPLIT_ADD(str, i, j); i = j + sep_len; } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* No match in str_obj, so just use it as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); @@ -179,7 +197,8 @@ STRINGLIB(split)(PyObject* str_obj, } else #endif { - SPLIT_ADD(str, i, str_len); + if (prune == 0 || i < str_len) + SPLIT_ADD(str, i, str_len); } FIX_PREALLOC_SIZE(list); return list; @@ -192,9 +211,9 @@ STRINGLIB(split)(PyObject* str_obj, Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, count=0; + Py_ssize_t i, j, k, count=0; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; @@ -203,10 +222,16 @@ STRINGLIB(rsplit_whitespace)(PyObject* 
str_obj, i = j = str_len - 1; while (maxcount-- > 0) { + k = i; while (i >= 0 && STRINGLIB_ISSPACE(str[i])) i--; - if (i < 0) break; - j = i; i--; + for (; prune == 0 && k > i+1; k--) { + SPLIT_ADD(str, k+1, k+1); + } + if (i < 0) + break; + j = i; + i--; while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) i--; #ifndef STRINGLIB_MUTABLE @@ -243,11 +268,12 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit_char)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; + int pruned = 0; if (list == NULL) return NULL; @@ -256,21 +282,25 @@ STRINGLIB(rsplit_char)(PyObject* str_obj, while ((i >= 0) && (maxcount-- > 0)) { for(; i >= 0; i--) { if (str[i] == ch) { - SPLIT_ADD(str, i + 1, j + 1); + if (prune == 0 || i < j) { + SPLIT_ADD(str, i + 1, j + 1); + } else { + pruned = 1; + } j = i = i - 1; break; } } } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - if (j >= -1) { + if (j > -1 || (prune == 0 && j == -1)) { SPLIT_ADD(str, 0, j + 1); } FIX_PREALLOC_SIZE(list); @@ -287,17 +317,18 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t j, pos, count=0; PyObject *list, *sub; + int pruned = 0; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); return NULL; } else if (sep_len == 1) - return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); + return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount, prune); list = 
PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) @@ -308,11 +339,17 @@ STRINGLIB(rsplit)(PyObject* str_obj, pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); if (pos < 0) break; + if (prune && pos == j - sep_len) { /* Empty string; ignore */ + j = pos; /* Step over the whole separator, not just one character */ + pruned = 1; + maxcount++; /* Don't count pruned strings in the max count */ + continue; + } SPLIT_ADD(str, pos + sep_len, j); j = pos; } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* No match in str_obj, so just use it as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 82f0b1afed444c..941ce700833054 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10335,7 +10335,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends) static PyObject * split(PyObject *self, PyObject *substring, - Py_ssize_t maxcount) + Py_ssize_t maxcount, + int prune) { int kind1, kind2; const void *buf1, *buf2; @@ -10354,22 +10355,22 @@ split(PyObject *self, if (PyUnicode_IS_ASCII(self)) return asciilib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); else return ucs1lib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_2BYTE_KIND: return ucs2lib_split_whitespace( self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_4BYTE_KIND: return ucs4lib_split_whitespace( self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); default: Py_UNREACHABLE(); @@ -10402,18 +10403,18 @@ split(PyObject *self, case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) out = asciilib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, 
len1, buf2, len2, maxcount, prune); else out = ucs1lib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_4BYTE_KIND: out = ucs4lib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -10427,7 +10428,8 @@ split(PyObject *self, static PyObject * rsplit(PyObject *self, PyObject *substring, - Py_ssize_t maxcount) + Py_ssize_t maxcount, + int prune) { int kind1, kind2; const void *buf1, *buf2; @@ -10446,22 +10448,22 @@ rsplit(PyObject *self, if (PyUnicode_IS_ASCII(self)) return asciilib_rsplit_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); else return ucs1lib_rsplit_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_2BYTE_KIND: return ucs2lib_rsplit_whitespace( self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_4BYTE_KIND: return ucs4lib_rsplit_whitespace( self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); default: Py_UNREACHABLE(); @@ -10494,18 +10496,18 @@ rsplit(PyObject *self, case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) out = asciilib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); else out = ucs1lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, 
prune); break; case PyUnicode_4BYTE_KIND: out = ucs4lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -13113,12 +13115,12 @@ unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) } PyObject * -PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) +PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int prune) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return split(s, sep, maxsplit); + return split(s, sep, maxsplit, prune); } /*[clinic input] @@ -13131,18 +13133,40 @@ str.split as unicode_split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + prune: object = None + Determines whether or not to keep empty strings in the final list. Return a list of the words in the string, using sep as the delimiter string. + +If maxsplit is given, at most maxsplit splits are done. +If sep is not specified or is None, any whitespace string is a separator. +If prune is given and True, empty strings are removed from the result. +If it is not given or None, the default behaviour is used: it is set to True if +sep is None, False otherwise. 
[clinic start generated code]*/ static PyObject * -unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/ +unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune) +/*[clinic end generated code: output=bd1017c441cc3c85 input=e5dc515c226ad88f]*/ { + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } + if (sep == Py_None) - return split(self, NULL, maxsplit); + return split(self, NULL, maxsplit, prune_value); if (PyUnicode_Check(sep)) - return split(self, sep, maxsplit); + return split(self, sep, maxsplit, prune_value); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s", @@ -13297,12 +13321,12 @@ unicode_rpartition(PyObject *self, PyObject *sep) } PyObject * -PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) +PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int prune) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return rsplit(s, sep, maxsplit); + return rsplit(s, sep, maxsplit, prune); } /*[clinic input] @@ -13314,13 +13338,27 @@ Splits are done starting at the end of the string and working to the front. 
[clinic start generated code]*/ static PyObject * -unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/ +unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune) +/*[clinic end generated code: output=bc79058595de0bf6 input=12ad4bf57dd35f15]*/ { + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } + if (sep == Py_None) - return rsplit(self, NULL, maxsplit); + return rsplit(self, NULL, maxsplit, prune_value); if (PyUnicode_Check(sep)) - return rsplit(self, sep, maxsplit); + return rsplit(self, sep, maxsplit, prune_value); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s", From 0912edefe564c248776f42512ab5a4a59acaaa51 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Mon, 17 May 2021 23:10:58 +0100 Subject: [PATCH 02/25] Switched logic to use keepempty. 
--- Include/unicodeobject.h | 4 +-- Modules/_pickle.c | 2 +- Objects/bytearrayobject.c | 48 +++++++++++++--------------- Objects/bytesobject.c | 45 +++++++++++++-------------- Objects/clinic/bytearrayobject.c.h | 30 +++++++++--------- Objects/clinic/bytesobject.c.h | 30 +++++++++--------- Objects/clinic/unicodeobject.c.h | 30 +++++++++--------- Objects/unicodeobject.c | 50 +++++++++++++++--------------- 8 files changed, 115 insertions(+), 124 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index f714c3c7307f42..f105cdd70ad52b 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -852,7 +852,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_Split( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ Py_ssize_t maxsplit, /* Maxsplit count */ - int prune /* Whether to remove empty strings */ + PyObject *keepempty /* Whether to remove empty strings */ ); /* Dito, but split at line breaks. @@ -897,7 +897,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_RSplit( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ Py_ssize_t maxsplit, /* Maxsplit count */ - int prune /* Whether to remove empty strings */ + PyObject *keepempty /* Whether to remove empty strings */ ); /* Translate a string by applying a character mapping table to it and diff --git a/Modules/_pickle.c b/Modules/_pickle.c index c15ffec658bd5a..c475b3f34f6c8d 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1810,7 +1810,7 @@ get_dotted_path(PyObject *obj, PyObject *name) PyObject *dotted_path; Py_ssize_t i, n; - dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1, 0); + dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1, Py_True); if (dotted_path == NULL) return NULL; n = PyList_GET_SIZE(dotted_path); diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index ad02ac4f288c04..6fd503155bfbd5 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -1404,7 +1404,7 @@ 
bytearray.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. - prune: object = None + keepempty: object = None Determines whether or not to keep empty strings in the final list. Return a list of the sections in the bytearray, using sep as the delimiter. @@ -1412,23 +1412,23 @@ Return a list of the sections in the bytearray, using sep as the delimiter. static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit, PyObject *prune) -/*[clinic end generated code: output=62a007e24098bdb0 input=02fd88e2131c7cae]*/ + Py_ssize_t maxsplit, PyObject *keepempty) +/*[clinic end generated code: output=28286c156d864181 input=908de7e1dd1fd8ca]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; - int prune_value; + int prune; - if (prune == Py_None) { + if (keepempty == Py_None) { if (sep == Py_None) - prune_value = 1; + prune = 1; else - prune_value = 0; + prune = 0; } else { - prune_value = PyObject_IsTrue(prune); - if (prune_value < 0) + prune = ! PyObject_IsTrue(keepempty); + if (prune < 0) return NULL; } @@ -1436,17 +1436,14 @@ bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, - prune_value); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_split( - (PyObject*) self, s, len, sub, n, maxsplit, prune_value - ); + list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } @@ -1536,23 +1533,23 @@ Splitting is done starting at the end of the bytearray and working to the front. 
static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit, PyObject *prune) -/*[clinic end generated code: output=ef399a20ad6c8b71 input=a68286e4dd692ffe]*/ + Py_ssize_t maxsplit, PyObject *keepempty) +/*[clinic end generated code: output=d8c2e7552a91a174 input=a68286e4dd692ffe]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; - int prune_value; + int prune; - if (prune == Py_None) { + if (keepempty == Py_None) { if (sep == Py_None) - prune_value = 1; + prune = 1; else - prune_value = 0; + prune = 0; } else { - prune_value = PyObject_IsTrue(prune); - if (prune_value < 0) + prune = ! PyObject_IsTrue(keepempty); + if (prune < 0) return NULL; } @@ -1560,17 +1557,14 @@ bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, - prune_value); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_rsplit( - (PyObject*) self, s, len, sub, n, maxsplit, prune_value - ); + list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 3ced4647602d2c..257a0d38cba4e2 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1701,7 +1701,7 @@ bytes.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. - prune: object = None + keepempty: object = None Determines whether or not to keep empty strings in the final list Return a list of the sections in the bytes, using sep as the delimiter. @@ -1709,37 +1709,36 @@ Return a list of the sections in the bytes, using sep as the delimiter. 
static PyObject * bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune) -/*[clinic end generated code: output=cc9c523f3392cbe0 input=06605e7d3430ff7e]*/ + PyObject *keepempty) +/*[clinic end generated code: output=e1b678240fbff2e0 input=e58ccb5eb2569eb4]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; - int prune_value; + int prune; - if (prune == Py_None) { + if (keepempty == Py_None) { if (sep == Py_None) - prune_value = 1; + prune = 1; else - prune_value = 0; + prune = 0; } else { - prune_value = PyObject_IsTrue(prune); - if (prune_value < 0) + prune = ! PyObject_IsTrue(keepempty); + if (prune < 0) return NULL; } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, - prune_value); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune_value); + list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } @@ -1808,38 +1807,36 @@ Splitting is done starting at the end of the bytes and working to the front. 
static PyObject * bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune) -/*[clinic end generated code: output=372b333ea8e35927 input=0f86c9f28f7d7b7b]*/ + PyObject *keepempty) +/*[clinic end generated code: output=0e304d20c12f7ac0 input=0f86c9f28f7d7b7b]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; - int prune_value; + int prune; - if (prune == Py_None) { + if (keepempty == Py_None) { if (sep == Py_None) - prune_value = 1; + prune = 1; else - prune_value = 0; + prune = 0; } else { - prune_value = PyObject_IsTrue(prune); - if (prune_value < 0) + prune = ! PyObject_IsTrue(keepempty); + if (prune < 0) return NULL; } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, - prune_value); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, - prune_value); + list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } diff --git a/Objects/clinic/bytearrayobject.c.h b/Objects/clinic/bytearrayobject.c.h index a2a8fe7240a830..82942a15cfed91 100644 --- a/Objects/clinic/bytearrayobject.c.h +++ b/Objects/clinic/bytearrayobject.c.h @@ -366,7 +366,7 @@ bytearray_replace(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nar } PyDoc_STRVAR(bytearray_split__doc__, -"split($self, /, sep=None, maxsplit=-1, prune=None)\n" +"split($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -378,7 +378,7 @@ PyDoc_STRVAR(bytearray_split__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" -" 
prune\n" +" keepempty\n" " Determines whether or not to keep empty strings in the final list."); #define BYTEARRAY_SPLIT_METHODDEF \ @@ -386,19 +386,19 @@ PyDoc_STRVAR(bytearray_split__doc__, static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit, PyObject *prune); + Py_ssize_t maxsplit, PyObject *keepempty); static PyObject * bytearray_split(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; - PyObject *prune = Py_None; + PyObject *keepempty = Py_None; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { @@ -430,9 +430,9 @@ bytearray_split(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs goto skip_optional_pos; } } - prune = args[2]; + keepempty = args[2]; skip_optional_pos: - return_value = bytearray_split_impl(self, sep, maxsplit, prune); + return_value = bytearray_split_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -472,7 +472,7 @@ PyDoc_STRVAR(bytearray_rpartition__doc__, {"rpartition", (PyCFunction)bytearray_rpartition, METH_O, bytearray_rpartition__doc__}, PyDoc_STRVAR(bytearray_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" +"rsplit($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -484,7 +484,7 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" -" prune\n" +" keepempty\n" " Determines whether or 
not to keep empty strings in the final list.\n" "\n" "Splitting is done starting at the end of the bytearray and working to the front."); @@ -494,19 +494,19 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit, PyObject *prune); + Py_ssize_t maxsplit, PyObject *keepempty); static PyObject * bytearray_rsplit(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; - PyObject *prune = Py_None; + PyObject *keepempty = Py_None; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { @@ -538,9 +538,9 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t narg goto skip_optional_pos; } } - prune = args[2]; + keepempty = args[2]; skip_optional_pos: - return_value = bytearray_rsplit_impl(self, sep, maxsplit, prune); + return_value = bytearray_rsplit_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -1138,4 +1138,4 @@ bytearray_sizeof(PyByteArrayObject *self, PyObject *Py_UNUSED(ignored)) { return bytearray_sizeof_impl(self); } -/*[clinic end generated code: output=426d72f7a29be2a2 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=ef8c01f3ce59b58e input=a9049054013a1b77]*/ diff --git a/Objects/clinic/bytesobject.c.h b/Objects/clinic/bytesobject.c.h index be458d4f49b1cd..2ee5e50d5effec 100644 --- a/Objects/clinic/bytesobject.c.h +++ b/Objects/clinic/bytesobject.c.h @@ -3,7 +3,7 @@ preserve [clinic start generated code]*/ PyDoc_STRVAR(bytes_split__doc__, 
-"split($self, /, sep=None, maxsplit=-1, prune=None)\n" +"split($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -15,7 +15,7 @@ PyDoc_STRVAR(bytes_split__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" -" prune\n" +" keepempty\n" " Determines whether or not to keep empty strings in the final list"); #define BYTES_SPLIT_METHODDEF \ @@ -23,19 +23,19 @@ PyDoc_STRVAR(bytes_split__doc__, static PyObject * bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune); + PyObject *keepempty); static PyObject * bytes_split(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; - PyObject *prune = Py_None; + PyObject *keepempty = Py_None; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { @@ -67,9 +67,9 @@ bytes_split(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObje goto skip_optional_pos; } } - prune = args[2]; + keepempty = args[2]; skip_optional_pos: - return_value = bytes_split_impl(self, sep, maxsplit, prune); + return_value = bytes_split_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -162,7 +162,7 @@ bytes_rpartition(PyBytesObject *self, PyObject *arg) } PyDoc_STRVAR(bytes_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" +"rsplit($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -174,7 +174,7 @@ PyDoc_STRVAR(bytes_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" -" prune\n" +" keepempty\n" " Determines whether or not to keep empty strings in the final list\n" "\n" "Splitting is done starting at the end of the bytes and working to the front."); @@ -184,19 +184,19 @@ PyDoc_STRVAR(bytes_rsplit__doc__, static PyObject * bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune); + PyObject *keepempty); static PyObject * bytes_rsplit(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; - PyObject *prune = Py_None; + PyObject *keepempty = Py_None; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { @@ -228,9 +228,9 @@ bytes_rsplit(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObj goto skip_optional_pos; } } - prune = args[2]; + keepempty = args[2]; skip_optional_pos: - return_value = bytes_rsplit_impl(self, sep, maxsplit, prune); + return_value = bytes_rsplit_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -898,4 +898,4 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=71549ea5861cef16 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=4d2edeb989e189d9 input=a9049054013a1b77]*/ diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index 58ed98cf2d2e4e..abd3c9e74427b6 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -855,7 +855,7 @@ unicode_rjust(PyObject *self, PyObject *const *args, Py_ssize_t nargs) } PyDoc_STRVAR(unicode_split__doc__, -"split($self, /, sep=None, maxsplit=-1, prune=None)\n" +"split($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the words in the string, using sep as the delimiter string.\n" @@ -867,7 +867,7 @@ PyDoc_STRVAR(unicode_split__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" -" prune\n" +" keepempty\n" " Determines whether or not to keep empty strings in the final list.\n" "\n" "If maxsplit is given, at most maxsplit splits are done.\n" @@ -881,19 +881,19 @@ PyDoc_STRVAR(unicode_split__doc__, static PyObject * unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune); + PyObject *keepempty); static PyObject * unicode_split(PyObject *self, PyObject *const *args, Py_ssize_t 
nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; - PyObject *prune = Py_None; + PyObject *keepempty = Py_None; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { @@ -925,9 +925,9 @@ unicode_split(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject goto skip_optional_pos; } } - prune = args[2]; + keepempty = args[2]; skip_optional_pos: - return_value = unicode_split_impl(self, sep, maxsplit, prune); + return_value = unicode_split_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -966,7 +966,7 @@ PyDoc_STRVAR(unicode_rpartition__doc__, {"rpartition", (PyCFunction)unicode_rpartition, METH_O, unicode_rpartition__doc__}, PyDoc_STRVAR(unicode_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" +"rsplit($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the words in the string, using sep as the delimiter string.\n" @@ -978,7 +978,7 @@ PyDoc_STRVAR(unicode_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" -" prune\n" +" keepempty\n" " Determines whether or not to keep empty strings in the final list.\n" "\n" "Splits are done starting at the end of the string and working to the front."); @@ -988,19 +988,19 @@ PyDoc_STRVAR(unicode_rsplit__doc__, static PyObject * unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune); + PyObject *keepempty); static PyObject * unicode_rsplit(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = 
NULL; - static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; - PyObject *prune = Py_None; + PyObject *keepempty = Py_None; args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { @@ -1032,9 +1032,9 @@ unicode_rsplit(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject goto skip_optional_pos; } } - prune = args[2]; + keepempty = args[2]; skip_optional_pos: - return_value = unicode_rsplit_impl(self, sep, maxsplit, prune); + return_value = unicode_rsplit_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -1353,4 +1353,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=7ebd449cc09bcc9b input=a9049054013a1b77]*/ +/*[clinic end generated code: output=d5919c82872bf04d input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 941ce700833054..e5c393bb9a8e80 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13115,12 +13115,12 @@ unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) } PyObject * -PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int prune) +PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, PyObject *keepempty) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return split(s, sep, maxsplit, prune); + return split(s, sep, maxsplit, keepempty); } /*[clinic input] @@ -13133,7 +13133,7 @@ str.split as unicode_split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. 
- prune: object = None + keepempty: object = None Determines whether or not to keep empty strings in the final list. Return a list of the words in the string, using sep as the delimiter string. @@ -13147,26 +13147,26 @@ sep is None, False otherwise. static PyObject * unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune) -/*[clinic end generated code: output=bd1017c441cc3c85 input=e5dc515c226ad88f]*/ + PyObject *keepempty) +/*[clinic end generated code: output=c182ae533ca1ef53 input=bcd1a211e53ae5a9]*/ { - int prune_value; + int prune; - if (prune == Py_None) { + if (keepempty == Py_None) { if (sep == Py_None) - prune_value = 1; + prune = 1; else - prune_value = 0; + prune = 0; } else { - prune_value = PyObject_IsTrue(prune); - if (prune_value < 0) + prune = ! PyObject_IsTrue(keepempty); + if (prune < 0) return NULL; } if (sep == Py_None) - return split(self, NULL, maxsplit, prune_value); + return split(self, NULL, maxsplit, prune); if (PyUnicode_Check(sep)) - return split(self, sep, maxsplit, prune_value); + return split(self, sep, maxsplit, prune); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s", @@ -13321,12 +13321,12 @@ unicode_rpartition(PyObject *self, PyObject *sep) } PyObject * -PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int prune) +PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, PyObject *keepempty) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return rsplit(s, sep, maxsplit, prune); + return rsplit(s, sep, maxsplit, keepempty); } /*[clinic input] @@ -13339,26 +13339,26 @@ Splits are done starting at the end of the string and working to the front. 
static PyObject * unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, - PyObject *prune) -/*[clinic end generated code: output=bc79058595de0bf6 input=12ad4bf57dd35f15]*/ + PyObject *keepempty) +/*[clinic end generated code: output=27ba2177eb6cdfcf input=12ad4bf57dd35f15]*/ { - int prune_value; + int prune; - if (prune == Py_None) { + if (keepempty == Py_None) { if (sep == Py_None) - prune_value = 1; + prune = 1; else - prune_value = 0; + prune = 0; } else { - prune_value = PyObject_IsTrue(prune); - if (prune_value < 0) + prune = PyObject_Not(keepempty); /* 1 if falsy, 0 if truthy, -1 on error */ + if (prune < 0) return NULL; } if (sep == Py_None) - return rsplit(self, NULL, maxsplit, prune_value); + return rsplit(self, NULL, maxsplit, prune); if (PyUnicode_Check(sep)) - return rsplit(self, sep, maxsplit, prune_value); + return rsplit(self, sep, maxsplit, prune); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s", From f4a101f2830a1869c8d32d79f29bef9285e9b617 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Mon, 17 May 2021 23:13:28 +0100 Subject: [PATCH 03/25] Adding in string tests. 
--- Lib/test/string_tests.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 840d7bb7550f71..9eff99ba70ad97 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -465,6 +465,22 @@ def test_split(self): self.checkraises(ValueError, 'hello', 'split', '') self.checkraises(ValueError, 'hello', 'split', '', 0) + # without args, any whitespace is a separator + self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split') + + # with sep=None, any whitespace is a separator + self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split', sep=None) + + # Without an explicit `sep`, or sep=None, empty strings are pruned from result + self.checkequal([], '', 'split') + self.checkequal([], '', 'split', sep=None) + + # With an explicit, non-None `sep`, empty strings are not pruned from result + self.checkequal([''], '', 'split', sep=',') + + # keepempty=False to remove empty strings from result + self.checkequal([], '', 'split', sep=',', keepempty=False) + def test_rsplit(self): # by a char self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|') From ade96883ceb41bd28b5417601b3a0eb63aeb24af Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Mon, 17 May 2021 23:37:20 +0100 Subject: [PATCH 04/25] Switched back to PyUnicode_Split taking an int flag. --- Include/unicodeobject.h | 6 ++---- Modules/_pickle.c | 2 +- Objects/unicodeobject.c | 8 ++++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index f105cdd70ad52b..a2fee1a212dbfd 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -845,14 +845,13 @@ PyAPI_FUNC(void) PyUnicode_AppendAndDel( At most maxsplit splits will be done. If negative, no limit is set. Separators are not included in the resulting list. 
- */ PyAPI_FUNC(PyObject*) PyUnicode_Split( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ Py_ssize_t maxsplit, /* Maxsplit count */ - PyObject *keepempty /* Whether to remove empty strings */ + int keepempty /* Whether to keep empty strings */ ); /* Dito, but split at line breaks. @@ -890,14 +889,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_RPartition( no limit is set. Separators are not included in the resulting list. - */ PyAPI_FUNC(PyObject*) PyUnicode_RSplit( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ Py_ssize_t maxsplit, /* Maxsplit count */ - PyObject *keepempty /* Whether to remove empty strings */ + int keepempty /* Whether to keep empty strings */ ); /* Translate a string by applying a character mapping table to it and diff --git a/Modules/_pickle.c b/Modules/_pickle.c index c475b3f34f6c8d..8dd755c0d7dc07 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1810,7 +1810,7 @@ get_dotted_path(PyObject *obj, PyObject *name) PyObject *dotted_path; Py_ssize_t i, n; - dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1, Py_True); + dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1, 1); if (dotted_path == NULL) return NULL; n = PyList_GET_SIZE(dotted_path); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e5c393bb9a8e80..d65b2241023cb7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13115,12 +13115,12 @@ unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) } PyObject * -PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, PyObject *keepempty) +PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return split(s, sep, maxsplit, keepempty); + return split(s, sep, maxsplit, ! 
keepempty); } /*[clinic input] @@ -13321,12 +13321,12 @@ unicode_rpartition(PyObject *self, PyObject *sep) } PyObject * -PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, PyObject *keepempty) +PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return rsplit(s, sep, maxsplit, keepempty); + return rsplit(s, sep, maxsplit, ! keepempty); } /*[clinic input] From aa4ee9adb0cc92b3faf92a360e65fe31072b5342 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Mon, 17 May 2021 23:37:34 +0100 Subject: [PATCH 05/25] Whitespace. --- Objects/unicodeobject.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d65b2241023cb7..560656df8e30d9 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10402,19 +10402,15 @@ split(PyObject *self, switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) - out = asciilib_split( - self, buf1, len1, buf2, len2, maxcount, prune); + out = asciilib_split(self, buf1, len1, buf2, len2, maxcount, prune); else - out = ucs1lib_split( - self, buf1, len1, buf2, len2, maxcount, prune); + out = ucs1lib_split(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: - out = ucs2lib_split( - self, buf1, len1, buf2, len2, maxcount, prune); + out = ucs2lib_split(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_4BYTE_KIND: - out = ucs4lib_split( - self, buf1, len1, buf2, len2, maxcount, prune); + out = ucs4lib_split(self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -10447,22 +10443,22 @@ rsplit(PyObject *self, case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) return asciilib_rsplit_whitespace( - self, PyUnicode_1BYTE_DATA(self), + self, PyUnicode_1BYTE_DATA(self), 
PyUnicode_GET_LENGTH(self), maxcount, prune ); else return ucs1lib_rsplit_whitespace( - self, PyUnicode_1BYTE_DATA(self), + self, PyUnicode_1BYTE_DATA(self), PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_2BYTE_KIND: return ucs2lib_rsplit_whitespace( - self, PyUnicode_2BYTE_DATA(self), + self, PyUnicode_2BYTE_DATA(self), PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_4BYTE_KIND: return ucs4lib_rsplit_whitespace( - self, PyUnicode_4BYTE_DATA(self), + self, PyUnicode_4BYTE_DATA(self), PyUnicode_GET_LENGTH(self), maxcount, prune ); default: @@ -10495,19 +10491,15 @@ rsplit(PyObject *self, switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) - out = asciilib_rsplit( - self, buf1, len1, buf2, len2, maxcount, prune); + out = asciilib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); else - out = ucs1lib_rsplit( - self, buf1, len1, buf2, len2, maxcount, prune); + out = ucs1lib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: - out = ucs2lib_rsplit( - self, buf1, len1, buf2, len2, maxcount, prune); + out = ucs2lib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_4BYTE_KIND: - out = ucs4lib_rsplit( - self, buf1, len1, buf2, len2, maxcount, prune); + out = ucs4lib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -13140,8 +13132,9 @@ Return a list of the words in the string, using sep as the delimiter string. If maxsplit is given, at most maxsplit splits are done. If sep is not specified or is None, any whitespace string is a separator. -If prune is given and True, empty strings are removed from the result. -If it is not given or None, the default behaviour is used: it is set to True if +If keepempty is False, empty strings are removed from the result. +If keepempty is True, empty strings are retained in the result. 
+If keepempty is not given or None, the default behaviour is used: it is set to True if sep is None, False otherwise. [clinic start generated code]*/ From 76ec07d0d8c1a7451630b590e9d310eaaf34772e Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 00:11:37 +0100 Subject: [PATCH 06/25] Regenerated. --- Objects/clinic/unicodeobject.c.h | 7 ++++--- Objects/unicodeobject.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index abd3c9e74427b6..b5c4513aa66c72 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -872,8 +872,9 @@ PyDoc_STRVAR(unicode_split__doc__, "\n" "If maxsplit is given, at most maxsplit splits are done.\n" "If sep is not specified or is None, any whitespace string is a separator.\n" -"If prune is given and True, empty strings are removed from the result.\n" -"If it is not given or None, the default behaviour is used: it is set to True if\n" +"If keepempty is False, empty strings are removed from the result.\n" +"If keepempty is True, empty strings are retained in the result.\n" +"If keepempty is not given or None, the default behaviour is used: it is set to True if\n" "sep is None, False otherwise."); #define UNICODE_SPLIT_METHODDEF \ @@ -1353,4 +1354,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=d5919c82872bf04d input=a9049054013a1b77]*/ +/*[clinic end generated code: output=e5700a0f2fa3f723 input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 560656df8e30d9..9e1850af6f0d65 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13141,7 +13141,7 @@ sep is None, False otherwise. 
static PyObject * unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, PyObject *keepempty) -/*[clinic end generated code: output=c182ae533ca1ef53 input=bcd1a211e53ae5a9]*/ +/*[clinic end generated code: output=c182ae533ca1ef53 input=2fe5525dbaaf44ee]*/ { int prune; From 38810e583fd224c1dae1d0873eb4c827f842aa0b Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 00:56:30 +0100 Subject: [PATCH 07/25] Added empty string checks to split methods. --- Objects/stringlib/split.h | 54 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 9e57b7ff75c31a..1bd3d3c417a5b9 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -56,9 +56,17 @@ STRINGLIB(split_whitespace)(PyObject* str_obj, Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, k, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; + if ((str_len == 0) && prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } + + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; @@ -111,10 +119,18 @@ STRINGLIB(split_char)(PyObject* str_obj, Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; int pruned = 0; + if ((str_len == 0) && prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } + + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; @@ -169,6 +185,13 @@ STRINGLIB(split)(PyObject* str_obj, else if (sep_len == 1) return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount, prune); + if ((str_len == 0) && prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; @@ -214,9 +237,17 @@ 
STRINGLIB(rsplit_whitespace)(PyObject* str_obj, Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, k, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; + if ((str_len == 0) && prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } + + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; @@ -271,10 +302,18 @@ STRINGLIB(rsplit_char)(PyObject* str_obj, Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; int pruned = 0; + if ((str_len == 0) && prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } + + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; @@ -330,6 +369,13 @@ STRINGLIB(rsplit)(PyObject* str_obj, else if (sep_len == 1) return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount, prune); + if ((str_len == 0) && prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; From e0468c1ca816b4b73edc420009205a752ace6fce Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 00:59:44 +0100 Subject: [PATCH 08/25] Cant autobail when string has length 0. 
--- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9e1850af6f0d65..4698ebc9178a17 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10383,7 +10383,7 @@ split(PyObject *self, kind2 = PyUnicode_KIND(substring); len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); - if (kind1 < kind2 || len1 < len2) { + if (kind1 < kind2 || (len1 > 0 && len1 < len2)) { out = PyList_New(1); if (out == NULL) return NULL; @@ -10472,7 +10472,7 @@ rsplit(PyObject *self, kind2 = PyUnicode_KIND(substring); len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); - if (kind1 < kind2 || len1 < len2) { + if (kind1 < kind2 || (len1 > 0 && len1 < len2)) { out = PyList_New(1); if (out == NULL) return NULL; From 5c844754c4f56e64fbc4f495bec971a9fb1b3bdc Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 08:44:53 +0100 Subject: [PATCH 09/25] Better small test. 
--- Objects/unicodeobject.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4698ebc9178a17..adde8a2f8bf249 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10383,13 +10383,15 @@ split(PyObject *self, kind2 = PyUnicode_KIND(substring); len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); - if (kind1 < kind2 || (len1 > 0 && len1 < len2)) { - out = PyList_New(1); - if (out == NULL) - return NULL; - Py_INCREF(self); - PyList_SET_ITEM(out, 0, self); - return out; + if (kind1 < kind2 || len1 < len2) { + if (len1 > 0 ) { + out = PyList_New(1); + if (out == NULL) + return NULL; + Py_INCREF(self); + PyList_SET_ITEM(out, 0, self); + return out; + } } buf1 = PyUnicode_DATA(self); buf2 = PyUnicode_DATA(substring); From ae0f1ee3a99287240f794e41a16ac7fe27a7dff6 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 08:45:01 +0100 Subject: [PATCH 10/25] More tests. 
--- Lib/test/string_tests.py | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 9eff99ba70ad97..413313664e50cc 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -474,12 +474,30 @@ def test_split(self): # Without an explicit `sep`, or sep=None, empty strings are pruned from result self.checkequal([], '', 'split') self.checkequal([], '', 'split', sep=None) + self.checkequal([], ' ', 'split') + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split') # With an explicit, non-None `sep`, empty strings are not pruned from result self.checkequal([''], '', 'split', sep=',') + self.checkequal(['', '', '', ''], ' ', 'split', sep=' ') + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ') + self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x') # keepempty=False to remove empty strings from result + self.checkequal([], '', 'split', keepempty=False) + self.checkequal([], ' ', 'split', keepempty=False) self.checkequal([], '', 'split', sep=',', keepempty=False) + self.checkequal([], ' ', 'split', sep=' ', keepempty=False) + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split', sep=' ', keepempty=False) + self.checkequal([' y z '], 'xx y z ', 'split', sep='x', keepempty=False) + + # keepempty=True to retain empty strings in result + self.checkequal([''], '', 'split', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'split', keepempty=True) + self.checkequal([''], '', 'split', sep=',', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'split', sep=' ', keepempty=True) + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ', keepempty=True) + self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x', keepempty=True) def test_rsplit(self): # by a char @@ -551,6 +569,34 @@ def test_rsplit(self): self.checkraises(ValueError, 'hello', 'rsplit', '') self.checkraises(ValueError, 'hello', 'rsplit', '', 0) + # Without 
an explicit `sep`, or sep=None, empty strings are pruned from result + self.checkequal([], '', 'rsplit') + self.checkequal([], '', 'rsplit', sep=None) + self.checkequal([], ' ', 'rsplit') + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit') + + # With an explicit, non-None `sep`, empty strings are not pruned from result + self.checkequal([''], '', 'rsplit', sep=',') + self.checkequal(['', '', '', ''], ' ', 'rsplit', sep=' ') + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ') + self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x') + + # keepempty=False to remove empty strings from result + self.checkequal([], '', 'rsplit', keepempty=False) + self.checkequal([], ' ', 'rsplit', keepempty=False) + self.checkequal([], '', 'rsplit', sep=',', keepempty=False) + self.checkequal([], ' ', 'rsplit', sep=' ', keepempty=False) + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit', sep=' ', keepempty=False) + self.checkequal([' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=False) + + # keepempty=True to retain empty strings in result + self.checkequal([''], '', 'rsplit', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'rsplit', keepempty=True) + self.checkequal([''], '', 'rsplit', sep=',', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'rsplit', sep=' ', keepempty=True) + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ', keepempty=True) + self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=True) + def test_replace(self): EQ = self.checkequal From b781cfcf0690f4fc99c80b088158d92e5eb9b833 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 08:49:04 +0100 Subject: [PATCH 11/25] Better small case. 
--- Objects/stringlib/split.h | 114 ++++++++++++++++++++++++++++---------- 1 file changed, 84 insertions(+), 30 deletions(-) diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 1bd3d3c417a5b9..3b721a5c5ad687 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -59,11 +59,20 @@ STRINGLIB(split_whitespace)(PyObject* str_obj, PyObject *list; PyObject *sub; - if ((str_len == 0) && prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; + if (str_len == 0) { + if (prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } else { + list = PyList_New(1); + if (list == NULL) + return NULL; + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + return list; + } } list = PyList_New(PREALLOC_SIZE(maxcount)); @@ -123,11 +132,20 @@ STRINGLIB(split_char)(PyObject* str_obj, PyObject *sub; int pruned = 0; - if ((str_len == 0) && prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; + if (str_len == 0) { + if (prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } else { + list = PyList_New(1); + if (list == NULL) + return NULL; + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + return list; + } } list = PyList_New(PREALLOC_SIZE(maxcount)); @@ -185,11 +203,20 @@ STRINGLIB(split)(PyObject* str_obj, else if (sep_len == 1) return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount, prune); - if ((str_len == 0) && prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; + if (str_len == 0) { + if (prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } else { + list = PyList_New(1); + if (list == NULL) + return NULL; + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + return list; + } } list = PyList_New(PREALLOC_SIZE(maxcount)); @@ -240,11 +267,20 @@ STRINGLIB(rsplit_whitespace)(PyObject* str_obj, 
PyObject *list; PyObject *sub; - if ((str_len == 0) && prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; + if (str_len == 0) { + if (prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } else { + list = PyList_New(1); + if (list == NULL) + return NULL; + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + return list; + } } list = PyList_New(PREALLOC_SIZE(maxcount)); @@ -306,11 +342,20 @@ STRINGLIB(rsplit_char)(PyObject* str_obj, PyObject *sub; int pruned = 0; - if ((str_len == 0) && prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; + if (str_len == 0) { + if (prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } else { + list = PyList_New(1); + if (list == NULL) + return NULL; + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + return list; + } } list = PyList_New(PREALLOC_SIZE(maxcount)); @@ -369,11 +414,20 @@ STRINGLIB(rsplit)(PyObject* str_obj, else if (sep_len == 1) return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount, prune); - if ((str_len == 0) && prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; + if (str_len == 0) { + if (prune) { + list = PyList_New(0); + if (list == NULL) + return NULL; + return list; + } else { + list = PyList_New(1); + if (list == NULL) + return NULL; + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + return list; + } } list = PyList_New(PREALLOC_SIZE(maxcount)); From 6e09909dfe2db71cd0ed4258546b92e95ef76914 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 19:32:53 +0100 Subject: [PATCH 12/25] New splitting algorithms that are compatible with maxsplit. 
--- Objects/stringlib/split.h | 179 ++++++++++---------------------------- 1 file changed, 47 insertions(+), 132 deletions(-) diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 3b721a5c5ad687..8e914a2dc37f44 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -55,64 +55,33 @@ STRINGLIB(split_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, k, count=0; + Py_ssize_t i, j, count=0; PyObject *list; PyObject *sub; - if (str_len == 0) { - if (prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; - } else { - list = PyList_New(1); - if (list == NULL) - return NULL; - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - return list; - } - } - list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; i = j = 0; - while (maxcount-- > 0) { - k = i; - while (i < str_len && STRINGLIB_ISSPACE(str[i])) - i++; - for (; prune == 0 && k < i-1; k++) { - SPLIT_ADD(str, k, k); - } - if (i == str_len) - break; - j = i; - i++; - while (i < str_len && !STRINGLIB_ISSPACE(str[i])) - i++; -#ifndef STRINGLIB_MUTABLE - if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No whitespace in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - break; + while (j < str_len && !STRINGLIB_ISSPACE(str[j])) + j++; + + while (j < str_len) { + if (j > i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, i, j); } -#endif - SPLIT_ADD(str, j, i); + j++; + i = j; + while (j < str_len && !STRINGLIB_ISSPACE(str[j])) + j++; } - if (i < str_len) { - /* Only occurs when maxcount was reached */ - /* Skip any remaining whitespace and copy to end of string */ - while (i < str_len && STRINGLIB_ISSPACE(str[i])) - i++; - if (i != str_len) - SPLIT_ADD(str, i, str_len); - } + if (i < str_len || ! 
prune) + SPLIT_ADD(str, i, str_len); + FIX_PREALLOC_SIZE(list); return list; @@ -132,52 +101,29 @@ STRINGLIB(split_char)(PyObject* str_obj, PyObject *sub; int pruned = 0; - if (str_len == 0) { - if (prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; - } else { - list = PyList_New(1); - if (list == NULL) - return NULL; - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - return list; - } - } - list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; i = j = 0; - while ((j < str_len) && (maxcount-- > 0)) { - for(; j < str_len; j++) { - /* I found that using memchr makes no difference */ - if (str[j] == ch) { - if (prune == 0 || i < j) { - SPLIT_ADD(str, i, j); - } else { - pruned = 1; - } - i = j = j + 1; + while (j < str_len && str[j] != ch) + j++; + + while (j < str_len) { + if (j > i || ! prune) { + if (count >= maxcount) break; - } + SPLIT_ADD(str, i, j); } + j++; + i = j; + while (j < str_len && str[j] != ch) + j++; } -#ifndef STRINGLIB_MUTABLE - if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* ch not in str_obj, so just use str_obj as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - } else -#endif - if (i < str_len || (prune == 0 && i == str_len)) { + + if (i < str_len || ! 
prune) SPLIT_ADD(str, i, str_len); - } + FIX_PREALLOC_SIZE(list); return list; @@ -263,64 +209,33 @@ STRINGLIB(rsplit_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, k, count=0; + Py_ssize_t i, j, count=0; PyObject *list; PyObject *sub; - if (str_len == 0) { - if (prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; - } else { - list = PyList_New(1); - if (list == NULL) - return NULL; - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - return list; - } - } - list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - i = j = str_len - 1; - while (maxcount-- > 0) { - k = i; - while (i >= 0 && STRINGLIB_ISSPACE(str[i])) - i--; - for (; prune == 0 && k > i+1; k--) { - SPLIT_ADD(str, k+1, k+1); - } - if (i < 0) - break; - j = i; - i--; - while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) - i--; -#ifndef STRINGLIB_MUTABLE - if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No whitespace in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - break; + i = j = str_len; + while (j > 0 && !STRINGLIB_ISSPACE(str[j-1])) + j--; + + while (j > 0) { + if (j < i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, j, i); } -#endif - SPLIT_ADD(str, i + 1, j + 1); + j--; + i = j; + while (j > 0 && !STRINGLIB_ISSPACE(str[j-1])) + j--; } - if (i >= 0) { - /* Only occurs when maxcount was reached */ - /* Skip any remaining whitespace and copy to beginning of string */ - while (i >= 0 && STRINGLIB_ISSPACE(str[i])) - i--; - if (i >= 0) - SPLIT_ADD(str, 0, i + 1); - } + if (i > 0 || ! 
prune) + SPLIT_ADD(str, 0, i); + FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) goto onError; From c5ccbeb0c0dbfbb5e209d18c5b13918179492095 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 19:42:00 +0100 Subject: [PATCH 13/25] Converted rstrip_char and readded optimisations. --- Objects/stringlib/split.h | 73 ++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 8e914a2dc37f44..d50e0f9e2d86ae 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -79,6 +79,14 @@ STRINGLIB(split_whitespace)(PyObject* str_obj, j++; } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && i == 0 && j == str_len && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif if (i < str_len || ! prune) SPLIT_ADD(str, i, str_len); @@ -121,6 +129,14 @@ STRINGLIB(split_char)(PyObject* str_obj, j++; } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && i == 0 && j == str_len && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif if (i < str_len || ! prune) SPLIT_ADD(str, i, str_len); @@ -233,6 +249,14 @@ STRINGLIB(rsplit_whitespace)(PyObject* str_obj, j--; } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && j == 0 && i == str_len && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif if (i > 0 || ! 
prune) SPLIT_ADD(str, 0, i); @@ -255,53 +279,38 @@ STRINGLIB(rsplit_char)(PyObject* str_obj, Py_ssize_t i, j, count=0; PyObject *list; PyObject *sub; - int pruned = 0; - - if (str_len == 0) { - if (prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; - } else { - list = PyList_New(1); - if (list == NULL) - return NULL; - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - return list; - } - } list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - i = j = str_len - 1; - while ((i >= 0) && (maxcount-- > 0)) { - for(; i >= 0; i--) { - if (str[i] == ch) { - if (prune == 0 || i < j) { - SPLIT_ADD(str, i + 1, j + 1); - } else { - pruned = 1; - } - j = i = i - 1; + i = j = str_len; + while (j > 0 && str[j-1] != ch) + j--; + + while (j > 0) { + if (j < i || ! prune) { + if (count >= maxcount) break; - } + SPLIT_ADD(str, j, i); } + j--; + i = j; + while (j > 0 && str[j-1] != ch) + j--; } + #ifndef STRINGLIB_MUTABLE - if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && j == 0 && i == str_len && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - if (j > -1 || (prune == 0 && j == -1)) { - SPLIT_ADD(str, 0, j + 1); - } + if (i > 0 || ! prune) + SPLIT_ADD(str, 0, i); + FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) goto onError; From 9e1cd1edd528e1c578157b6963c58263a1d9b0a6 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 20:18:22 +0100 Subject: [PATCH 14/25] Rewritten split to use new algorithm. 
--- Objects/stringlib/split.h | 54 +++++++++++++-------------------------- 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index d50e0f9e2d86ae..84a9fe40832581 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -154,9 +154,8 @@ STRINGLIB(split)(PyObject* str_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, pos, count=0; + Py_ssize_t i, j, offset, count=0; PyObject *list, *sub; - int pruned = 0; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); @@ -165,53 +164,36 @@ STRINGLIB(split)(PyObject* str_obj, else if (sep_len == 1) return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount, prune); - if (str_len == 0) { - if (prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; - } else { - list = PyList_New(1); - if (list == NULL) - return NULL; - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - return list; - } - } - list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - i = j = 0; - while (maxcount-- > 0) { - pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); - if (pos < 0) - break; - if (prune && pos == 0) { /* Empty string; ignore */ - i += sep_len; - pruned = 1; - maxcount++; /* Don't count pruned strings in the max count */ - continue; + i = 0; + offset = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + j = (offset >= 0) ? i + offset : -1; + + while (j >= 0) { + if (j > i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, i, j); } - j = i + pos; - SPLIT_ADD(str, i, j); i = j + sep_len; + offset = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + j = (offset >= 0) ? 
i + offset : -1; } + #ifndef STRINGLIB_MUTABLE - if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No match in str_obj, so just use it as list[0] */ + if (count == 0 && i == 0 && j < 0 && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* sep not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - { - if (prune == 0 || i < str_len) - SPLIT_ADD(str, i, str_len); - } + if (i < str_len || ! prune) + SPLIT_ADD(str, i, str_len); + FIX_PREALLOC_SIZE(list); return list; From 4463211b6c9705944db929a9c63bd4246fb1b835 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 20:30:21 +0100 Subject: [PATCH 15/25] Completed conversion of rsplit. --- Objects/stringlib/split.h | 55 ++++++++++++++------------------------- 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 84a9fe40832581..fecfa69e3ecd24 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -107,7 +107,6 @@ STRINGLIB(split_char)(PyObject* str_obj, Py_ssize_t i, j, count=0; PyObject *list; PyObject *sub; - int pruned = 0; list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) @@ -309,9 +308,8 @@ STRINGLIB(rsplit)(PyObject* str_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, Py_ssize_t maxcount, int prune) { - Py_ssize_t j, pos, count=0; + Py_ssize_t i, j, offset, count=0; PyObject *list, *sub; - int pruned = 0; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); @@ -320,51 +318,36 @@ STRINGLIB(rsplit)(PyObject* str_obj, else if (sep_len == 1) return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount, prune); - if (str_len == 0) { - if (prune) { - list = PyList_New(0); - if (list == NULL) - return NULL; - return list; - } else { - list = PyList_New(1); - if (list == NULL) - return NULL; - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - 
return list; - } - } - list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - j = str_len; - while (maxcount-- > 0) { - pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); - if (pos < 0) - break; - if (prune && pos == j-1) { - j--; - pruned = 1; - maxcount++; - continue; + i = str_len; + offset = FASTSEARCH(str, i, sep, sep_len, -1, FAST_RSEARCH); + j = (offset >= 0) ? offset + sep_len : -1; + + while (j >= 0) { + if (j < i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, j, i); } - SPLIT_ADD(str, pos + sep_len, j); - j = pos; + i = j - sep_len; + offset = FASTSEARCH(str, i, sep, sep_len, -1, FAST_RSEARCH); + j = (offset >= 0) ? offset + sep_len : -1; } + #ifndef STRINGLIB_MUTABLE - if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No match in str_obj, so just use it as list[0] */ + if (count == 0 && i == str_len && j < 0 && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* sep not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - { - SPLIT_ADD(str, 0, j); - } + if (i > 0 || ! prune) + SPLIT_ADD(str, 0, i); + FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) goto onError; From 4501026534470f7350a1f7c0055a475eebcdb2f1 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 20:45:55 +0100 Subject: [PATCH 16/25] Fixed use of ! PyObject_IsTrue. --- Objects/bytearrayobject.c | 4 ++-- Objects/bytesobject.c | 4 ++-- Objects/unicodeobject.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index 6fd503155bfbd5..1ae84670e7b0a6 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -1427,7 +1427,7 @@ bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, else prune = 0; } else { - prune = ! 
PyObject_IsTrue(keepempty); + prune = PyObject_Not(keepempty); if (prune < 0) return NULL; } @@ -1548,7 +1548,7 @@ bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, else prune = 0; } else { - prune = ! PyObject_IsTrue(keepempty); + prune = PyObject_Not(keepempty); if (prune < 0) return NULL; } diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 257a0d38cba4e2..e853b05f4f839e 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1724,7 +1724,7 @@ bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, else prune = 0; } else { - prune = ! PyObject_IsTrue(keepempty); + prune = PyObject_Not(keepempty); if (prune < 0) return NULL; } @@ -1822,7 +1822,7 @@ bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, else prune = 0; } else { - prune = ! PyObject_IsTrue(keepempty); + prune = PyObject_Not(keepempty); if (prune < 0) return NULL; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index adde8a2f8bf249..8b0152dc502aee 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13153,7 +13153,7 @@ unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, else prune = 0; } else { - prune = ! PyObject_IsTrue(keepempty); + prune = PyObject_Not(keepempty); if (prune < 0) return NULL; } @@ -13345,7 +13345,7 @@ unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, else prune = 0; } else { - prune = ! PyObject_IsTrue(keepempty); + prune = PyObject_Not(keepempty); if (prune < 0) return NULL; } From d2d6fe6c9a90f38ad44a33eb56f4759d36c7de5e Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 21:57:23 +0100 Subject: [PATCH 17/25] Fixed bug in FASTSEARCH bounds. 
--- Objects/stringlib/split.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index fecfa69e3ecd24..c82db1a466a952 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -168,7 +168,7 @@ STRINGLIB(split)(PyObject* str_obj, return NULL; i = 0; - offset = FASTSEARCH(str+i, str_len, sep, sep_len, -1, FAST_SEARCH); + offset = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); j = (offset >= 0) ? i + offset : -1; while (j >= 0) { @@ -178,7 +178,7 @@ STRINGLIB(split)(PyObject* str_obj, SPLIT_ADD(str, i, j); } i = j + sep_len; - offset = FASTSEARCH(str+i, str_len, sep, sep_len, -1, FAST_SEARCH); + offset = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); j = (offset >= 0) ? i + offset : -1; } From cbbf25dbfa9a3d8f0a1600719c24a8ae92e86644 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Tue, 18 May 2021 22:24:32 +0100 Subject: [PATCH 18/25] Added keepempty argument to UserString. --- Lib/collections/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/collections/__init__.py b/Lib/collections/__init__.py index bae0805d6686c5..7cb29b94db4770 100644 --- a/Lib/collections/__init__.py +++ b/Lib/collections/__init__.py @@ -1521,11 +1521,11 @@ def rpartition(self, sep): def rstrip(self, chars=None): return self.__class__(self.data.rstrip(chars)) - def split(self, sep=None, maxsplit=-1): - return self.data.split(sep, maxsplit) + def split(self, sep=None, maxsplit=-1, keepempty=None): + return self.data.split(sep, maxsplit, keepempty) - def rsplit(self, sep=None, maxsplit=-1): - return self.data.rsplit(sep, maxsplit) + def rsplit(self, sep=None, maxsplit=-1, keepempty=None): + return self.data.rsplit(sep, maxsplit, keepempty) def splitlines(self, keepends=False): return self.data.splitlines(keepends) From 720a6a6d3e10237df441e2bb632a89ed15c83c18 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" 
<43283697+blurb-it[bot]@users.noreply.github.com> Date: Tue, 18 May 2021 22:51:44 +0000 Subject: [PATCH 19/25] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst b/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst new file mode 100644 index 00000000000000..84a660c62ecbbf --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst @@ -0,0 +1 @@ +Add the `keepempty` argument to `string.split`, `bytes.split`, `bytearray.split` and `UserString.split`. \ No newline at end of file From 195912661350a3ba7b2e3118fd460ee4cc721e9a Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Wed, 19 May 2021 07:23:33 +0100 Subject: [PATCH 20/25] News needs to use double backticks. --- .../Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst b/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst index 84a660c62ecbbf..dccfdf56bb3425 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst @@ -1 +1,2 @@ -Add the `keepempty` argument to `string.split`, `bytes.split`, `bytearray.split` and `UserString.split`. \ No newline at end of file +Add the ``keepempty`` argument to ``str.split``, ``bytes.split``, +``bytearray.split`` and ``UserString.split``. Patch by Mark Bell.
From cea8af97753b9b8df74f16478d70e52a7e5cb5ea Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Wed, 19 May 2021 07:30:07 +0100 Subject: [PATCH 21/25] Added tests to check interaction with maxsplit. --- Lib/test/string_tests.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 413313664e50cc..e6293be3461856 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -499,6 +499,15 @@ def test_split(self): self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ', keepempty=True) self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x', keepempty=True) + # Empty strings kept with keepempty count towards maxsplit + self.checkequal(['', ' y z '], ' y z ', 'split', keepempty=True, maxsplit=1) + self.checkequal(['y', 'z '], ' y z ', 'split', keepempty=False, maxsplit=1) + self.checkequal(['y', 'z '], ' y z ', 'split', maxsplit=1) + self.checkequal(['', ' y z '], ' y z ', 'split', sep=' ', keepempty=True, maxsplit=1) + self.checkequal(['y', 'z '], ' y z ', 'split', sep=' ', keepempty=False, maxsplit=1) + self.checkequal(['', ' y z '], ' y z ', 'split', sep=' ', maxsplit=1) + + def test_rsplit(self): # by a char self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|') @@ -597,6 +606,14 @@ def test_rsplit(self): self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ', keepempty=True) self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=True) + # Empty strings kept with keepempty count towards maxsplit + self.checkequal([' y z ', ''], ' y z ', 'rsplit', keepempty=True, maxsplit=1) + self.checkequal([' y', 'z'], ' y z ', 'rsplit', keepempty=False, maxsplit=1) + self.checkequal([' y', 'z'], ' y z ', 'rsplit', maxsplit=1) + self.checkequal([' y z ', ''], ' y z ', 'rsplit', sep=' ', keepempty=True, maxsplit=1) + self.checkequal([' y', 'z'], ' y z ', 'rsplit', sep=' ', keepempty=False, maxsplit=1) + self.checkequal([' y z ', ''], ' y z ', 
'rsplit', sep=' ', maxsplit=1) + def test_replace(self): EQ = self.checkequal From 21a2fc4271e5e968d760a2a0e9c6d623408e5227 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Wed, 19 May 2021 07:38:30 +0100 Subject: [PATCH 22/25] Also documentation change to PyUnicode_Split and RSplit. --- .../Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst b/Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst new file mode 100644 index 00000000000000..295205217c250c --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst @@ -0,0 +1,2 @@ +Added ``keepempty`` flag to ``PyUnicode_Split`` and ``PyUnicode_RSplit``. +Patch added my Mark Bell. From e7828fcca76108eb8cbe57b5c63fc2a083541727 Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Wed, 19 May 2021 17:42:00 +0100 Subject: [PATCH 23/25] Created separate PyUnicode_SplitWithKeepempty and PyUnicode_RSplitWithKeepempty. --- Include/unicodeobject.h | 18 +++++++++++ .../2021-05-19-17-39-54.bpo-28937.hZ42f3.rst | 5 ++++ .../2021-05-19-07-37-12.bpo-28937.SkwyIg.rst | 2 -- Modules/_pickle.c | 2 +- Objects/unicodeobject.c | 30 +++++++++++++++++-- 5 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst delete mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index a2fee1a212dbfd..f261a000b7c3f8 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -845,9 +845,18 @@ PyAPI_FUNC(void) PyUnicode_AppendAndDel( At most maxsplit splits will be done. If negative, no limit is set. Separators are not included in the resulting list. 
+ */ PyAPI_FUNC(PyObject*) PyUnicode_Split( + PyObject *s, /* String to split */ + PyObject *sep, /* String separator */ + Py_ssize_t maxsplit /* Maxsplit count */ + ); + +/* Ditto, but also take a flag indicating whether to keep or remove empty strings */ + +PyAPI_FUNC(PyObject*) PyUnicode_SplitWithKeepempty( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ Py_ssize_t maxsplit, /* Maxsplit count */ @@ -889,9 +898,18 @@ PyAPI_FUNC(PyObject*) PyUnicode_RPartition( no limit is set. Separators are not included in the resulting list. + */ PyAPI_FUNC(PyObject*) PyUnicode_RSplit( + PyObject *s, /* String to split */ + PyObject *sep, /* String separator */ + Py_ssize_t maxsplit /* Maxsplit count */ + ); + +/* Ditto, but also take a flag indicating whether to keep or remove empty strings */ + +PyAPI_FUNC(PyObject*) PyUnicode_RSplitWithKeepempty( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ Py_ssize_t maxsplit, /* Maxsplit count */ diff --git a/Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst b/Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst new file mode 100644 index 00000000000000..6f52bf26948f6b --- /dev/null +++ b/Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst @@ -0,0 +1,5 @@ +Added ``PyUnicode_SplitWithKeepempty`` and ``PyUnicode_RSplitWithKeepempty`` +which act like ``PyUnicode_Split`` and ``PyUnicode_RSplit`` but provides an +additional flag ``keepempty``. This flag specifies whether empty strings are +to be retained or dropped, instead of this behaviour depending on the +whether the separator being split on is None. 
Patch by Mark Bell diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst b/Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst deleted file mode 100644 index 295205217c250c..00000000000000 --- a/Misc/NEWS.d/next/Core and Builtins/2021-05-19-07-37-12.bpo-28937.SkwyIg.rst +++ /dev/null @@ -1,2 +0,0 @@ -Added ``keepempty`` flag to ``PyUnicode_Split`` and ``PyUnicode_RSplit``. -Patch added my Mark Bell. diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 8dd755c0d7dc07..3e74fafb384176 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1810,7 +1810,7 @@ get_dotted_path(PyObject *obj, PyObject *name) PyObject *dotted_path; Py_ssize_t i, n; - dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1, 1); + dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1); if (dotted_path == NULL) return NULL; n = PyList_GET_SIZE(dotted_path); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8b0152dc502aee..0dab8694d088af 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13109,7 +13109,20 @@ unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) } PyObject * -PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) +PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) +{ + int prune; + + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) + return NULL; + + prune = (sep == Py_None) ? 
1 : 0; + + return split(s, sep, maxsplit, prune); +} + +PyObject * +PyUnicode_SplitWithKeepempty(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; @@ -13316,7 +13329,20 @@ unicode_rpartition(PyObject *self, PyObject *sep) } PyObject * -PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) +PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) +{ + int prune; + + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) + return NULL; + + prune = (sep == Py_None) ? 1 : 0; + + return rsplit(s, sep, maxsplit, prune); +} + +PyObject * +PyUnicode_RSplitWithKeepempty(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; From dc610bbcc3384f5bf72e097970e59d8160b2b57d Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Wed, 19 May 2021 18:05:42 +0100 Subject: [PATCH 24/25] Added PyUnicode_SplitWithKeepempty to API/ABI. 
--- Doc/data/stable_abi.dat | 2 ++ Misc/stable_abi.txt | 6 ++++++ PC/python3dll.c | 2 ++ 3 files changed, 10 insertions(+) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index d582204f5626b9..0156c7f6341835 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -745,11 +745,13 @@ function,PyUnicode_Join,3.2, function,PyUnicode_Partition,3.2, function,PyUnicode_RPartition,3.2, function,PyUnicode_RSplit,3.2, +function,PyUnicode_RSplitWithKeepempty,3.10, function,PyUnicode_ReadChar,3.7, function,PyUnicode_Replace,3.2, function,PyUnicode_Resize,3.2, function,PyUnicode_RichCompare,3.2, function,PyUnicode_Split,3.2, +function,PyUnicode_SplitWithKeepempty,3.10, function,PyUnicode_Splitlines,3.2, function,PyUnicode_Substring,3.7, function,PyUnicode_Tailmatch,3.2, diff --git a/Misc/stable_abi.txt b/Misc/stable_abi.txt index a78bcb76b41df6..d41dac81058d07 100644 --- a/Misc/stable_abi.txt +++ b/Misc/stable_abi.txt @@ -2148,6 +2148,12 @@ function PyGC_Enable function PyGC_IsEnabled added 3.10 +# New unicode split functions which support the keepempty flag (https://bugs.python.org/issue28937) + +function PyUnicode_SplitWithKeepempty + added 3.10 +function PyUnicode_RSplitWithKeepempty + added 3.10 # (Detailed comments aren't really needed for further entries: from here on # we can use version control logs.) 
diff --git a/PC/python3dll.c b/PC/python3dll.c index 200d1d14e294d1..79adfb65265cc8 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -677,8 +677,10 @@ EXPORT_FUNC(PyUnicode_Resize) EXPORT_FUNC(PyUnicode_RichCompare) EXPORT_FUNC(PyUnicode_RPartition) EXPORT_FUNC(PyUnicode_RSplit) +EXPORT_FUNC(PyUnicode_RSplitWithKeepempty) EXPORT_FUNC(PyUnicode_Split) EXPORT_FUNC(PyUnicode_Splitlines) +EXPORT_FUNC(PyUnicode_SplitWithKeepempty) EXPORT_FUNC(PyUnicode_Substring) EXPORT_FUNC(PyUnicode_Tailmatch) EXPORT_FUNC(PyUnicode_Translate) From f95b254c4d900ccf5ee8877b1b9185473056e01e Mon Sep 17 00:00:00 2001 From: Mark Bell Date: Wed, 19 May 2021 19:01:16 +0100 Subject: [PATCH 25/25] Realised C API interfaces to new keepempty flag is not mandatory. --- Doc/data/stable_abi.dat | 2 -- Include/unicodeobject.h | 18 ------------------ .../2021-05-19-17-39-54.bpo-28937.hZ42f3.rst | 5 ----- Misc/stable_abi.txt | 6 ------ Objects/unicodeobject.c | 18 ------------------ PC/python3dll.c | 2 -- 6 files changed, 51 deletions(-) delete mode 100644 Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 0156c7f6341835..d582204f5626b9 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -745,13 +745,11 @@ function,PyUnicode_Join,3.2, function,PyUnicode_Partition,3.2, function,PyUnicode_RPartition,3.2, function,PyUnicode_RSplit,3.2, -function,PyUnicode_RSplitWithKeepempty,3.10, function,PyUnicode_ReadChar,3.7, function,PyUnicode_Replace,3.2, function,PyUnicode_Resize,3.2, function,PyUnicode_RichCompare,3.2, function,PyUnicode_Split,3.2, -function,PyUnicode_SplitWithKeepempty,3.10, function,PyUnicode_Splitlines,3.2, function,PyUnicode_Substring,3.7, function,PyUnicode_Tailmatch,3.2, diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index f261a000b7c3f8..b0ac086a6be23d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -854,15 +854,6 @@ PyAPI_FUNC(PyObject*) 
PyUnicode_Split( Py_ssize_t maxsplit /* Maxsplit count */ ); -/* Ditto, but also take a flag indicating whether to keep or remove empty strings */ - -PyAPI_FUNC(PyObject*) PyUnicode_SplitWithKeepempty( - PyObject *s, /* String to split */ - PyObject *sep, /* String separator */ - Py_ssize_t maxsplit, /* Maxsplit count */ - int keepempty /* Whether to remove empty strings */ - ); - /* Dito, but split at line breaks. CRLF is considered to be one line break. Line breaks are not @@ -907,15 +898,6 @@ PyAPI_FUNC(PyObject*) PyUnicode_RSplit( Py_ssize_t maxsplit /* Maxsplit count */ ); -/* Ditto, but also take a flag indicating whether to keep or remove empty strings */ - -PyAPI_FUNC(PyObject*) PyUnicode_RSplitWithKeepempty( - PyObject *s, /* String to split */ - PyObject *sep, /* String separator */ - Py_ssize_t maxsplit, /* Maxsplit count */ - int keepempty /* Whether to remove empty strings */ - ); - /* Translate a string by applying a character mapping table to it and return the resulting Unicode object. diff --git a/Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst b/Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst deleted file mode 100644 index 6f52bf26948f6b..00000000000000 --- a/Misc/NEWS.d/next/C API/2021-05-19-17-39-54.bpo-28937.hZ42f3.rst +++ /dev/null @@ -1,5 +0,0 @@ -Added ``PyUnicode_SplitWithKeepempty`` and ``PyUnicode_RSplitWithKeepempty`` -which act like ``PyUnicode_Split`` and ``PyUnicode_RSplit`` but provides an -additional flag ``keepempty``. This flag specifies whether empty strings are -to be retained or dropped, instead of this behaviour depending on the -whether the separator being split on is None. 
Patch by Mark Bell diff --git a/Misc/stable_abi.txt b/Misc/stable_abi.txt index d41dac81058d07..a78bcb76b41df6 100644 --- a/Misc/stable_abi.txt +++ b/Misc/stable_abi.txt @@ -2148,12 +2148,6 @@ function PyGC_Enable function PyGC_IsEnabled added 3.10 -# New unicode split functions which support the keepempty flag (https://bugs.python.org/issue28937) - -function PyUnicode_SplitWithKeepempty - added 3.10 -function PyUnicode_RSplitWithKeepempty - added 3.10 # (Detailed comments aren't really needed for further entries: from here on # we can use version control logs.) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0dab8694d088af..c584af85f741db 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13121,15 +13121,6 @@ PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) return split(s, sep, maxsplit, prune); } -PyObject * -PyUnicode_SplitWithKeepempty(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) -{ - if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) - return NULL; - - return split(s, sep, maxsplit, ! keepempty); -} - /*[clinic input] str.split as unicode_split @@ -13341,15 +13332,6 @@ PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) return rsplit(s, sep, maxsplit, prune); } -PyObject * -PyUnicode_RSplitWithKeepempty(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int keepempty) -{ - if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) - return NULL; - - return rsplit(s, sep, maxsplit, ! 
keepempty); -} - /*[clinic input] str.rsplit as unicode_rsplit = str.split diff --git a/PC/python3dll.c b/PC/python3dll.c index 79adfb65265cc8..200d1d14e294d1 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -677,10 +677,8 @@ EXPORT_FUNC(PyUnicode_Resize) EXPORT_FUNC(PyUnicode_RichCompare) EXPORT_FUNC(PyUnicode_RPartition) EXPORT_FUNC(PyUnicode_RSplit) -EXPORT_FUNC(PyUnicode_RSplitWithKeepempty) EXPORT_FUNC(PyUnicode_Split) EXPORT_FUNC(PyUnicode_Splitlines) -EXPORT_FUNC(PyUnicode_SplitWithKeepempty) EXPORT_FUNC(PyUnicode_Substring) EXPORT_FUNC(PyUnicode_Tailmatch) EXPORT_FUNC(PyUnicode_Translate)