diff --git a/Lib/collections/__init__.py b/Lib/collections/__init__.py index d989d85d6d8293..1fa6a4aca13d68 100644 --- a/Lib/collections/__init__.py +++ b/Lib/collections/__init__.py @@ -1529,11 +1529,11 @@ def rpartition(self, sep): def rstrip(self, chars=None): return self.__class__(self.data.rstrip(chars)) - def split(self, sep=None, maxsplit=-1): - return self.data.split(sep, maxsplit) + def split(self, sep=None, maxsplit=-1, keepempty=None): + return self.data.split(sep, maxsplit, keepempty) - def rsplit(self, sep=None, maxsplit=-1): - return self.data.rsplit(sep, maxsplit) + def rsplit(self, sep=None, maxsplit=-1, keepempty=None): + return self.data.rsplit(sep, maxsplit, keepempty) def splitlines(self, keepends=False): return self.data.splitlines(keepends) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 0d4c7ecf4a04f2..27a0a0b380cc73 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -468,6 +468,49 @@ def test_split(self): self.checkraises(ValueError, 'hello', 'split', '') self.checkraises(ValueError, 'hello', 'split', '', 0) + # without args, any whitespace is a separator + self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split') + + # with sep=None, any whitespace is a separator + self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split', sep=None) + + # Without an explicit `sep`, or sep=None, empty strings are pruned from result + self.checkequal([], '', 'split') + self.checkequal([], '', 'split', sep=None) + self.checkequal([], ' ', 'split') + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split') + + # With an explicit, non-None `sep`, empty strings are not pruned from result + self.checkequal([''], '', 'split', sep=',') + self.checkequal(['', '', '', ''], ' ', 'split', sep=' ') + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ') + self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x') + + # keepempty=False to remove empty strings from result + 
self.checkequal([], '', 'split', keepempty=False) + self.checkequal([], ' ', 'split', keepempty=False) + self.checkequal([], '', 'split', sep=',', keepempty=False) + self.checkequal([], ' ', 'split', sep=' ', keepempty=False) + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split', sep=' ', keepempty=False) + self.checkequal([' y z '], 'xx y z ', 'split', sep='x', keepempty=False) + + # keepempty=True to retain empty strings in result + self.checkequal([''], '', 'split', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'split', keepempty=True) + self.checkequal([''], '', 'split', sep=',', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'split', sep=' ', keepempty=True) + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ', keepempty=True) + self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x', keepempty=True) + + # Empty strings kept with keepempty count towards maxsplit + self.checkequal(['', ' y z '], ' y z ', 'split', keepempty=True, maxsplit=1) + self.checkequal(['y', 'z '], ' y z ', 'split', keepempty=False, maxsplit=1) + self.checkequal(['y', 'z '], ' y z ', 'split', maxsplit=1) + self.checkequal(['', ' y z '], ' y z ', 'split', sep=' ', keepempty=True, maxsplit=1) + self.checkequal(['y', 'z '], ' y z ', 'split', sep=' ', keepempty=False, maxsplit=1) + self.checkequal(['', ' y z '], ' y z ', 'split', sep=' ', maxsplit=1) + + def test_rsplit(self): # by a char self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|') @@ -538,6 +581,42 @@ def test_rsplit(self): self.checkraises(ValueError, 'hello', 'rsplit', '') self.checkraises(ValueError, 'hello', 'rsplit', '', 0) + # Without an explicit `sep`, or sep=None, empty strings are pruned from result + self.checkequal([], '', 'rsplit') + self.checkequal([], '', 'rsplit', sep=None) + self.checkequal([], ' ', 'rsplit') + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit') + + # With an explicit, non-None `sep`, empty strings are not pruned from result + 
self.checkequal([''], '', 'rsplit', sep=',') + self.checkequal(['', '', '', ''], ' ', 'rsplit', sep=' ') + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ') + self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x') + + # keepempty=False to remove empty strings from result + self.checkequal([], '', 'rsplit', keepempty=False) + self.checkequal([], ' ', 'rsplit', keepempty=False) + self.checkequal([], '', 'rsplit', sep=',', keepempty=False) + self.checkequal([], ' ', 'rsplit', sep=' ', keepempty=False) + self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit', sep=' ', keepempty=False) + self.checkequal([' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=False) + + # keepempty=True to retain empty strings in result + self.checkequal([''], '', 'rsplit', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'rsplit', keepempty=True) + self.checkequal([''], '', 'rsplit', sep=',', keepempty=True) + self.checkequal(['', '', '', ''], ' ', 'rsplit', sep=' ', keepempty=True) + self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ', keepempty=True) + self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=True) + + # Empty strings kept with keepempty count towards maxsplit + self.checkequal([' y z ', ''], ' y z ', 'rsplit', keepempty=True, maxsplit=1) + self.checkequal([' y', 'z'], ' y z ', 'rsplit', keepempty=False, maxsplit=1) + self.checkequal([' y', 'z'], ' y z ', 'rsplit', maxsplit=1) + self.checkequal([' y z ', ''], ' y z ', 'rsplit', sep=' ', keepempty=True, maxsplit=1) + self.checkequal([' y', 'z'], ' y z ', 'rsplit', sep=' ', keepempty=False, maxsplit=1) + self.checkequal([' y z ', ''], ' y z ', 'rsplit', sep=' ', maxsplit=1) + def test_replace(self): EQ = self.checkequal diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst b/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst new file mode 100644 index 00000000000000..dccfdf56bb3425 --- /dev/null +++ 
b/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst @@ -0,0 +1,2 @@ +Add the ``keepempty`` argument to the ``split`` and ``rsplit`` methods of +``str``, ``bytes``, ``bytearray`` and ``UserString``. Patch by Mark Bell. diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index a6009854221ff5..8fd80d38e29150 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -1404,34 +1404,46 @@ bytearray.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + keepempty: object = None + Determines whether or not to keep empty strings in the final list. Return a list of the sections in the bytearray, using sep as the delimiter. [clinic start generated code]*/ static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit) -/*[clinic end generated code: output=833e2cf385d9a04d input=24f82669f41bf523]*/ + Py_ssize_t maxsplit, PyObject *keepempty) +/*[clinic end generated code: output=28286c156d864181 input=908de7e1dd1fd8ca]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; + int prune; + + if (keepempty == Py_None) { + if (sep == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_Not(keepempty); + if (prune < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_split( - (PyObject*) self, s, len, sub, n, maxsplit - ); + list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } @@ -1521,28 +1533,38 @@ Splitting is done starting at the end of the bytearray and working to the front. 
static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit) -/*[clinic end generated code: output=a55e0b5a03cb6190 input=a68286e4dd692ffe]*/ + Py_ssize_t maxsplit, PyObject *keepempty) +/*[clinic end generated code: output=d8c2e7552a91a174 input=a68286e4dd692ffe]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; + int prune; + + if (keepempty == Py_None) { + if (sep == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_Not(keepempty); + if (prune < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_rsplit( - (PyObject*) self, s, len, sub, n, maxsplit - ); + list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 2f7e0a6dde6fe0..34c80df0a58a72 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1723,29 +1723,44 @@ bytes.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + keepempty: object = None + Determines whether or not to keep empty strings in the final list Return a list of the sections in the bytes, using sep as the delimiter. 
[clinic start generated code]*/ static PyObject * -bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/ +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty) +/*[clinic end generated code: output=e1b678240fbff2e0 input=e58ccb5eb2569eb4]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; + int prune; + + if (keepempty == Py_None) { + if (sep == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_Not(keepempty); + if (prune < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit); + list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } @@ -1813,24 +1828,37 @@ Splitting is done starting at the end of the bytes and working to the front. 
[clinic start generated code]*/ static PyObject * -bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/ +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty) +/*[clinic end generated code: output=0e304d20c12f7ac0 input=0f86c9f28f7d7b7b]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; + int prune; + + if (keepempty == Py_None) { + if (sep == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_Not(keepempty); + if (prune < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit); + list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune); PyBuffer_Release(&vsub); return list; } diff --git a/Objects/clinic/bytearrayobject.c.h b/Objects/clinic/bytearrayobject.c.h index 1e3f197561523f..82942a15cfed91 100644 --- a/Objects/clinic/bytearrayobject.c.h +++ b/Objects/clinic/bytearrayobject.c.h @@ -366,7 +366,7 @@ bytearray_replace(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nar } PyDoc_STRVAR(bytearray_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -377,27 +377,30 @@ PyDoc_STRVAR(bytearray_split__doc__, " (space, tab, return, newline, formfeed, vertical tab).\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" -1 (the default value) means no 
limit.\n" +" keepempty\n" +" Determines whether or not to keep empty strings in the final list."); #define BYTEARRAY_SPLIT_METHODDEF \ {"split", (PyCFunction)(void(*)(void))bytearray_split, METH_FASTCALL|METH_KEYWORDS, bytearray_split__doc__}, static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit); + Py_ssize_t maxsplit, PyObject *keepempty); static PyObject * bytearray_split(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *keepempty = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -410,20 +413,26 @@ bytearray_split(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + keepempty = args[2]; skip_optional_pos: - return_value = bytearray_split_impl(self, sep, maxsplit); + return_value = bytearray_split_impl(self, sep, maxsplit, keepempty); exit: 
return return_value; @@ -463,7 +472,7 @@ PyDoc_STRVAR(bytearray_rpartition__doc__, {"rpartition", (PyCFunction)bytearray_rpartition, METH_O, bytearray_rpartition__doc__}, PyDoc_STRVAR(bytearray_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -475,6 +484,8 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" keepempty\n" +" Determines whether or not to keep empty strings in the final list.\n" "\n" "Splitting is done starting at the end of the bytearray and working to the front."); @@ -483,20 +494,21 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit); + Py_ssize_t maxsplit, PyObject *keepempty); static PyObject * bytearray_rsplit(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *keepempty = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -509,20 +521,26 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *const *args, Py_ssize_t narg goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + keepempty = args[2]; skip_optional_pos: - return_value = bytearray_rsplit_impl(self, sep, maxsplit); + return_value = bytearray_rsplit_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -1120,4 +1138,4 @@ bytearray_sizeof(PyByteArrayObject *self, PyObject *Py_UNUSED(ignored)) { return bytearray_sizeof_impl(self); } -/*[clinic end generated code: output=a82659f581e55629 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=ef8c01f3ce59b58e input=a9049054013a1b77]*/ diff --git a/Objects/clinic/bytesobject.c.h b/Objects/clinic/bytesobject.c.h index 103a3642813218..d32e2b1c0737bd 100644 --- a/Objects/clinic/bytesobject.c.h +++ b/Objects/clinic/bytesobject.c.h @@ -21,7 +21,7 @@ bytes___bytes__(PyBytesObject *self, PyObject *Py_UNUSED(ignored)) } PyDoc_STRVAR(bytes_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -32,26 +32,30 @@ 
PyDoc_STRVAR(bytes_split__doc__, " (space, tab, return, newline, formfeed, vertical tab).\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" -1 (the default value) means no limit.\n" +" keepempty\n" +" Determines whether or not to keep empty strings in the final list"); #define BYTES_SPLIT_METHODDEF \ {"split", (PyCFunction)(void(*)(void))bytes_split, METH_FASTCALL|METH_KEYWORDS, bytes_split__doc__}, static PyObject * -bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit); +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty); static PyObject * bytes_split(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "split", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *keepempty = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -64,20 +68,26 @@ bytes_split(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObje goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + keepempty = args[2]; skip_optional_pos: - return_value = bytes_split_impl(self, sep, maxsplit); + return_value = bytes_split_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -170,7 +180,7 @@ bytes_rpartition(PyBytesObject *self, PyObject *arg) } PyDoc_STRVAR(bytes_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -182,6 +192,8 @@ PyDoc_STRVAR(bytes_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" keepempty\n" +" Determines whether or not to keep empty strings in the final list\n" "\n" "Splitting is done starting at the end of the bytes and working to the front."); @@ -189,20 +201,22 @@ PyDoc_STRVAR(bytes_rsplit__doc__, {"rsplit", (PyCFunction)(void(*)(void))bytes_rsplit, METH_FASTCALL|METH_KEYWORDS, bytes_rsplit__doc__}, static PyObject * -bytes_rsplit_impl(PyBytesObject 
*self, PyObject *sep, Py_ssize_t maxsplit); +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty); static PyObject * bytes_rsplit(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *keepempty = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -215,20 +229,26 @@ bytes_rsplit(PyBytesObject *self, PyObject *const *args, Py_ssize_t nargs, PyObj goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + keepempty = args[2]; skip_optional_pos: - return_value = bytes_rsplit_impl(self, sep, maxsplit); + return_value = bytes_rsplit_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -896,4 +916,4 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=d706344859f40122 input=a9049054013a1b77]*/ +/*[clinic end generated code: 
output=603a90a82f771b87 input=a9049054013a1b77]*/ diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index 9ef8ce2e35364c..b5c4513aa66c72 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -855,7 +855,7 @@ unicode_rjust(PyObject *self, PyObject *const *args, Py_ssize_t nargs) } PyDoc_STRVAR(unicode_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the words in the string, using sep as the delimiter string.\n" @@ -866,26 +866,37 @@ PyDoc_STRVAR(unicode_split__doc__, " and discard empty strings from the result.\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" -1 (the default value) means no limit.\n" +" keepempty\n" +" Determines whether or not to keep empty strings in the final list.\n" +"\n" +"If maxsplit is given, at most maxsplit splits are done.\n" +"If sep is not specified or is None, any whitespace string is a separator.\n" +"If keepempty is False, empty strings are removed from the result.\n" +"If keepempty is True, empty strings are retained in the result.\n" +"If keepempty is not given or None, the default behaviour is used: it is set to False if\n" +"sep is None, True otherwise."); #define UNICODE_SPLIT_METHODDEF \ {"split", (PyCFunction)(void(*)(void))unicode_split, METH_FASTCALL|METH_KEYWORDS, unicode_split__doc__}, static PyObject * -unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit); +unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty); static PyObject * unicode_split(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, 
"split", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *keepempty = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -898,20 +909,26 @@ unicode_split(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + keepempty = args[2]; skip_optional_pos: - return_value = unicode_split_impl(self, sep, maxsplit); + return_value = unicode_split_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -950,7 +967,7 @@ PyDoc_STRVAR(unicode_rpartition__doc__, {"rpartition", (PyCFunction)unicode_rpartition, METH_O, unicode_rpartition__doc__}, PyDoc_STRVAR(unicode_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, keepempty=None)\n" "--\n" "\n" "Return a list of the words in the string, using sep as the delimiter string.\n" @@ -962,6 +979,8 @@ PyDoc_STRVAR(unicode_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" keepempty\n" +" Determines whether or not to keep empty strings in the final list.\n" "\n" "Splits are done starting at the end of the string and working to the front."); @@ -969,20 +988,22 @@ 
PyDoc_STRVAR(unicode_rsplit__doc__, {"rsplit", (PyCFunction)(void(*)(void))unicode_rsplit, METH_FASTCALL|METH_KEYWORDS, unicode_rsplit__doc__}, static PyObject * -unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit); +unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty); static PyObject * unicode_rsplit(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; + static const char * const _keywords[] = {"sep", "maxsplit", "keepempty", NULL}; static _PyArg_Parser _parser = {NULL, _keywords, "rsplit", 0}; - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *keepempty = Py_None; - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 2, 0, argsbuf); + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 3, 0, argsbuf); if (!args) { goto exit; } @@ -995,20 +1016,26 @@ unicode_rsplit(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject goto skip_optional_pos; } } - { - Py_ssize_t ival = -1; - PyObject *iobj = _PyNumber_Index(args[1]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); + if (args[1]) { + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[1]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + maxsplit = ival; } - if (ival == -1 && PyErr_Occurred()) { - goto exit; + if (!--noptargs) { + goto skip_optional_pos; } - maxsplit = ival; } + keepempty = args[2]; skip_optional_pos: - return_value = unicode_rsplit_impl(self, sep, maxsplit); + return_value = unicode_rsplit_impl(self, sep, maxsplit, keepempty); exit: return return_value; @@ -1327,4 +1354,4 @@ unicode_new(PyTypeObject *type, 
PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=f10cf85d3935b3b7 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=e5700a0f2fa3f723 input=a9049054013a1b77]*/ diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 068047f9874a07..c82db1a466a952 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -53,43 +53,43 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; i = j = 0; - while (maxcount-- > 0) { - while (i < str_len && STRINGLIB_ISSPACE(str[i])) - i++; - if (i == str_len) break; - j = i; i++; - while (i < str_len && !STRINGLIB_ISSPACE(str[i])) - i++; -#ifndef STRINGLIB_MUTABLE - if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No whitespace in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - break; + while (j < str_len && !STRINGLIB_ISSPACE(str[j])) + j++; + + while (j < str_len) { + if (j > i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, i, j); } -#endif - SPLIT_ADD(str, j, i); + j++; + i = j; + while (j < str_len && !STRINGLIB_ISSPACE(str[j])) + j++; } - if (i < str_len) { - /* Only occurs when maxcount was reached */ - /* Skip any remaining whitespace and copy to end of string */ - while (i < str_len && STRINGLIB_ISSPACE(str[i])) - i++; - if (i != str_len) - SPLIT_ADD(str, i, str_len); - } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && i == 0 && j == str_len && str_len > 0 && ! 
prune && STRINGLIB_CHECK_EXACT(str_obj)) { /* No whitespace in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (i < str_len || ! prune) + SPLIT_ADD(str, i, str_len); + FIX_PREALLOC_SIZE(list); return list; @@ -102,37 +102,43 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split_char)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; i = j = 0; - while ((j < str_len) && (maxcount-- > 0)) { - for(; j < str_len; j++) { - /* I found that using memchr makes no difference */ - if (str[j] == ch) { - SPLIT_ADD(str, i, j); - i = j = j + 1; + while (j < str_len && str[j] != ch) + j++; + + while (j < str_len) { + if (j > i || ! prune) { + if (count >= maxcount) break; - } + SPLIT_ADD(str, i, j); } + j++; + i = j; + while (j < str_len && str[j] != ch) + j++; } + #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && i == 0 && j == str_len && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - if (i <= str_len) { + if (i < str_len || ! 
prune) SPLIT_ADD(str, i, str_len); - } + FIX_PREALLOC_SIZE(list); return list; @@ -145,9 +151,9 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, pos, count=0; + Py_ssize_t i, j, offset, count=0; PyObject *list, *sub; if (sep_len == 0) { @@ -155,32 +161,38 @@ STRINGLIB(split)(PyObject* str_obj, return NULL; } else if (sep_len == 1) - return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); + return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount, prune); list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - i = j = 0; - while (maxcount-- > 0) { - pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); - if (pos < 0) - break; - j = i + pos; - SPLIT_ADD(str, i, j); + i = 0; + offset = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + j = (offset >= 0) ? i + offset : -1; + + while (j >= 0) { + if (j > i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, i, j); + } i = j + sep_len; + offset = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + j = (offset >= 0) ? i + offset : -1; } + #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No match in str_obj, so just use it as list[0] */ + if (count == 0 && i == 0 && j < 0 && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - { + if (i < str_len || ! 
prune) SPLIT_ADD(str, i, str_len); - } + FIX_PREALLOC_SIZE(list); return list; @@ -192,43 +204,43 @@ STRINGLIB(split)(PyObject* str_obj, Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - i = j = str_len - 1; - while (maxcount-- > 0) { - while (i >= 0 && STRINGLIB_ISSPACE(str[i])) - i--; - if (i < 0) break; - j = i; i--; - while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) - i--; -#ifndef STRINGLIB_MUTABLE - if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No whitespace in str_obj, so just use it as list[0] */ - Py_INCREF(str_obj); - PyList_SET_ITEM(list, 0, (PyObject *)str_obj); - count++; - break; + i = j = str_len; + while (j > 0 && !STRINGLIB_ISSPACE(str[j-1])) + j--; + + while (j > 0) { + if (j < i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, j, i); } -#endif - SPLIT_ADD(str, i + 1, j + 1); + j--; + i = j; + while (j > 0 && !STRINGLIB_ISSPACE(str[j-1])) + j--; } - if (i >= 0) { - /* Only occurs when maxcount was reached */ - /* Skip any remaining whitespace and copy to beginning of string */ - while (i >= 0 && STRINGLIB_ISSPACE(str[i])) - i--; - if (i >= 0) - SPLIT_ADD(str, 0, i + 1); - } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && j == 0 && i == str_len && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (i > 0 || ! 
prune) + SPLIT_ADD(str, 0, i); + FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) goto onError; @@ -243,36 +255,43 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit_char)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *list; PyObject *sub; + list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - i = j = str_len - 1; - while ((i >= 0) && (maxcount-- > 0)) { - for(; i >= 0; i--) { - if (str[i] == ch) { - SPLIT_ADD(str, i + 1, j + 1); - j = i = i - 1; + i = j = str_len; + while (j > 0 && str[j-1] != ch) + j--; + + while (j > 0) { + if (j < i || ! prune) { + if (count >= maxcount) break; - } + SPLIT_ADD(str, j, i); } + j--; + i = j; + while (j > 0 && str[j-1] != ch) + j--; } + #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && j == 0 && i == str_len && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - if (j >= -1) { - SPLIT_ADD(str, 0, j + 1); - } + if (i > 0 || ! 
prune) + SPLIT_ADD(str, 0, i); + FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) goto onError; @@ -287,9 +306,9 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { - Py_ssize_t j, pos, count=0; + Py_ssize_t i, j, offset, count=0; PyObject *list, *sub; if (sep_len == 0) { @@ -297,31 +316,38 @@ STRINGLIB(rsplit)(PyObject* str_obj, return NULL; } else if (sep_len == 1) - return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); + return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount, prune); list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - j = str_len; - while (maxcount-- > 0) { - pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); - if (pos < 0) - break; - SPLIT_ADD(str, pos + sep_len, j); - j = pos; + i = str_len; + offset = FASTSEARCH(str, i, sep, sep_len, -1, FAST_RSEARCH); + j = (offset >= 0) ? offset + sep_len : -1; + + while (j >= 0) { + if (j < i || ! prune) { + if (count >= maxcount) + break; + SPLIT_ADD(str, j, i); + } + i = j - sep_len; + offset = FASTSEARCH(str, i, sep, sep_len, -1, FAST_RSEARCH); + j = (offset >= 0) ? offset + sep_len : -1; } + #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { - /* No match in str_obj, so just use it as list[0] */ + if (count == 0 && i == str_len && j < 0 && str_len > 0 && ! prune && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - { - SPLIT_ADD(str, 0, j); - } + if (i > 0 || ! 
prune) + SPLIT_ADD(str, 0, i); + FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) goto onError; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 14449bce70839f..9dd876067e6f60 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10397,7 +10397,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends) static PyObject * split(PyObject *self, PyObject *substring, - Py_ssize_t maxcount) + Py_ssize_t maxcount, + int prune) { int kind1, kind2; const void *buf1, *buf2; @@ -10416,22 +10417,22 @@ split(PyObject *self, if (PyUnicode_IS_ASCII(self)) return asciilib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); else return ucs1lib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_2BYTE_KIND: return ucs2lib_split_whitespace( self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_4BYTE_KIND: return ucs4lib_split_whitespace( self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); default: Py_UNREACHABLE(); @@ -10445,12 +10446,14 @@ split(PyObject *self, len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); if (kind1 < kind2 || len1 < len2) { - out = PyList_New(1); - if (out == NULL) - return NULL; - Py_INCREF(self); - PyList_SET_ITEM(out, 0, self); - return out; + if (len1 > 0 ) { + out = PyList_New(1); + if (out == NULL) + return NULL; + Py_INCREF(self); + PyList_SET_ITEM(out, 0, self); + return out; + } } buf1 = PyUnicode_DATA(self); buf2 = PyUnicode_DATA(substring); @@ -10463,19 +10466,15 @@ split(PyObject *self, switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) - out = asciilib_split( - self, buf1, len1, buf2, len2, 
maxcount); + out = asciilib_split(self, buf1, len1, buf2, len2, maxcount, prune); else - out = ucs1lib_split( - self, buf1, len1, buf2, len2, maxcount); + out = ucs1lib_split(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: - out = ucs2lib_split( - self, buf1, len1, buf2, len2, maxcount); + out = ucs2lib_split(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_4BYTE_KIND: - out = ucs4lib_split( - self, buf1, len1, buf2, len2, maxcount); + out = ucs4lib_split(self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -10489,7 +10488,8 @@ split(PyObject *self, static PyObject * rsplit(PyObject *self, PyObject *substring, - Py_ssize_t maxcount) + Py_ssize_t maxcount, + int prune) { int kind1, kind2; const void *buf1, *buf2; @@ -10507,23 +10507,23 @@ rsplit(PyObject *self, case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) return asciilib_rsplit_whitespace( - self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + self, PyUnicode_1BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount, prune ); else return ucs1lib_rsplit_whitespace( - self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + self, PyUnicode_1BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_2BYTE_KIND: return ucs2lib_rsplit_whitespace( - self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + self, PyUnicode_2BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_4BYTE_KIND: return ucs4lib_rsplit_whitespace( - self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + self, PyUnicode_4BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount, prune ); default: Py_UNREACHABLE(); @@ -10536,7 +10536,7 @@ rsplit(PyObject *self, kind2 = PyUnicode_KIND(substring); len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); - if (kind1 < kind2 || len1 < len2) { + if (kind1 < kind2 || (len1 > 0 
&& len1 < len2)) { out = PyList_New(1); if (out == NULL) return NULL; @@ -10555,19 +10555,15 @@ rsplit(PyObject *self, switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) - out = asciilib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + out = asciilib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); else - out = ucs1lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + out = ucs1lib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: - out = ucs2lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + out = ucs2lib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_4BYTE_KIND: - out = ucs4lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + out = ucs4lib_rsplit(self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -13191,10 +13187,14 @@ unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar) PyObject * PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) { + int prune; + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return split(s, sep, maxsplit); + prune = (sep == NULL) ? 1 : 0; /* C callers pass NULL, not Py_None, to split on whitespace; only then are empty strings pruned */ + + return split(s, sep, maxsplit, prune); } /*[clinic input] @@ -13207,18 +13207,41 @@ str.split as unicode_split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + keepempty: object = None + Determines whether or not to keep empty strings in the final list. Return a list of the words in the string, using sep as the delimiter string. + +If maxsplit is given, at most maxsplit splits are done. +If sep is not specified or is None, any whitespace string is a separator. +If keepempty is False, empty strings are removed from the result. +If keepempty is True, empty strings are retained in the result. +If keepempty is not given or None, the default behaviour is used: it is set to False if +sep is None, True otherwise. 
[clinic start generated code]*/ static PyObject * -unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/ +unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty) +/*[clinic end generated code: output=c182ae533ca1ef53 input=2fe5525dbaaf44ee]*/ { + int prune; + + if (keepempty == Py_None) { + if (sep == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_Not(keepempty); + if (prune < 0) + return NULL; + } + if (sep == Py_None) - return split(self, NULL, maxsplit); + return split(self, NULL, maxsplit, prune); if (PyUnicode_Check(sep)) - return split(self, sep, maxsplit); + return split(self, sep, maxsplit, prune); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s", @@ -13375,10 +13398,14 @@ unicode_rpartition(PyObject *self, PyObject *sep) PyObject * PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) { + int prune; + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return rsplit(s, sep, maxsplit); + prune = (sep == NULL) ? 1 : 0; /* C callers pass NULL, not Py_None, to split on whitespace; only then are empty strings pruned */ + + return rsplit(s, sep, maxsplit, prune); } /*[clinic input] @@ -13390,13 +13417,27 @@ Splits are done starting at the end of the string and working to the front. 
[clinic start generated code]*/ static PyObject * -unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/ +unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *keepempty) +/*[clinic end generated code: output=27ba2177eb6cdfcf input=12ad4bf57dd35f15]*/ { + int prune; + + if (keepempty == Py_None) { + if (sep == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_Not(keepempty); + if (prune < 0) + return NULL; + } + if (sep == Py_None) - return rsplit(self, NULL, maxsplit); + return rsplit(self, NULL, maxsplit, prune); if (PyUnicode_Check(sep)) - return rsplit(self, sep, maxsplit); + return rsplit(self, sep, maxsplit, prune); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s",