Thanks to visit codestin.com
Credit goes to github.com

Skip to content

gh-73123: Add a keepempty argument to string, bytes and bytearray split methods #26222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ac9270f
Initial conversion of abarry's patch file.
MarkCBell May 17, 2021
0912ede
Switched logic to use keepempty.
MarkCBell May 17, 2021
f4a101f
Adding in string tests.
MarkCBell May 17, 2021
ade9688
Switched back to PyUnicode_Split taking an int flag.
MarkCBell May 17, 2021
aa4ee9a
Whitespace.
MarkCBell May 17, 2021
76ec07d
Regenerated.
MarkCBell May 17, 2021
38810e5
Added empty string checks to split methods.
MarkCBell May 17, 2021
e0468c1
Cant autobail when string has length 0.
MarkCBell May 17, 2021
5c84475
Better small test.
MarkCBell May 18, 2021
ae0f1ee
More tests.
MarkCBell May 18, 2021
b781cfc
Better small case.
MarkCBell May 18, 2021
6e09909
New splitting algorithms that are compatible with maxsplit.
MarkCBell May 18, 2021
c5ccbeb
Converted rstrip_char and readded optimisations.
MarkCBell May 18, 2021
9e1cd1e
Rewritten split to use new algorithm.
MarkCBell May 18, 2021
4463211
Completed conversion of rsplit.
MarkCBell May 18, 2021
4501026
Fixed use of ! PyObject_IsTrue.
MarkCBell May 18, 2021
d2d6fe6
Fixed bug in FASTSEARCH bounds.
MarkCBell May 18, 2021
cbbf25d
Added keepempty argument to UserString.
MarkCBell May 18, 2021
720a6a6
📜🤖 Added by blurb_it.
blurb-it[bot] May 18, 2021
1959126
News needs to use .
MarkCBell May 19, 2021
cea8af9
Added tests to check interaction with maxsplit.
MarkCBell May 19, 2021
21a2fc4
Also documentation change to PyUnicode_Split and RSplit.
MarkCBell May 19, 2021
e7828fc
Created separate PyUnicode_SplitWithKeepempty and PyUnicode_RSplitWit…
MarkCBell May 19, 2021
dc610bb
Added PyUnicode_SplitWithKeepempty to API/ABI.
MarkCBell May 19, 2021
f95b254
Realised C API interfaces to new keepempty flag is not mandatory.
MarkCBell May 19, 2021
41cbb49
Merge branch 'main' into split-keepempty
MarkCBell Aug 8, 2021
1b1e755
Merge branch 'main' into split-keepempty
MarkCBell Oct 6, 2021
b48ee71
Merge branch 'main' into split-keepempty
MarkCBell Dec 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Lib/collections/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1529,11 +1529,11 @@ def rpartition(self, sep):
def rstrip(self, chars=None):
return self.__class__(self.data.rstrip(chars))

def split(self, sep=None, maxsplit=-1):
return self.data.split(sep, maxsplit)
def split(self, sep=None, maxsplit=-1, keepempty=None):
return self.data.split(sep, maxsplit, keepempty)

def rsplit(self, sep=None, maxsplit=-1):
return self.data.rsplit(sep, maxsplit)
def rsplit(self, sep=None, maxsplit=-1, keepempty=None):
return self.data.rsplit(sep, maxsplit, keepempty)

def splitlines(self, keepends=False):
return self.data.splitlines(keepends)
Expand Down
79 changes: 79 additions & 0 deletions Lib/test/string_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,49 @@ def test_split(self):
self.checkraises(ValueError, 'hello', 'split', '')
self.checkraises(ValueError, 'hello', 'split', '', 0)

# without args, any whitespace is a separator
self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split')

# with sep=None, any whitespace is a separator
self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split', sep=None)

# Without an explicit `sep`, or sep=None, empty strings are pruned from result
self.checkequal([], '', 'split')
self.checkequal([], '', 'split', sep=None)
self.checkequal([], ' ', 'split')
self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split')

# With an explicit, non-None `sep`, empty strings are not pruned from result
self.checkequal([''], '', 'split', sep=',')
self.checkequal(['', '', '', ''], ' ', 'split', sep=' ')
self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ')
self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x')

# keepempty=False to remove empty strings from result
self.checkequal([], '', 'split', keepempty=False)
self.checkequal([], ' ', 'split', keepempty=False)
self.checkequal([], '', 'split', sep=',', keepempty=False)
self.checkequal([], ' ', 'split', sep=' ', keepempty=False)
self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split', sep=' ', keepempty=False)
self.checkequal([' y z '], 'xx y z ', 'split', sep='x', keepempty=False)

# keepempty=True to retain empty strings in result
self.checkequal([''], '', 'split', keepempty=True)
self.checkequal(['', '', '', ''], ' ', 'split', keepempty=True)
self.checkequal([''], '', 'split', sep=',', keepempty=True)
self.checkequal(['', '', '', ''], ' ', 'split', sep=' ', keepempty=True)
self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ', keepempty=True)
self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x', keepempty=True)

# Empty strings kept with keepempty count towards maxsplit
self.checkequal(['', ' y z '], ' y z ', 'split', keepempty=True, maxsplit=1)
self.checkequal(['y', 'z '], ' y z ', 'split', keepempty=False, maxsplit=1)
self.checkequal(['y', 'z '], ' y z ', 'split', maxsplit=1)
self.checkequal(['', ' y z '], ' y z ', 'split', sep=' ', keepempty=True, maxsplit=1)
self.checkequal(['y', 'z '], ' y z ', 'split', sep=' ', keepempty=False, maxsplit=1)
self.checkequal(['', ' y z '], ' y z ', 'split', sep=' ', maxsplit=1)


def test_rsplit(self):
# by a char
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|')
Expand Down Expand Up @@ -538,6 +581,42 @@ def test_rsplit(self):
self.checkraises(ValueError, 'hello', 'rsplit', '')
self.checkraises(ValueError, 'hello', 'rsplit', '', 0)

# Without an explicit `sep`, or sep=None, empty strings are pruned from result
self.checkequal([], '', 'rsplit')
self.checkequal([], '', 'rsplit', sep=None)
self.checkequal([], ' ', 'rsplit')
self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit')

# With an explicit, non-None `sep`, empty strings are not pruned from result
self.checkequal([''], '', 'rsplit', sep=',')
self.checkequal(['', '', '', ''], ' ', 'rsplit', sep=' ')
self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ')
self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x')

# keepempty=False to remove empty strings from result
self.checkequal([], '', 'rsplit', keepempty=False)
self.checkequal([], ' ', 'rsplit', keepempty=False)
self.checkequal([], '', 'rsplit', sep=',', keepempty=False)
self.checkequal([], ' ', 'rsplit', sep=' ', keepempty=False)
self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit', sep=' ', keepempty=False)
self.checkequal([' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=False)

# keepempty=True to retain empty strings in result
self.checkequal([''], '', 'rsplit', keepempty=True)
self.checkequal(['', '', '', ''], ' ', 'rsplit', keepempty=True)
self.checkequal([''], '', 'rsplit', sep=',', keepempty=True)
self.checkequal(['', '', '', ''], ' ', 'rsplit', sep=' ', keepempty=True)
self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ', keepempty=True)
self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=True)

# Empty strings kept with keepempty count towards maxsplit
self.checkequal([' y z ', ''], ' y z ', 'rsplit', keepempty=True, maxsplit=1)
self.checkequal([' y', 'z'], ' y z ', 'rsplit', keepempty=False, maxsplit=1)
self.checkequal([' y', 'z'], ' y z ', 'rsplit', maxsplit=1)
self.checkequal([' y z ', ''], ' y z ', 'rsplit', sep=' ', keepempty=True, maxsplit=1)
self.checkequal([' y', 'z'], ' y z ', 'rsplit', sep=' ', keepempty=False, maxsplit=1)
self.checkequal([' y z ', ''], ' y z ', 'rsplit', sep=' ', maxsplit=1)

def test_replace(self):
EQ = self.checkequal

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Add the ``keepempty`` argument to ``str.split``, ``bytes.split``,
``bytearray.split`` and ``UserString.split``, and to the corresponding
``rsplit`` methods. Patch by Mark Bell.
46 changes: 34 additions & 12 deletions Objects/bytearrayobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1404,34 +1404,46 @@ bytearray.split
maxsplit: Py_ssize_t = -1
Maximum number of splits to do.
-1 (the default value) means no limit.
keepempty: object = None
Determines whether or not to keep empty strings in the final list.

Return a list of the sections in the bytearray, using sep as the delimiter.
[clinic start generated code]*/

static PyObject *
bytearray_split_impl(PyByteArrayObject *self, PyObject *sep,
Py_ssize_t maxsplit)
/*[clinic end generated code: output=833e2cf385d9a04d input=24f82669f41bf523]*/
Py_ssize_t maxsplit, PyObject *keepempty)
/*[clinic end generated code: output=28286c156d864181 input=908de7e1dd1fd8ca]*/
{
Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
const char *s = PyByteArray_AS_STRING(self), *sub;
PyObject *list;
Py_buffer vsub;
int prune;

if (keepempty == Py_None) {
if (sep == Py_None)
prune = 1;
else
prune = 0;
} else {
prune = PyObject_Not(keepempty);
if (prune < 0)
return NULL;
}

if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;

if (sep == Py_None)
return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune);

if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
return NULL;
sub = vsub.buf;
n = vsub.len;

list = stringlib_split(
(PyObject*) self, s, len, sub, n, maxsplit
);
list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune);
PyBuffer_Release(&vsub);
return list;
}
Expand Down Expand Up @@ -1521,28 +1533,38 @@ Splitting is done starting at the end of the bytearray and working to the front.

static PyObject *
bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep,
Py_ssize_t maxsplit)
/*[clinic end generated code: output=a55e0b5a03cb6190 input=a68286e4dd692ffe]*/
Py_ssize_t maxsplit, PyObject *keepempty)
/*[clinic end generated code: output=d8c2e7552a91a174 input=a68286e4dd692ffe]*/
{
Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
const char *s = PyByteArray_AS_STRING(self), *sub;
PyObject *list;
Py_buffer vsub;
int prune;

if (keepempty == Py_None) {
if (sep == Py_None)
prune = 1;
else
prune = 0;
} else {
prune = PyObject_Not(keepempty);
if (prune < 0)
return NULL;
}

if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;

if (sep == Py_None)
return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune);

if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
return NULL;
sub = vsub.buf;
n = vsub.len;

list = stringlib_rsplit(
(PyObject*) self, s, len, sub, n, maxsplit
);
list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune);
PyBuffer_Release(&vsub);
return list;
}
Expand Down
44 changes: 36 additions & 8 deletions Objects/bytesobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1723,29 +1723,44 @@ bytes.split
maxsplit: Py_ssize_t = -1
Maximum number of splits to do.
-1 (the default value) means no limit.
keepempty: object = None
Determines whether or not to keep empty strings in the final list.

Return a list of the sections in the bytes, using sep as the delimiter.
[clinic start generated code]*/

static PyObject *
bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/
bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit,
PyObject *keepempty)
/*[clinic end generated code: output=e1b678240fbff2e0 input=e58ccb5eb2569eb4]*/
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n;
const char *s = PyBytes_AS_STRING(self), *sub;
Py_buffer vsub;
PyObject *list;
int prune;

if (keepempty == Py_None) {
if (sep == Py_None)
prune = 1;
else
prune = 0;
} else {
prune = PyObject_Not(keepempty);
if (prune < 0)
return NULL;
}

if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (sep == Py_None)
return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune);
if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
return NULL;
sub = vsub.buf;
n = vsub.len;

list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit);
list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune);
PyBuffer_Release(&vsub);
return list;
}
Expand Down Expand Up @@ -1813,24 +1828,37 @@ Splitting is done starting at the end of the bytes and working to the front.
[clinic start generated code]*/

static PyObject *
bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/
bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit,
PyObject *keepempty)
/*[clinic end generated code: output=0e304d20c12f7ac0 input=0f86c9f28f7d7b7b]*/
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n;
const char *s = PyBytes_AS_STRING(self), *sub;
Py_buffer vsub;
PyObject *list;
int prune;

if (keepempty == Py_None) {
if (sep == Py_None)
prune = 1;
else
prune = 0;
} else {
prune = PyObject_Not(keepempty);
if (prune < 0)
return NULL;
}

if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (sep == Py_None)
return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune);
if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
return NULL;
sub = vsub.buf;
n = vsub.len;

list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit);
list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune);
PyBuffer_Release(&vsub);
return list;
}
Expand Down
Loading