python · MarkCBell · May 17, 2021 · May 17, 2021 · May 17, 2021 · May 17, 2021
@@ -1529,11 +1529,11 @@ def rpartition(self, sep):
     def rstrip(self, chars=None):
         return self.__class__(self.data.rstrip(chars))
 
-    def split(self, sep=None, maxsplit=-1):
-        return self.data.split(sep, maxsplit)
+    def split(self, sep=None, maxsplit=-1, keepempty=None):
+        return self.data.split(sep, maxsplit, keepempty)
 
-    def rsplit(self, sep=None, maxsplit=-1):
-        return self.data.rsplit(sep, maxsplit)
+    def rsplit(self, sep=None, maxsplit=-1, keepempty=None):
+        return self.data.rsplit(sep, maxsplit, keepempty)
 
     def splitlines(self, keepends=False):
         return self.data.splitlines(keepends)

diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
@@ -468,6 +468,49 @@ def test_split(self):
         self.checkraises(ValueError, 'hello', 'split', '')
         self.checkraises(ValueError, 'hello', 'split', '', 0)
 
+        # without args, any whitespace is a separator
+        self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split')
+
+        # with sep=None, any whitespace is a separator
+        self.checkequal(['a', 'b', 'c', 'd', 'e'], 'a b\tc\nd \n e ', 'split', sep=None)
+
+        # Without an explicit `sep`, or sep=None, empty strings are pruned from result
+        self.checkequal([], '', 'split')
+        self.checkequal([], '', 'split', sep=None)
+        self.checkequal([], '   ', 'split')
+        self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split')
+
+        # With an explicit, non-None `sep`, empty strings are not pruned from result
+        self.checkequal([''], '', 'split', sep=',')
+        self.checkequal(['', '', '', ''], '   ', 'split', sep=' ')
+        self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ')
+        self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x')
+
+        # keepempty=False to remove empty strings from result
+        self.checkequal([], '', 'split', keepempty=False)
+        self.checkequal([], '   ', 'split', keepempty=False)
+        self.checkequal([], '', 'split', sep=',', keepempty=False)
+        self.checkequal([], '   ', 'split', sep=' ', keepempty=False)
+        self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'split', sep=' ', keepempty=False)
+        self.checkequal([' y z '], 'xx y z ', 'split', sep='x', keepempty=False)
+
+        # keepempty=True to retain empty strings in result
+        self.checkequal([''], '', 'split', keepempty=True)
+        self.checkequal(['', '', '', ''], '   ', 'split', keepempty=True)
+        self.checkequal([''], '', 'split', sep=',', keepempty=True)
+        self.checkequal(['', '', '', ''], '   ', 'split', sep=' ', keepempty=True)
+        self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'split', sep=' ', keepempty=True)
+        self.checkequal(['', '', ' y z '], 'xx y z ', 'split', sep='x', keepempty=True)
+
+        # Empty strings kept with keepempty count towards maxsplit
+        self.checkequal(['', ' y z  '], '  y z  ', 'split', keepempty=True, maxsplit=1)
+        self.checkequal(['y', 'z  '], '  y z  ', 'split', keepempty=False, maxsplit=1)
+        self.checkequal(['y', 'z  '], '  y z  ', 'split', maxsplit=1)
+        self.checkequal(['', ' y z  '], '  y z  ', 'split', sep=' ', keepempty=True, maxsplit=1)
+        self.checkequal(['y', 'z  '], '  y z  ', 'split', sep=' ', keepempty=False, maxsplit=1)
+        self.checkequal(['', ' y z  '], '  y z  ', 'split', sep=' ', maxsplit=1)
+
+
     def test_rsplit(self):
         # by a char
         self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|')
@@ -538,6 +581,42 @@ def test_rsplit(self):
         self.checkraises(ValueError, 'hello', 'rsplit', '')
         self.checkraises(ValueError, 'hello', 'rsplit', '', 0)
 
+        # Without an explicit `sep`, or sep=None, empty strings are pruned from result
+        self.checkequal([], '', 'rsplit')
+        self.checkequal([], '', 'rsplit', sep=None)
+        self.checkequal([], '   ', 'rsplit')
+        self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit')
+
+        # With an explicit, non-None `sep`, empty strings are not pruned from result
+        self.checkequal([''], '', 'rsplit', sep=',')
+        self.checkequal(['', '', '', ''], '   ', 'rsplit', sep=' ')
+        self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ')
+        self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x')
+
+        # keepempty=False to remove empty strings from result
+        self.checkequal([], '', 'rsplit', keepempty=False)
+        self.checkequal([], '   ', 'rsplit', keepempty=False)
+        self.checkequal([], '', 'rsplit', sep=',', keepempty=False)
+        self.checkequal([], '   ', 'rsplit', sep=' ', keepempty=False)
+        self.checkequal(['xx', 'y', 'z'], 'xx y z ', 'rsplit', sep=' ', keepempty=False)
+        self.checkequal([' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=False)
+
+        # keepempty=True to retain empty strings in result
+        self.checkequal([''], '', 'rsplit', keepempty=True)
+        self.checkequal(['', '', '', ''], '   ', 'rsplit', keepempty=True)
+        self.checkequal([''], '', 'rsplit', sep=',', keepempty=True)
+        self.checkequal(['', '', '', ''], '   ', 'rsplit', sep=' ', keepempty=True)
+        self.checkequal(['xx', 'y', 'z', ''], 'xx y z ', 'rsplit', sep=' ', keepempty=True)
+        self.checkequal(['', '', ' y z '], 'xx y z ', 'rsplit', sep='x', keepempty=True)
+
+        # Empty strings kept with keepempty count towards maxsplit
+        self.checkequal(['  y z ', ''], '  y z  ', 'rsplit', keepempty=True, maxsplit=1)
+        self.checkequal(['  y', 'z'], '  y z  ', 'rsplit', keepempty=False, maxsplit=1)
+        self.checkequal(['  y', 'z'], '  y z  ', 'rsplit', maxsplit=1)
+        self.checkequal(['  y z ', ''], '  y z  ', 'rsplit', sep=' ', keepempty=True, maxsplit=1)
+        self.checkequal(['  y', 'z'], '  y z  ', 'rsplit', sep=' ', keepempty=False, maxsplit=1)
+        self.checkequal(['  y z ', ''], '  y z  ', 'rsplit', sep=' ', maxsplit=1)
+
     def test_replace(self):
         EQ = self.checkequal
 

diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst b/Misc/NEWS.d/next/Core and Builtins/2021-05-18-22-51-43.bpo-28937.eW9d_I.rst
@@ -0,0 +1,2 @@
+Add the ``keepempty`` argument to the ``split`` and ``rsplit`` methods of
+``str``, ``bytes``, ``bytearray`` and ``UserString``. Patch by Mark Bell.
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
@@ -1404,34 +1404,46 @@ bytearray.split
     maxsplit: Py_ssize_t = -1
         Maximum number of splits to do.
         -1 (the default value) means no limit.
+    keepempty: object = None
+        Determines whether or not to keep empty strings in the final list.
 
 Return a list of the sections in the bytearray, using sep as the delimiter.
 [clinic start generated code]*/
 
 static PyObject *
 bytearray_split_impl(PyByteArrayObject *self, PyObject *sep,
-                     Py_ssize_t maxsplit)
-/*[clinic end generated code: output=833e2cf385d9a04d input=24f82669f41bf523]*/
+                     Py_ssize_t maxsplit, PyObject *keepempty)
+/*[clinic end generated code: output=28286c156d864181 input=908de7e1dd1fd8ca]*/
 {
     Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
     const char *s = PyByteArray_AS_STRING(self), *sub;
     PyObject *list;
     Py_buffer vsub;
+    int prune;
+
+    if (keepempty == Py_None) {
+        if (sep == Py_None)
+            prune = 1;
+        else
+            prune = 0;
+    } else {
+        prune = PyObject_Not(keepempty);
+        if (prune < 0)
+            return NULL;
+    }
 
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
 
     if (sep == Py_None)
-        return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
+        return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune);
 
     if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
         return NULL;
     sub = vsub.buf;
     n = vsub.len;
 
-    list = stringlib_split(
-        (PyObject*) self, s, len, sub, n, maxsplit
-        );
+    list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune);
     PyBuffer_Release(&vsub);
     return list;
 }
@@ -1521,28 +1533,38 @@ Splitting is done starting at the end of the bytearray and working to the front.
 
 static PyObject *
 bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep,
-                      Py_ssize_t maxsplit)
-/*[clinic end generated code: output=a55e0b5a03cb6190 input=a68286e4dd692ffe]*/
+                      Py_ssize_t maxsplit, PyObject *keepempty)
+/*[clinic end generated code: output=d8c2e7552a91a174 input=a68286e4dd692ffe]*/
 {
     Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
     const char *s = PyByteArray_AS_STRING(self), *sub;
     PyObject *list;
     Py_buffer vsub;
+    int prune;
+
+    if (keepempty == Py_None) {
+        if (sep == Py_None)
+            prune = 1;
+        else
+            prune = 0;
+    } else {
+        prune = PyObject_Not(keepempty);
+        if (prune < 0)
+            return NULL;
+    }
 
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
 
     if (sep == Py_None)
-        return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
+        return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune);
 
     if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
         return NULL;
     sub = vsub.buf;
     n = vsub.len;
 
-    list = stringlib_rsplit(
-        (PyObject*) self, s, len, sub, n, maxsplit
-        );
+    list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune);
     PyBuffer_Release(&vsub);
     return list;
 }

diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
@@ -1723,29 +1723,44 @@ bytes.split
     maxsplit: Py_ssize_t = -1
         Maximum number of splits to do.
         -1 (the default value) means no limit.
+    keepempty: object = None
+        Determines whether or not to keep empty strings in the final list.
 
 Return a list of the sections in the bytes, using sep as the delimiter.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
-/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/
+bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit,
+                 PyObject *keepempty)
+/*[clinic end generated code: output=e1b678240fbff2e0 input=e58ccb5eb2569eb4]*/
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n;
     const char *s = PyBytes_AS_STRING(self), *sub;
     Py_buffer vsub;
     PyObject *list;
+    int prune;
+
+    if (keepempty == Py_None) {
+        if (sep == Py_None)
+            prune = 1;
+        else
+            prune = 0;
+    } else {
+        prune = PyObject_Not(keepempty);
+        if (prune < 0)
+            return NULL;
+    }
 
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
     if (sep == Py_None)
-        return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
+        return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, prune);
     if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
         return NULL;
     sub = vsub.buf;
     n = vsub.len;
 
-    list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit);
+    list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, prune);
     PyBuffer_Release(&vsub);
     return list;
 }
@@ -1813,24 +1828,37 @@ Splitting is done starting at the end of the bytes and working to the front.
 [clinic start generated code]*/
 
 static PyObject *
-bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit)
-/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/
+bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit,
+                  PyObject *keepempty)
+/*[clinic end generated code: output=0e304d20c12f7ac0 input=0f86c9f28f7d7b7b]*/
 {
     Py_ssize_t len = PyBytes_GET_SIZE(self), n;
     const char *s = PyBytes_AS_STRING(self), *sub;
     Py_buffer vsub;
     PyObject *list;
+    int prune;
+
+    if (keepempty == Py_None) {
+        if (sep == Py_None)
+            prune = 1;
+        else
+            prune = 0;
+    } else {
+        prune = PyObject_Not(keepempty);
+        if (prune < 0)
+            return NULL;
+    }
 
     if (maxsplit < 0)
         maxsplit = PY_SSIZE_T_MAX;
     if (sep == Py_None)
-        return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
+        return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, prune);
     if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0)
         return NULL;
     sub = vsub.buf;
     n = vsub.len;
 
-    list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit);
+    list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, prune);
     PyBuffer_Release(&vsub);
     return list;
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Add the ``keepempty`` argument to the ``split`` and ``rsplit`` methods of
		``str``, ``bytes``, ``bytearray`` and ``UserString``. Patch by Mark Bell.