diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 31d71031bca12c..6a4fe494ea7018 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -2323,12 +2323,6 @@ expression support in the :mod:`re` module). +-----------------------+-----------------------------+ | ``\f`` or ``\x0c`` | Form Feed | +-----------------------+-----------------------------+ - | ``\x1c`` | File Separator | - +-----------------------+-----------------------------+ - | ``\x1d`` | Group Separator | - +-----------------------+-----------------------------+ - | ``\x1e`` | Record Separator | - +-----------------------+-----------------------------+ | ``\x85`` | Next Line (C1 Control Code) | +-----------------------+-----------------------------+ | ``\u2028`` | Line Separator | diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst new file mode 100644 index 00000000000000..96a504e504ca97 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst @@ -0,0 +1,4 @@ +Remove Unicode characters that have the bidirectional B property but are not +mandatory line breakers (U+001C, U+001D and U+001E) from the list of +line-breaking characters. ``str.splitlines()`` will not break on these +characters any more. diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h index 5be810dd67426a..93de16bc42b925 100644 --- a/Objects/unicodetype_db.h +++ b/Objects/unicodetype_db.h @@ -2971,7 +2971,7 @@ static const unsigned short index1[] = { static const unsigned short index2[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 2, 2, 2, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, + 0, 0, 0, 0, 1, 1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 4, 4, 4, 4, 4, 4, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 4, 4, 4, 5, 17, 5, 18, 18, 18, 18, 18, 18, 18, 18, 18, @@ -6711,8 +6711,7 @@ int _PyUnicode_IsWhitespace(const Py_UCS4 ch) } /* Returns 1 for Unicode characters having the line break - * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional - * type 'B', 0 otherwise. + * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise. */ int _PyUnicode_IsLinebreak(const Py_UCS4 ch) { @@ -6721,9 +6720,6 @@ int _PyUnicode_IsLinebreak(const Py_UCS4 ch) case 0x000B: case 0x000C: case 0x000D: - case 0x001C: - case 0x001D: - case 0x001E: case 0x0085: case 0x2028: case 0x2029: diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index d4cca68c3e3e71..33d74fcc745b2f 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -437,7 +437,7 @@ def makeunicodetype(unicode, trace): flags |= ALPHA_MASK if "Lowercase" in properties: flags |= LOWER_MASK - if 'Line_Break' in properties or bidirectional == "B": + if 'Line_Break' in properties: flags |= LINEBREAK_MASK linebreaks.append(char) if category == "Zs" or bidirectional in ("WS", "B", "S"): @@ -603,8 +603,7 @@ def makeunicodetype(unicode, trace): # Generate code for _PyUnicode_IsLinebreak() fprint("/* Returns 1 for Unicode characters having the line break") - fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional") - fprint(" * type 'B', 0 otherwise.") + fprint(" * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise.") fprint(" */") fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)') fprint('{')