From 592a769bd74a95775646d53f940956bf813b1df8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 19 Nov 2024 10:47:01 +0200 Subject: [PATCH 1/2] gh-124130: Fix a bug in matching regular expression \B in empty string --- Doc/library/re.rst | 7 ++----- Lib/test/test_re.py | 13 +++++-------- .../2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst | 2 ++ Modules/_sre/sre_lib.h | 12 ------------ 4 files changed, 9 insertions(+), 25 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 9db6f1da3be4db..55b3e73e317b78 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -572,11 +572,8 @@ character ``'$'``. Word boundaries are determined by the current locale if the :py:const:`~re.LOCALE` flag is used. - .. note:: - - Note that ``\B`` does not match an empty string, which differs from - RE implementations in other programming languages such as Perl. - This behavior is kept for compatibility reasons. + .. versionchanged:: 3.14 + ``\B`` now matches the whole empty string. .. index:: single: \d; in regular expressions diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 0d3599be87f228..5538de60b2a03a 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -978,18 +978,15 @@ def test_word_boundaries(self): self.assertIsNone(re.fullmatch(br".+\B", b"abc", re.LOCALE)) self.assertIsNone(re.fullmatch(r".+\B", "ьюя")) self.assertTrue(re.fullmatch(r".+\B", "ьюя", re.ASCII)) - # However, an empty string contains no word boundaries, and also no - # non-boundaries. + # However, an empty string contains no word boundaries. self.assertIsNone(re.search(r"\b", "")) self.assertIsNone(re.search(r"\b", "", re.ASCII)) self.assertIsNone(re.search(br"\b", b"")) self.assertIsNone(re.search(br"\b", b"", re.LOCALE)) - # This one is questionable and different from the perlre behaviour, - # but describes current behavior. 
- self.assertIsNone(re.search(r"\B", "")) - self.assertIsNone(re.search(r"\B", "", re.ASCII)) - self.assertIsNone(re.search(br"\B", b"")) - self.assertIsNone(re.search(br"\B", b"", re.LOCALE)) + self.assertTrue(re.search(r"\B", "")) + self.assertTrue(re.search(r"\B", "", re.ASCII)) + self.assertTrue(re.search(br"\B", b"")) + self.assertTrue(re.search(br"\B", b"", re.LOCALE)) # A single word-character string has two boundaries, but no # non-boundary gaps. self.assertEqual(len(re.findall(r"\b", "a")), 2) diff --git a/Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst b/Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst new file mode 100644 index 00000000000000..c5cd9648000832 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst @@ -0,0 +1,2 @@ +Fix a bug in matching regular expression ``\B`` in empty string. Now it +is always the opposite of ``\b``. diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index af4bfc56083bcb..df377905bfae0d 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -42,8 +42,6 @@ SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at) return ((void*) ptr == state->end); case SRE_AT_BOUNDARY: - if (state->beginning == state->end) - return 0; thatp = ((void*) ptr > state->beginning) ? SRE_IS_WORD((int) ptr[-1]) : 0; thisp = ((void*) ptr < state->end) ? @@ -51,8 +49,6 @@ SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at) return thisp != thatp; case SRE_AT_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; thatp = ((void*) ptr > state->beginning) ? SRE_IS_WORD((int) ptr[-1]) : 0; thisp = ((void*) ptr < state->end) ? @@ -60,8 +56,6 @@ SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at) return thisp == thatp; case SRE_AT_LOC_BOUNDARY: - if (state->beginning == state->end) - return 0; thatp = ((void*) ptr > state->beginning) ? SRE_LOC_IS_WORD((int) ptr[-1]) : 0; thisp = ((void*) ptr < state->end) ? 
@@ -69,8 +63,6 @@ SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at) return thisp != thatp; case SRE_AT_LOC_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; thatp = ((void*) ptr > state->beginning) ? SRE_LOC_IS_WORD((int) ptr[-1]) : 0; thisp = ((void*) ptr < state->end) ? @@ -78,8 +70,6 @@ SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at) return thisp == thatp; case SRE_AT_UNI_BOUNDARY: - if (state->beginning == state->end) - return 0; thatp = ((void*) ptr > state->beginning) ? SRE_UNI_IS_WORD((int) ptr[-1]) : 0; thisp = ((void*) ptr < state->end) ? @@ -87,8 +77,6 @@ SRE(at)(SRE_STATE* state, const SRE_CHAR* ptr, SRE_CODE at) return thisp != thatp; case SRE_AT_UNI_NON_BOUNDARY: - if (state->beginning == state->end) - return 0; thatp = ((void*) ptr > state->beginning) ? SRE_UNI_IS_WORD((int) ptr[-1]) : 0; thisp = ((void*) ptr < state->end) ? From bbace330a580088fe797777c60ae3f064088b1e7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 22 Nov 2024 21:12:00 +0200 Subject: [PATCH 2/2] Update docs. --- Doc/library/re.rst | 4 ++-- Doc/whatsnew/3.14.rst | 4 ++++ .../Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 55b3e73e317b78..29387a429b844c 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -572,8 +572,8 @@ character ``'$'``. Word boundaries are determined by the current locale if the :py:const:`~re.LOCALE` flag is used. - .. versionchanged:: 3.14 - ``\B`` now matches the whole empty string. + .. versionchanged:: next + ``\B`` now matches empty input string. .. index:: single: \d; in regular expressions diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 958efbe73c1c27..be0c8c2e5eef1a 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -202,6 +202,10 @@ Other language changes making it a :term:`generic type`. (Contributed by Brian Schubert in :gh:`126012`.) 
+* ``\B`` in :mod:`regular expression <re>` now matches an empty input string. + Now it is always the opposite of ``\b``. + (Contributed by Serhiy Storchaka in :gh:`124130`.) + New modules =========== diff --git a/Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst b/Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst index c5cd9648000832..a1d4fc8ff4c22f 100644 --- a/Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst +++ b/Misc/NEWS.d/next/Library/2024-11-19-10-46-57.gh-issue-124130.OZ_vR5.rst @@ -1,2 +1,4 @@ -Fix a bug in matching regular expression ``\B`` in empty string. Now it -is always the opposite of ``\b``. +Fix a bug in matching regular expression ``\B`` in an empty input string. +Now it is always the opposite of ``\b``. +To get the old behavior, use ``(?!\A\Z)\B``. +To get the new behavior in older Python versions, use ``(?!\b)``.