python · duaneg · Apr 15, 2025 · Apr 16, 2025 · Apr 16, 2025 · Jun 21, 2025
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
@@ -767,6 +767,40 @@ def test_replace(self):
         self.checkraises(TypeError, 'hello', 'replace', 42, 'h')
         self.checkraises(TypeError, 'hello', 'replace', 'h', 42)
 
+    def test_replacement_on_buffer_boundary(self):
+
+        # gh-127971: Check we don't read past the end of the buffer when a
+        # potential match misses on the last character. Note this will likely
+        # not cause a failure unless ASAN is enabled, and even that may be
+        # dependent on implementation details subject to change.
+        any_3_nonblank_codepoints = '!!!'
+        seven_codepoints = any_3_nonblank_codepoints + ' ' + any_3_nonblank_codepoints
+        a = (' ' * 243) + seven_codepoints + (' ' * 7)
+        b = ' ' * 6 + chr(256)
+        a.replace(seven_codepoints, b)
+
+    def test_adaptive_find_on_buffer_boundary(self):
+
+        # gh-127971: This exercises the adaptive search algorithm to trigger a
+        # corner-case where it might examine the character *after* the last
+        # position that could be the start of the pattern.
+        #
+        # Unfortunately there is nothing to *test* to confirm whether the
+        # character is read or not, nor in fact does it matter for correctness
+        # with the implementation at time of writing: the adaptive algorithm is
+        # only triggered if the input is over a certain size and with a pattern
+        # with more than one character, so with the current implementation even
+        # though the final character read is not necessary or significant, it
+        # won't cause a fault.
+        #
+        # This test at least intentionally exercises this path, and might
+        # possibly catch a regression if the implementation changes and breaks
+        # those assumptions.
+        prefix = ' ' * (1024 * 4)
+        haystack = prefix + 'x'
+        needle = prefix + 'y'
+        self.assertEqual(haystack.find(needle), -1)
+
     def test_replace_uses_two_way_maxcount(self):
         # Test that maxcount works in _two_way_count in fastsearch.h
         A, B = "A"*1000, "B"*1000

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst
@@ -0,0 +1 @@
+Fix off-by-one read beyond the end of a string in string search.
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
@@ -595,7 +595,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
                 continue;
             }
             /* miss: check if next character is part of pattern */
-            if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
+            if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) {
                 i = i + m;
             }
             else {
@@ -604,7 +604,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
         }
         else {
             /* skip: check if next character is part of pattern */
-            if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
+            if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) {
                 i = i + m;
             }
         }
@@ -667,7 +667,16 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
                     return res + count;
                 }
             }
-            /* miss: check if next character is part of pattern */
+
+            /* Miss: check if next character is part of pattern.
+               Note that, in contrast to default_find and default_rfind, we do
+               *not* need to prevent the algorithm from reading one character
+               beyond the last position in the input at which the pattern could
+               start. That is, if i == w it is safe to read ss[i + 1], since the
+               input and pattern length requirements for calling this variant of
+               the algorithm ensure it is always a valid part of the input. In
+               that case the value of the character read does not matter, since
+               the loop will terminate regardless. */
             if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
                 i = i + m;
             }
@@ -676,7 +685,9 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
             }
         }
         else {
-            /* skip: check if next character is part of pattern */
+            /* Skip: check if next character is part of pattern. See the
+               comment above on why accessing ss[i+1] is safe when i == w.
+             */
             if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
                 i = i + m;
             }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Fix off-by-one read beyond the end of a string in string search.