Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

gh-127971: fix off-by-one read beyond the end of a string during search #132574

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions Lib/test/string_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,40 @@ def test_replace(self):
self.checkraises(TypeError, 'hello', 'replace', 42, 'h')
self.checkraises(TypeError, 'hello', 'replace', 'h', 42)

def test_replacement_on_buffer_boundary(self):

# gh-127971: Check we don't read past the end of the buffer when a
# potential match misses on the last character. Note this will likely
# not cause a failure unless ASAN is enabled, and even that may be
# dependent on implementation details subject to change.
any_3_nonblank_codepoints = '!!!'
seven_codepoints = any_3_nonblank_codepoints + ' ' + any_3_nonblank_codepoints
a = (' ' * 243) + seven_codepoints + (' ' * 7)
b = ' ' * 6 + chr(256)
a.replace(seven_codepoints, b)

def test_adaptive_find_on_buffer_boundary(self):

# gh-127971: This exercises the adaptive search algorithm to trigger a
# corner-case where it might examine the character *after* the last
# position that could be the start of the pattern.
#
# Unfortunately there is nothing to *test* to confirm whether the
# character is read or not, nor in fact does it matter for correctness
# with the implementation at time of writing: the adaptive algorithm is
# only triggered if the input is over a certain size and with a pattern
# with more than one character, so with the current implementation even
# though the final character read is not necessary or significant, it
# won't cause a fault.
#
# This test at least intentionally exercises this path, and might
# possibly catch a regression if the implementation changes and breaks
# those assumptions.
prefix = ' ' * (1024 * 4)
haystack = prefix + 'x'
needle = prefix + 'y'
self.assertEqual(haystack.find(needle), -1)

def test_replace_uses_two_way_maxcount(self):
# Test that maxcount works in _two_way_count in fastsearch.h
A, B = "A"*1000, "B"*1000
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix off-by-one read beyond the end of a string in string search.
19 changes: 15 additions & 4 deletions Objects/stringlib/fastsearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
continue;
}
/* miss: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
}
else {
Expand All @@ -604,7 +604,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
}
else {
/* skip: check if next character is part of pattern */
if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
}
}
Expand Down Expand Up @@ -667,7 +667,16 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
return res + count;
}
}
/* miss: check if next character is part of pattern */

/* Miss: check if next character is part of pattern.
Note that in contrast to default_find and default_rfind we do
*not* need to prevent the algorithm from reading one character
beyond the last character in the input that the pattern could
start in. I.e. if i == w it is safe to read ss[i + 1] since the
input and pattern length requirements on when this variant
algorithm will be called ensure it will always be a valid part
of the input. In that case it doesn't matter what the character
read is since the loop will terminate regardless. */
if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
}
Expand All @@ -676,7 +685,9 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n,
}
}
else {
/* skip: check if next character is part of pattern */
/* Skip: check if next character is part of pattern.
See comment above re safety of accessing ss[i+1] when i == w.
*/
if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
}
Expand Down
Loading