Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 797485e

Browse files
committed
Issue #25318: Avoid sprintf() in backslashreplace()
Rewrite backslashreplace() to be closer to PyCodec_BackslashReplaceErrors(). Also add unit tests for non-BMP characters.
1 parent b13b97d commit 797485e

2 files changed

Lines changed: 22 additions & 9 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3155,7 +3155,8 @@ def test_encode_error(self):
31553155
('[\x80\xff\u20ac]', 'ignore', b'[]'),
31563156
('[\x80\xff\u20ac]', 'replace', b'[???]'),
31573157
('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
3158-
('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
3158+
('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
3159+
b'[\\x80\\xff\\u20ac\\U000abcde]'),
31593160
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
31603161
):
31613162
with self.subTest(data=data, error_handler=error_handler,
@@ -3197,7 +3198,8 @@ def test_encode_errors(self):
31973198
for data, error_handler, expected in (
31983199
('[\u20ac\udc80]', 'ignore', b'[]'),
31993200
('[\u20ac\udc80]', 'replace', b'[??]'),
3200-
('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
3201+
('[\u20ac\U000abcde]', 'backslashreplace',
3202+
b'[\\u20ac\\U000abcde]'),
32013203
('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
32023204
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
32033205
):

Objects/unicodeobject.c

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -610,14 +610,25 @@ backslashreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
610610
/* generate replacement */
611611
for (i = collstart; i < collend; ++i) {
612612
ch = PyUnicode_READ(kind, data, i);
613-
if (ch < 0x100)
614-
str += sprintf(str, "\\x%02x", ch);
615-
else if (ch < 0x10000)
616-
str += sprintf(str, "\\u%04x", ch);
617-
else {
618-
assert(ch <= MAX_UNICODE);
619-
str += sprintf(str, "\\U%08x", ch);
613+
*str++ = '\\';
614+
if (ch >= 0x00010000) {
615+
*str++ = 'U';
616+
*str++ = Py_hexdigits[(ch>>28)&0xf];
617+
*str++ = Py_hexdigits[(ch>>24)&0xf];
618+
*str++ = Py_hexdigits[(ch>>20)&0xf];
619+
*str++ = Py_hexdigits[(ch>>16)&0xf];
620+
*str++ = Py_hexdigits[(ch>>12)&0xf];
621+
*str++ = Py_hexdigits[(ch>>8)&0xf];
622+
}
623+
else if (ch >= 0x100) {
624+
*str++ = 'u';
625+
*str++ = Py_hexdigits[(ch>>12)&0xf];
626+
*str++ = Py_hexdigits[(ch>>8)&0xf];
620627
}
628+
else
629+
*str++ = 'x';
630+
*str++ = Py_hexdigits[(ch>>4)&0xf];
631+
*str++ = Py_hexdigits[ch&0xf];
621632
}
622633
return str;
623634
}

0 commit comments

Comments
 (0)