From 0c9d5ad737b8393fccd2718e6a77f0ec0a06afa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 13:48:05 +0100 Subject: [PATCH 01/11] Use new helpers in the `xmlcharrefreplace` handler. --- Python/codecs.c | 125 ++++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 63 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 6c9f8222079ec8..cec753c5abd116 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -730,6 +730,56 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) } +static inline void +codec_handler_unicode_log10_max(Py_UCS4 ch, uint64_t *base, uint64_t *digits) +{ +#define MAKE_BRANCH(D, N) \ + do { \ + if (ch < (N)) { \ + if (base != NULL) { \ + *base = (N); \ + } \ + if (digits != NULL) { \ + *digits = (D); \ + } \ + } \ + } while (0) + MAKE_BRANCH(1, 10); + MAKE_BRANCH(2, 100); + MAKE_BRANCH(3, 1000); + MAKE_BRANCH(4, 10000); + MAKE_BRANCH(5, 100000); + MAKE_BRANCH(6, 1000000); + MAKE_BRANCH(7, 10000000); +#undef MAKE_BRANCH + Py_UNREACHABLE(); +} + + +/* + * Write the decimal representation of 'ch' to the buffer pointed by 'p' + * using at most 7 characters prefixed by '&#' and suffixed by ';'. + */ +static inline void +codec_handler_write_unicode_dec(Py_UCS1 **p, Py_UCS4 ch) +{ + uint64_t base = 0, digits = 0; + codec_handler_unicode_log10_max(ch, &base, &digits); + assert(base != 0 && digits != 0); + assert(digits <= 7); + + *(*p)++ = '&'; + *(*p)++ = '#'; + while (digits-- > 0) { + assert(base >= 1); + *(*p)++ = '0' + ch / base; + ch %= base; + base /= 10; + } + *(*p)++ = ';'; +} + + // --- handler: 'strict' ------------------------------------------------------ PyObject *PyCodec_StrictErrors(PyObject *exc) @@ -825,9 +875,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } } + +// --- handler: 'xmlcharrefreplace' ------------------------------------------- + PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) { - if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { + if (!_PyIsUnicodeEncodeError(exc)) { wrong_exception_type(exc); return NULL; } @@ -856,28 +909,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) for (Py_ssize_t i = start; i < end; ++i) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - if (ch < 10) { - ressize += 2 + 1 + 1; - } - else if (ch < 100) { - ressize += 2 + 2 + 1; - } - else if (ch < 1000) { - ressize += 2 + 3 + 1; - } - else if (ch < 10000) { - ressize += 2 + 4 + 1; - } - else if (ch < 100000) { - ressize += 2 + 5 + 1; - } - else if (ch < 1000000) { - ressize += 2 + 6 + 1; - } - else { - assert(ch < 10000000); - ressize += 2 + 7 + 1; - } + uint64_t k = 0; + codec_handler_unicode_log10_max(ch, NULL, &k); + assert(k != 0); + assert(k <= 7); + ressize += 2 + k + 1; } /* allocate replacement */ @@ -889,46 +925,8 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); /* generate replacement */ for (Py_ssize_t i = start; i < end; ++i) { - int digits, base; Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - if (ch < 10) { - digits = 1; - base = 1; - } - else if (ch < 100) { - digits = 2; - base = 10; - } - else if (ch < 1000) { - digits = 3; - base = 100; - } - else if (ch < 10000) { - digits = 4; - base = 1000; - } - else if (ch < 100000) { - digits = 5; - base = 10000; - } - else if (ch < 1000000) { - digits = 6; - base = 100000; - } - else { - assert(ch < 10000000); - digits = 7; - base = 1000000; - } - *outp++ = '&'; - *outp++ = '#'; - while (digits-- > 0) { - assert(base >= 1); - *outp++ = '0' + ch / base; - ch %= base; - base /= 10; - } - *outp++ = ';'; + codec_handler_write_unicode_dec(&outp, ch); } assert(_PyUnicode_CheckConsistency(res, 1)); PyObject *restuple = Py_BuildValue("(Nn)", res, end); @@ -1419,7 +1417,8 @@ static PyObject *replace_errors(PyObject *self, PyObject *exc) } -static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) +static inline PyObject * +xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) { return PyCodec_XMLCharRefReplaceErrors(exc); } From cb7114a4e089d28d480879be21dcf345ab3a90c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 14:13:51 +0100 Subject: [PATCH 02/11] Fix tests --- Python/codecs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/codecs.c b/Python/codecs.c index cec753c5abd116..2ca3593a72ede7 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -743,6 +743,7 @@ codec_handler_unicode_log10_max(Py_UCS4 ch, uint64_t *base, uint64_t *digits) *digits = (D); \ } \ } \ + return; \ } while (0) MAKE_BRANCH(1, 10); MAKE_BRANCH(2, 100); From c6246930a2e22b365dd4344eaf18ea08de02ca15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 14:19:55 +0100 Subject: [PATCH 03/11] Update Python/codecs.c --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 2ca3593a72ede7..fa25357fa922a0 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -742,8 +742,8 @@ codec_handler_unicode_log10_max(Py_UCS4 ch, uint64_t *base, uint64_t *digits) if (digits != NULL) { \ *digits = (D); \ } \ + return; \ } \ - return; \ } while (0) MAKE_BRANCH(1, 10); MAKE_BRANCH(2, 100); From bf2f4de73c85e58269c138998823b6a20f4fdf32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sun, 9 Feb 2025 15:14:22 +0100 Subject: [PATCH 04/11] Fix tests --- Python/codecs.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index fa25357fa922a0..6d8ddbebe50be1 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -731,11 +731,11 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) static inline void -codec_handler_unicode_log10_max(Py_UCS4 ch, uint64_t *base, uint64_t *digits) +codec_handler_unicode_log10_max(Py_UCS4 ch, int *base, int *digits) { #define MAKE_BRANCH(D, N) \ do { \ - if (ch < (N)) { \ + if (ch < 10 * (N)) { \ if (base != NULL) { \ *base = (N); \ } \ @@ -745,13 +745,13 @@ codec_handler_unicode_log10_max(Py_UCS4 ch, uint64_t *base, uint64_t *digits) return; \ } \ } while (0) - MAKE_BRANCH(1, 10); - MAKE_BRANCH(2, 100); - MAKE_BRANCH(3, 1000); - MAKE_BRANCH(4, 10000); - MAKE_BRANCH(5, 100000); - MAKE_BRANCH(6, 1000000); - MAKE_BRANCH(7, 10000000); + MAKE_BRANCH(1, 1); + MAKE_BRANCH(2, 10); + MAKE_BRANCH(3, 100); + MAKE_BRANCH(4, 1000); + MAKE_BRANCH(5, 10000); + MAKE_BRANCH(6, 100000); + MAKE_BRANCH(7, 1000000); #undef MAKE_BRANCH Py_UNREACHABLE(); } @@ -764,7 +764,7 @@ codec_handler_unicode_log10_max(Py_UCS4 ch, uint64_t *base, uint64_t *digits) static inline void codec_handler_write_unicode_dec(Py_UCS1 **p, Py_UCS4 ch) { - uint64_t base = 0, digits = 0; + int base = 0, digits = 0; codec_handler_unicode_log10_max(ch, &base, &digits); assert(base != 0 && digits != 0); assert(digits <= 7); @@ -910,7 +910,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) for (Py_ssize_t i = start; i < end; ++i) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - uint64_t k = 0; + int k = 0; codec_handler_unicode_log10_max(ch, NULL, &k); assert(k != 0); assert(k <= 7); From 713ece5506f0cfeee362db059de010f5372697bd Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 26 Feb 2025 13:33:56 +0100 Subject: [PATCH 05/11] Get log10 only, fill buffer backwards --- Python/codecs.c | 78 ++++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 0b5d0cba68220b..4061491d93eca9 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -730,54 +730,20 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) } -static inline void -codec_handler_unicode_log10_max(Py_UCS4 ch, int *base, int *digits) -{ -#define MAKE_BRANCH(D, N) \ - do { \ - if (ch < 10 * (N)) { \ - if (base != NULL) { \ - *base = (N); \ - } \ - if (digits != NULL) { \ - *digits = (D); \ - } \ - return; \ - } \ - } while (0) - MAKE_BRANCH(1, 1); - MAKE_BRANCH(2, 10); - MAKE_BRANCH(3, 100); - MAKE_BRANCH(4, 1000); - MAKE_BRANCH(5, 10000); - MAKE_BRANCH(6, 100000); - MAKE_BRANCH(7, 1000000); -#undef MAKE_BRANCH - Py_UNREACHABLE(); -} - - -/* - * Write the decimal representation of 'ch' to the buffer pointed by 'p' - * using at most 7 characters prefixed by '&#' and suffixed by ';'. +/* Determine the number of digits for a decimal representation of codepoint ch */ -static inline void -codec_handler_write_unicode_dec(Py_UCS1 **p, Py_UCS4 ch) -{ - int base = 0, digits = 0; - codec_handler_unicode_log10_max(ch, &base, &digits); - assert(base != 0 && digits != 0); - assert(digits <= 7); - - *(*p)++ = '&'; - *(*p)++ = '#'; - while (digits-- > 0) { - assert(base >= 1); - *(*p)++ = '0' + ch / base; - ch %= base; - base /= 10; - } - *(*p)++ = ';'; +static inline int +n_decimal_digits_for_codepoint(Py_UCS4 ch) +{ + if (ch < 10) return 1; + if (ch < 100) return 2; + if (ch < 1000) return 3; + if (ch < 10000) return 4; + if (ch < 100000) return 5; + if (ch < 1000000) return 6; + if (ch < 10000000) return 7; + // Unicode codepoints are limited to 1114111 (7 decimal digits) + Py_UNREACHABLE(); } /* @@ -951,8 +917,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) for (Py_ssize_t i = start; i < end; ++i) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - int k = 0; - codec_handler_unicode_log10_max(ch, NULL, &k); + int k = n_decimal_digits_for_codepoint(ch); assert(k != 0); assert(k <= 7); ressize += 2 + k + 1; @@ -968,7 +933,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) /* generate replacement */ for (Py_ssize_t i = start; i < end; ++i) { Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - codec_handler_write_unicode_dec(&outp, ch); + /* + * Write the decimal representation of 'ch' to the buffer pointed by 'p' + * using at most 7 characters prefixed by '&#' and suffixed by ';'. + */ + *outp++ = '&'; + *outp++ = '#'; + Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); + for (Py_UCS1 *digitp = digit_end - 1; digitp >= outp; --digitp) { + *digitp = '0' + (ch % 10); + ch /= 10; + } + assert(ch == 0); + outp = digit_end; + *outp++ = ';'; } assert(_PyUnicode_CheckConsistency(res, 1)); PyObject *restuple = Py_BuildValue("(Nn)", res, end); From 6edcfef40f036284688904bc34c1a694963a9e5a Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 26 Feb 2025 13:34:31 +0100 Subject: [PATCH 06/11] Remove obsolete comment --- Python/codecs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 4061491d93eca9..686d231c3a51bf 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -915,7 +915,6 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) Py_ssize_t ressize = 0; for (Py_ssize_t i = start; i < end; ++i) { - /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); int k = n_decimal_digits_for_codepoint(ch); assert(k != 0); From b8fe3b63f9bdfaf6012085eeb9e37c9f5b3fa698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:22:57 +0100 Subject: [PATCH 07/11] post-merge --- Python/codecs.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 686d231c3a51bf..a066e2eff9e2bf 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -730,18 +730,34 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) } -/* Determine the number of digits for a decimal representation of codepoint ch +/* + * Determine the number of digits for a decimal representation of Unicode + * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits). */ static inline int n_decimal_digits_for_codepoint(Py_UCS4 ch) { - if (ch < 10) return 1; - if (ch < 100) return 2; - if (ch < 1000) return 3; - if (ch < 10000) return 4; - if (ch < 100000) return 5; - if (ch < 1000000) return 6; - if (ch < 10000000) return 7; + if (ch < 10) { + return 1; + } + if (ch < 100) { + return 2; + } + if (ch < 1000) { + return 3; + } + if (ch < 10000) { + return 4; + } + if (ch < 100000) { + return 5; + } + if (ch < 1000000) { + return 6; + } + if (ch < 10000000) { + return 7; + } // Unicode codepoints are limited to 1114111 (7 decimal digits) Py_UNREACHABLE(); } From 51664c14cdda3b093c865daa1b3cb4af18f2c8b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:23:48 +0100 Subject: [PATCH 08/11] post-merge --- Python/codecs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index a066e2eff9e2bf..029b22e84e18c1 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -954,9 +954,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) */ *outp++ = '&'; *outp++ = '#'; - Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); - for (Py_UCS1 *digitp = digit_end - 1; digitp >= outp; --digitp) { - *digitp = '0' + (ch % 10); + const Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); + for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) { + *p_digit = '0' + (ch % 10); ch /= 10; } assert(ch == 0); From c6feca6565a19e49b0ca6384cfc1da7d69712590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:24:25 +0100 Subject: [PATCH 09/11] post-merge --- Python/codecs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index 029b22e84e18c1..0059dcef795296 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -954,7 +954,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) */ *outp++ = '&'; *outp++ = '#'; - const Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); + Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch); for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) { *p_digit = '0' + (ch % 10); ch /= 10; From 97c04b5d3c1a7ba8cb9dc5eedad1588d5a343a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 3 Mar 2025 10:48:08 +0100 Subject: [PATCH 10/11] Invoke forgotten PEP-7 rule --- Python/codecs.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 0059dcef795296..799c54aeceea90 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -737,27 +737,13 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch) static inline int n_decimal_digits_for_codepoint(Py_UCS4 ch) { - if (ch < 10) { - return 1; - } - if (ch < 100) { - return 2; - } - if (ch < 1000) { - return 3; - } - if (ch < 10000) { - return 4; - } - if (ch < 100000) { - return 5; - } - if (ch < 1000000) { - return 6; - } - if (ch < 10000000) { - return 7; - } + if (ch < 10) return 1; + if (ch < 100) return 2; + if (ch < 1000) return 3; + if (ch < 10000) return 4; + if (ch < 100000) return 5; + if (ch < 1000000) return 6; + if (ch < 10000000) return 7; // Unicode codepoints are limited to 1114111 (7 decimal digits) Py_UNREACHABLE(); } From f9a305d9cf5d1f563ebab64021ca597a4317ce42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Mon, 3 Mar 2025 12:12:45 +0100 Subject: [PATCH 11/11] add empty line --- Python/codecs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/codecs.c b/Python/codecs.c index 799c54aeceea90..d5d9a4a8bcabb7 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -748,6 +748,7 @@ n_decimal_digits_for_codepoint(Py_UCS4 ch) Py_UNREACHABLE(); } + /* * Create a Unicode string containing 'count' copies of the official * Unicode REPLACEMENT CHARACTER (0xFFFD).