Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0d3721d

Browse files
author
Victor Stinner
committed
Issue #13441: Temporarily disable the check on the maximum character until
the Solaris issue is solved. However, add assertions on the maximum character in various encoders: UTF-7, UTF-8, wide character (wchar_t*, Py_UNICODE*), unicode-escape, raw-unicode-escape. Also fix unicode_encode_ucs1() for the backslashreplace error handler: Python is now always "wide".
1 parent f8facac commit 0d3721d

1 file changed

Lines changed: 12 additions & 20 deletions

File tree

Objects/unicodeobject.c

Lines changed: 12 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -379,19 +379,6 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
379379
if (ch > maxchar)
380380
maxchar = ch;
381381
}
382-
if (maxchar > 0x10FFFF) {
383-
printf("Invalid Unicode string! {");
384-
for (i=0; i < ascii->length; i++)
385-
{
386-
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
387-
if (i)
388-
printf(", U+%04x", ch);
389-
else
390-
printf("U+%04x", ch);
391-
}
392-
printf("} (len=%lu)\n", ascii->length);
393-
abort();
394-
}
395382
if (kind == PyUnicode_1BYTE_KIND) {
396383
if (ascii->state.ascii == 0) {
397384
assert(maxchar >= 128);
@@ -406,7 +393,9 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
406393
}
407394
else {
408395
assert(maxchar >= 0x10000);
409-
assert(maxchar <= 0x10FFFF);
396+
/* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
397+
return characters outside the range U+0000-U+10FFFF. */
398+
/* assert(maxchar <= 0x10FFFF); */
410399
}
411400
}
412401
return 1;
@@ -3482,6 +3471,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
34823471
four_bytes = PyUnicode_4BYTE_DATA(unicode);
34833472
for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
34843473
if (*four_bytes > 0xFFFF) {
3474+
assert(*four_bytes <= 0x10FFFF);
34853475
/* encode surrogate pair in this case */
34863476
*w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
34873477
*w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
@@ -4128,6 +4118,8 @@ _PyUnicode_EncodeUTF7(PyObject *str,
41284118
continue;
41294119
encode_char:
41304120
if (ch >= 0x10000) {
4121+
assert(ch <= 0x10FFFF);
4122+
41314123
/* code first surrogate */
41324124
base64bits += 16;
41334125
base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
@@ -4899,6 +4891,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
48994891
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
49004892
*p++ = (char)(0x80 | (ch & 0x3f));
49014893
} else /* ch >= 0x10000 */ {
4894+
assert(ch <= 0x10FFFF);
49024895
/* Encode UCS4 Unicode ordinals */
49034896
*p++ = (char)(0xf0 | (ch >> 18));
49044897
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
@@ -5971,6 +5964,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
59715964

59725965
/* Map 21-bit characters to '\U00xxxxxx' */
59735966
else if (ch >= 0x10000) {
5967+
assert(ch <= 0x10FFFF);
59745968
*p++ = '\\';
59755969
*p++ = 'U';
59765970
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -6191,6 +6185,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
61916185
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
61926186
/* Map 32-bit characters to '\Uxxxxxxxx' */
61936187
if (ch >= 0x10000) {
6188+
assert(ch <= 0x10FFFF);
61946189
*p++ = '\\';
61956190
*p++ = 'U';
61966191
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6546,17 +6541,14 @@ unicode_encode_ucs1(PyObject *unicode,
65466541
repsize += 2+3+1;
65476542
else if (ch < 10000)
65486543
repsize += 2+4+1;
6549-
#ifndef Py_UNICODE_WIDE
6550-
else
6551-
repsize += 2+5+1;
6552-
#else
65536544
else if (ch < 100000)
65546545
repsize += 2+5+1;
65556546
else if (ch < 1000000)
65566547
repsize += 2+6+1;
6557-
else
6548+
else {
6549+
assert(ch <= 0x10FFFF);
65586550
repsize += 2+7+1;
6559-
#endif
6551+
}
65606552
}
65616553
requiredsize = respos+repsize+(size-collend);
65626554
if (requiredsize > ressize) {

0 commit comments

Comments
 (0)