Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 31be90b

Browse files
author
Victor Stinner
committed
Issue #8092: Fix PyUnicode_EncodeUTF8() to support error handler producing
unicode string (eg. backslashreplace)
1 parent 29619b2 commit 31be90b

3 files changed

Lines changed: 93 additions & 47 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,16 @@ def test_decoder_state(self):
571571
def test_lone_surrogates(self):
572572
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
573573
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
574+
self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
575+
b'[\\udc80]')
576+
self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
577+
b'[&#56448;]')
578+
self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
579+
b'[\x80]')
580+
self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
581+
b'[]')
582+
self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
583+
b'[?]')
574584

575585
def test_surrogatepass_handler(self):
576586
self.assertEquals("abc\ud800def".encode("utf-8", "surrogatepass"),

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ What's New in Python 3.2 Alpha 1?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #8092: Fix PyUnicode_EncodeUTF8() to support error handler producing
16+
unicode string (eg. backslashreplace)
17+
1518
- Issue #8485: PyUnicode_FSConverter() doesn't accept bytearray object anymore,
1619
you have to convert your bytearray filenames to bytes
1720

Objects/unicodeobject.c

Lines changed: 80 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
159159
const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160160
Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161161

162+
static void raise_encode_exception(PyObject **exceptionObject,
163+
const char *encoding,
164+
const Py_UNICODE *unicode, Py_ssize_t size,
165+
Py_ssize_t startpos, Py_ssize_t endpos,
166+
const char *reason);
167+
162168
/* Same for linebreaks */
163169
static unsigned char ascii_linebreak[] = {
164170
0, 0, 0, 0, 0, 0, 0, 0,
@@ -2542,61 +2548,88 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
25422548
/* Encode Latin-1 */
25432549
*p++ = (char)(0xc0 | (ch >> 6));
25442550
*p++ = (char)(0x80 | (ch & 0x3f));
2545-
}
2546-
else {
2547-
/* Encode UCS2 Unicode ordinals */
2548-
if (ch < 0x10000) {
2551+
} else if (0xD800 <= ch && ch <= 0xDFFF) {
25492552
#ifndef Py_UNICODE_WIDE
2550-
/* Special case: check for high surrogate */
2551-
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2552-
Py_UCS4 ch2 = s[i];
2553-
/* Check for low surrogate and combine the two to
2554-
form a UCS4 value */
2555-
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2556-
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2557-
i++;
2558-
goto encodeUCS4;
2559-
}
2560-
/* Fall through: handles isolated high surrogates */
2561-
}
2553+
/* Special case: check for high and low surrogate */
2554+
if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2555+
Py_UCS4 ch2 = s[i];
2556+
/* Combine the two surrogates to form a UCS4 value */
2557+
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2558+
i++;
2559+
2560+
/* Encode UCS4 Unicode ordinals */
2561+
*p++ = (char)(0xf0 | (ch >> 18));
2562+
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2563+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2564+
*p++ = (char)(0x80 | (ch & 0x3f));
2565+
25622566
#endif
2563-
if (ch >= 0xd800 && ch <= 0xdfff) {
2564-
Py_ssize_t newpos;
2565-
PyObject *rep;
2566-
char *prep;
2567-
int k;
2568-
rep = unicode_encode_call_errorhandler
2569-
(errors, &errorHandler, "utf-8", "surrogates not allowed",
2570-
s, size, &exc, i-1, i, &newpos);
2571-
if (!rep)
2572-
goto error;
2573-
/* Implementation limitations: only support error handler that return
2574-
bytes, and only support up to four replacement bytes. */
2575-
if (!PyBytes_Check(rep)) {
2576-
PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2577-
Py_DECREF(rep);
2567+
} else {
2568+
Py_ssize_t newpos;
2569+
PyObject *rep;
2570+
Py_ssize_t repsize, k;
2571+
rep = unicode_encode_call_errorhandler
2572+
(errors, &errorHandler, "utf-8", "surrogates not allowed",
2573+
s, size, &exc, i-1, i, &newpos);
2574+
if (!rep)
2575+
goto error;
2576+
2577+
if (PyBytes_Check(rep))
2578+
repsize = PyBytes_GET_SIZE(rep);
2579+
else
2580+
repsize = PyUnicode_GET_SIZE(rep);
2581+
2582+
if (repsize > 4) {
2583+
Py_ssize_t offset;
2584+
2585+
if (result == NULL)
2586+
offset = p - stackbuf;
2587+
else
2588+
offset = p - PyBytes_AS_STRING(result);
2589+
2590+
if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2591+
/* integer overflow */
2592+
PyErr_NoMemory();
25782593
goto error;
25792594
}
2580-
if (PyBytes_Size(rep) > 4) {
2581-
PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2582-
Py_DECREF(rep);
2583-
goto error;
2595+
nallocated += repsize - 4;
2596+
if (result != NULL) {
2597+
if (_PyBytes_Resize(&result, nallocated) < 0)
2598+
goto error;
2599+
} else {
2600+
result = PyBytes_FromStringAndSize(NULL, nallocated);
2601+
if (result == NULL)
2602+
goto error;
2603+
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
25842604
}
2585-
prep = PyBytes_AsString(rep);
2586-
for(k = PyBytes_Size(rep); k > 0; k--)
2605+
p = PyBytes_AS_STRING(result) + offset;
2606+
}
2607+
2608+
if (PyBytes_Check(rep)) {
2609+
char *prep = PyBytes_AS_STRING(rep);
2610+
for(k = repsize; k > 0; k--)
25872611
*p++ = *prep++;
2588-
Py_DECREF(rep);
2589-
continue;
2590-
2612+
} else /* rep is unicode */ {
2613+
Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2614+
Py_UNICODE c;
2615+
2616+
for(k=0; k<repsize; k++) {
2617+
c = prep[k];
2618+
if (0x80 <= c) {
2619+
raise_encode_exception(&exc, "utf-8", s, size,
2620+
i-1, i, "surrogates not allowed");
2621+
goto error;
2622+
}
2623+
*p++ = (char)prep[k];
2624+
}
25912625
}
2592-
*p++ = (char)(0xe0 | (ch >> 12));
2593-
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2594-
*p++ = (char)(0x80 | (ch & 0x3f));
2595-
continue;
2626+
Py_DECREF(rep);
25962627
}
2597-
#ifndef Py_UNICODE_WIDE
2598-
encodeUCS4:
2599-
#endif
2628+
} else if (ch < 0x10000) {
2629+
*p++ = (char)(0xe0 | (ch >> 12));
2630+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2631+
*p++ = (char)(0x80 | (ch & 0x3f));
2632+
} else /* ch >= 0x10000 */ {
26002633
/* Encode UCS4 Unicode ordinals */
26012634
*p++ = (char)(0xf0 | (ch >> 18));
26022635
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));

0 commit comments

Comments
 (0)