Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 6bd525b

Browse files
committed
Optimize error handlers of ASCII and Latin1 encoders when the replacement
string is pure ASCII: use _PyBytesWriter_WriteBytes(), don't check individual characters. Clean up unicode_encode_ucs1(): * Rename repunicode to rep * Clear rep object on error * Factor out the code shared by the bytes and unicode paths
1 parent ce179bf commit 6bd525b

2 files changed

Lines changed: 47 additions & 43 deletions

File tree

Objects/stringlib/codecs.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
311311
#if STRINGLIB_SIZEOF_CHAR > 1
312312
else if (Py_UNICODE_IS_SURROGATE(ch)) {
313313
Py_ssize_t startpos, endpos, newpos;
314-
Py_ssize_t repsize, k;
314+
Py_ssize_t k;
315315
if (error_handler == _Py_ERROR_UNKNOWN)
316316
error_handler = get_error_handler(errors);
317317

@@ -392,20 +392,12 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
392392
p = _PyBytesWriter_WriteBytes(&writer, p,
393393
PyBytes_AS_STRING(rep),
394394
PyBytes_GET_SIZE(rep));
395-
if (p == NULL)
396-
goto error;
397395
}
398396
else {
399397
/* rep is unicode */
400398
if (PyUnicode_READY(rep) < 0)
401399
goto error;
402400

403-
repsize = PyUnicode_GET_LENGTH(rep);
404-
405-
p = _PyBytesWriter_Prepare(&writer, p, repsize);
406-
if (p == NULL)
407-
goto error;
408-
409401
if (!PyUnicode_IS_ASCII(rep)) {
410402
raise_encode_exception(&exc, "utf-8",
411403
unicode,
@@ -415,9 +407,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
415407
}
416408

417409
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
418-
memcpy(p, PyUnicode_DATA(rep), repsize);
419-
p += repsize;
410+
p = _PyBytesWriter_WriteBytes(&writer, p,
411+
PyUnicode_DATA(rep),
412+
PyUnicode_GET_LENGTH(rep));
420413
}
414+
415+
if (p == NULL)
416+
goto error;
421417
Py_CLEAR(rep);
422418

423419
i = newpos;

Objects/unicodeobject.c

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6599,6 +6599,7 @@ unicode_encode_ucs1(PyObject *unicode,
65996599
PyObject *error_handler_obj = NULL;
66006600
PyObject *exc = NULL;
66016601
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6602+
PyObject *rep = NULL;
66026603
/* output object */
66036604
_PyBytesWriter writer;
66046605

@@ -6627,8 +6628,7 @@ unicode_encode_ucs1(PyObject *unicode,
66276628
++pos;
66286629
}
66296630
else {
6630-
PyObject *repunicode;
6631-
Py_ssize_t repsize, newpos, i;
6631+
Py_ssize_t newpos, i;
66326632
/* startpos for collecting unencodable chars */
66336633
Py_ssize_t collstart = pos;
66346634
Py_ssize_t collend = collstart + 1;
@@ -6694,52 +6694,59 @@ unicode_encode_ucs1(PyObject *unicode,
66946694
/* fallback to general error handling */
66956695

66966696
default:
6697-
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6698-
encoding, reason, unicode, &exc,
6699-
collstart, collend, &newpos);
6700-
if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6701-
PyUnicode_READY(repunicode) == -1))
6697+
rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6698+
encoding, reason, unicode, &exc,
6699+
collstart, collend, &newpos);
6700+
if (rep == NULL)
67026701
goto onError;
67036702

67046703
/* subtract preallocated bytes */
67056704
writer.min_size -= 1;
67066705

6707-
if (PyBytes_Check(repunicode)) {
6706+
if (PyBytes_Check(rep)) {
67086707
/* Directly copy bytes result to output. */
67096708
str = _PyBytesWriter_WriteBytes(&writer, str,
6710-
PyBytes_AS_STRING(repunicode),
6711-
PyBytes_GET_SIZE(repunicode));
6709+
PyBytes_AS_STRING(rep),
6710+
PyBytes_GET_SIZE(rep));
67126711
if (str == NULL)
67136712
goto onError;
6714-
6715-
pos = newpos;
6716-
Py_DECREF(repunicode);
6717-
break;
67186713
}
6714+
else {
6715+
assert(PyUnicode_Check(rep));
67196716

6720-
/* need more space? (at least enough for what we
6721-
have+the replacement+the rest of the string, so
6722-
we won't have to check space for encodable characters) */
6723-
repsize = PyUnicode_GET_LENGTH(repunicode);
6717+
if (PyUnicode_READY(rep) < 0)
6718+
goto onError;
67246719

6725-
str = _PyBytesWriter_Prepare(&writer, str, repsize);
6726-
if (str == NULL)
6727-
goto onError;
6720+
if (PyUnicode_IS_ASCII(rep)) {
6721+
/* Fast path: all characters are smaller than limit */
6722+
assert(limit >= 128);
6723+
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6724+
str = _PyBytesWriter_WriteBytes(&writer, str,
6725+
PyUnicode_DATA(rep),
6726+
PyUnicode_GET_LENGTH(rep));
6727+
}
6728+
else {
6729+
Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
67286730

6729-
/* check if there is anything unencodable in the replacement
6730-
and copy it to the output */
6731-
for (i = 0; repsize-->0; ++i, ++str) {
6732-
ch = PyUnicode_READ_CHAR(repunicode, i);
6733-
if (ch >= limit) {
6734-
raise_encode_exception(&exc, encoding, unicode,
6735-
pos, pos+1, reason);
6736-
Py_DECREF(repunicode);
6737-
goto onError;
6731+
str = _PyBytesWriter_Prepare(&writer, str, repsize);
6732+
if (str == NULL)
6733+
goto onError;
6734+
6735+
/* check if there is anything unencodable in the
6736+
replacement and copy it to the output */
6737+
for (i = 0; repsize-->0; ++i, ++str) {
6738+
ch = PyUnicode_READ_CHAR(rep, i);
6739+
if (ch >= limit) {
6740+
raise_encode_exception(&exc, encoding, unicode,
6741+
pos, pos+1, reason);
6742+
goto onError;
6743+
}
6744+
*str = (char)ch;
6745+
}
67386746
}
6739-
*str = (char)ch;
67406747
}
67416748
pos = newpos;
6742-
Py_DECREF(repunicode);
6749+
Py_CLEAR(rep);
67436750
}
67446751

67456752
/* If overallocation was disabled, ensure that it was the last
@@ -6753,6 +6760,7 @@ unicode_encode_ucs1(PyObject *unicode,
67536760
return _PyBytesWriter_Finish(&writer, str);
67546761

67556762
onError:
6763+
Py_XDECREF(rep);
67566764
_PyBytesWriter_Dealloc(&writer);
67576765
Py_XDECREF(error_handler_obj);
67586766
Py_XDECREF(exc);

0 commit comments

Comments
 (0)