Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 99f4b13

Browse files
committed
Fix unicode_decode_utf8() perf regression
1 parent 33b1c4a commit 99f4b13

1 file changed

Lines changed: 75 additions & 33 deletions

File tree

Objects/unicodeobject.c

Lines changed: 75 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -4750,35 +4750,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47504750

47514751

47524752
static int
4753-
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
4754-
const char *s, Py_ssize_t size,
4755-
_Py_error_handler error_handler, const char *errors,
4756-
Py_ssize_t *consumed)
4753+
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
4754+
const char *starts, const char *s, const char *end,
4755+
_Py_error_handler error_handler,
4756+
const char *errors,
4757+
Py_ssize_t *consumed)
47574758
{
4758-
const char *starts = s;
4759-
const char *end = s + size;
4760-
4761-
// fast path: try ASCII string.
4762-
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
4763-
return -1;
4764-
}
4765-
4766-
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
4767-
if (writer->kind == PyUnicode_1BYTE_KIND
4768-
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
4769-
{
4770-
Py_ssize_t decoded = ascii_decode(s, end, dest);
4771-
writer->pos += decoded;
4772-
4773-
if (decoded == size) {
4774-
if (consumed) {
4775-
*consumed = size;
4776-
}
4777-
return 0;
4778-
}
4779-
s += decoded;
4780-
}
4781-
47824759
Py_ssize_t startinpos, endinpos;
47834760
const char *errmsg = "";
47844761
PyObject *error_handler_obj = NULL;
@@ -4828,6 +4805,8 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
48284805
endinpos = startinpos + ch - 1;
48294806
break;
48304807
default:
4808+
// ch doesn't fit into kind, so change the buffer kind to write
4809+
// the character
48314810
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
48324811
goto onError;
48334812
continue;
@@ -4899,8 +4878,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48994878
Py_ssize_t *consumed)
49004879
{
49014880
if (size == 0) {
4902-
if (consumed)
4881+
if (consumed) {
49034882
*consumed = 0;
4883+
}
49044884
_Py_RETURN_UNICODE_EMPTY();
49054885
}
49064886

@@ -4912,19 +4892,81 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
49124892
return get_latin1_char((unsigned char)s[0]);
49134893
}
49144894

4895+
// fast path: try ASCII string.
4896+
const char *starts = s;
4897+
const char *end = s + size;
4898+
PyObject *u = PyUnicode_New(size, 127);
4899+
if (u == NULL) {
4900+
return NULL;
4901+
}
4902+
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4903+
if (decoded == size) {
4904+
if (consumed) {
4905+
*consumed = size;
4906+
}
4907+
return u;
4908+
}
4909+
s += decoded;
4910+
size -= decoded;
4911+
4912+
// Use _PyUnicodeWriter after fast path is failed.
49154913
_PyUnicodeWriter writer;
4916-
_PyUnicodeWriter_Init(&writer);
4914+
_PyUnicodeWriter_InitWithBuffer(&writer, u);
4915+
writer.pos = decoded;
49174916

4918-
if (unicode_decode_utf8_writer(&writer, s, size,
4919-
error_handler, errors,
4920-
consumed) < 0) {
4917+
if (unicode_decode_utf8_impl(&writer, starts, s, end,
4918+
error_handler, errors,
4919+
consumed) < 0) {
49214920
_PyUnicodeWriter_Dealloc(&writer);
49224921
return NULL;
49234922
}
49244923
return _PyUnicodeWriter_Finish(&writer);
49254924
}
49264925

49274926

4927+
static int
4928+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
4929+
const char *s, Py_ssize_t size,
4930+
_Py_error_handler error_handler, const char *errors,
4931+
Py_ssize_t *consumed)
4932+
{
4933+
if (size == 0) {
4934+
if (consumed) {
4935+
*consumed = 0;
4936+
}
4937+
return 0;
4938+
}
4939+
4940+
// fast path: try ASCII string.
4941+
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
4942+
return -1;
4943+
}
4944+
4945+
const char *starts = s;
4946+
const char *end = s + size;
4947+
Py_ssize_t decoded = 0;
4948+
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
4949+
if (writer->kind == PyUnicode_1BYTE_KIND
4950+
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
4951+
{
4952+
decoded = ascii_decode(s, end, dest);
4953+
writer->pos += decoded;
4954+
4955+
if (decoded == size) {
4956+
if (consumed) {
4957+
*consumed = size;
4958+
}
4959+
return 0;
4960+
}
4961+
s += decoded;
4962+
size -= decoded;
4963+
}
4964+
4965+
return unicode_decode_utf8_impl(writer, starts, s, end,
4966+
error_handler, errors, consumed);
4967+
}
4968+
4969+
49284970
PyObject *
49294971
PyUnicode_DecodeUTF8Stateful(const char *s,
49304972
Py_ssize_t size,

0 commit comments

Comments
 (0)