Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 942889a

Browse files
committed
Issue #27938: Add a fast-path for us-ascii encoding
Other changes:

* Rewrite _Py_normalize_encoding() as a C implementation of encodings.normalize_encoding(). For example, " utf-8 " is now normalized to "utf_8", so the fast path is now used for more name variants of the same encoding.
* Avoid strcpy() when encoding is NULL: call the UTF-8 codec directly.
1 parent a9ab165 commit 942889a

1 file changed

Lines changed: 110 additions & 56 deletions

File tree

Objects/unicodeobject.c

Lines changed: 110 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -3100,9 +3100,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
31003100
return v;
31013101
}
31023102

3103-
/* Convert encoding to lower case and replace '_' with '-' in order to
3104-
catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
3105-
1 on success. */
3103+
/* Normalize an encoding name: C implementation of
3104+
encodings.normalize_encoding(). Return 1 on success, or 0 on error (encoding
3105+
is longer than lower_len-1). */
31063106
int
31073107
_Py_normalize_encoding(const char *encoding,
31083108
char *lower,
@@ -3111,30 +3111,39 @@ _Py_normalize_encoding(const char *encoding,
31113111
const char *e;
31123112
char *l;
31133113
char *l_end;
3114+
int punct;
3115+
3116+
assert(encoding != NULL);
31143117

3115-
if (encoding == NULL) {
3116-
/* 6 == strlen("utf-8") + 1 */
3117-
if (lower_len < 6)
3118-
return 0;
3119-
strcpy(lower, "utf-8");
3120-
return 1;
3121-
}
31223118
e = encoding;
31233119
l = lower;
31243120
l_end = &lower[lower_len - 1];
3125-
while (*e) {
3126-
if (l == l_end)
3127-
return 0;
3128-
if (Py_ISUPPER(*e)) {
3129-
*l++ = Py_TOLOWER(*e++);
3121+
punct = 0;
3122+
while (1) {
3123+
char c = *e;
3124+
if (c == 0) {
3125+
break;
31303126
}
3131-
else if (*e == '_') {
3132-
*l++ = '-';
3133-
e++;
3127+
3128+
if (Py_ISALNUM(c) || c == '.') {
3129+
if (punct && l != lower) {
3130+
if (l == l_end) {
3131+
return 0;
3132+
}
3133+
*l++ = '_';
3134+
}
3135+
punct = 0;
3136+
3137+
if (l == l_end) {
3138+
return 0;
3139+
}
3140+
*l++ = Py_TOLOWER(c);
31343141
}
31353142
else {
3136-
*l++ = *e++;
3143+
punct = 1;
31373144
}
3145+
3146+
e++;
31383147
}
31393148
*l = '\0';
31403149
return 1;
@@ -3148,28 +3157,51 @@ PyUnicode_Decode(const char *s,
31483157
{
31493158
PyObject *buffer = NULL, *unicode;
31503159
Py_buffer info;
3151-
char lower[11]; /* Enough for any encoding shortcut */
3160+
char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3161+
3162+
if (encoding == NULL) {
3163+
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3164+
}
31523165

31533166
/* Shortcuts for common default encodings */
3154-
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3155-
if ((strcmp(lower, "utf-8") == 0) ||
3156-
(strcmp(lower, "utf8") == 0))
3157-
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3158-
else if ((strcmp(lower, "latin-1") == 0) ||
3159-
(strcmp(lower, "latin1") == 0) ||
3160-
(strcmp(lower, "iso-8859-1") == 0) ||
3161-
(strcmp(lower, "iso8859-1") == 0))
3162-
return PyUnicode_DecodeLatin1(s, size, errors);
3163-
#ifdef HAVE_MBCS
3164-
else if (strcmp(lower, "mbcs") == 0)
3165-
return PyUnicode_DecodeMBCS(s, size, errors);
3166-
#endif
3167-
else if (strcmp(lower, "ascii") == 0)
3168-
return PyUnicode_DecodeASCII(s, size, errors);
3169-
else if (strcmp(lower, "utf-16") == 0)
3170-
return PyUnicode_DecodeUTF16(s, size, errors, 0);
3171-
else if (strcmp(lower, "utf-32") == 0)
3172-
return PyUnicode_DecodeUTF32(s, size, errors, 0);
3167+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3168+
char *lower = buflower;
3169+
3170+
/* Fast paths */
3171+
if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3172+
lower += 3;
3173+
if (*lower == '_') {
3174+
/* Match "utf8" and "utf_8" */
3175+
lower++;
3176+
}
3177+
3178+
if (lower[0] == '8' && lower[1] == 0) {
3179+
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3180+
}
3181+
else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3182+
return PyUnicode_DecodeUTF16(s, size, errors, 0);
3183+
}
3184+
else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3185+
return PyUnicode_DecodeUTF32(s, size, errors, 0);
3186+
}
3187+
}
3188+
else {
3189+
if (strcmp(lower, "ascii") == 0
3190+
|| strcmp(lower, "us_ascii") == 0) {
3191+
return PyUnicode_DecodeASCII(s, size, errors);
3192+
}
3193+
#ifdef HAVE_MBCS
3194+
else if (strcmp(lower, "mbcs") == 0) {
3195+
return PyUnicode_DecodeMBCS(s, size, errors);
3196+
}
3197+
#endif
3198+
else if (strcmp(lower, "latin1") == 0
3199+
|| strcmp(lower, "latin_1") == 0
3200+
|| strcmp(lower, "iso_8859_1") == 0
3201+
|| strcmp(lower, "iso8859_1") == 0) {
3202+
return PyUnicode_DecodeLatin1(s, size, errors);
3203+
}
3204+
}
31733205
}
31743206

31753207
/* Decode via the codec registry */
@@ -3512,34 +3544,56 @@ PyUnicode_AsEncodedString(PyObject *unicode,
35123544
const char *errors)
35133545
{
35143546
PyObject *v;
3515-
char lower[11]; /* Enough for any encoding shortcut */
3547+
char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
35163548

35173549
if (!PyUnicode_Check(unicode)) {
35183550
PyErr_BadArgument();
35193551
return NULL;
35203552
}
35213553

3554+
if (encoding == NULL) {
3555+
return _PyUnicode_AsUTF8String(unicode, errors);
3556+
}
3557+
35223558
/* Shortcuts for common default encodings */
3523-
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3524-
if ((strcmp(lower, "utf-8") == 0) ||
3525-
(strcmp(lower, "utf8") == 0))
3526-
{
3527-
if (errors == NULL || strcmp(errors, "strict") == 0)
3528-
return _PyUnicode_AsUTF8String(unicode, NULL);
3529-
else
3559+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3560+
char *lower = buflower;
3561+
3562+
/* Fast paths */
3563+
if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3564+
lower += 3;
3565+
if (*lower == '_') {
3566+
/* Match "utf8" and "utf_8" */
3567+
lower++;
3568+
}
3569+
3570+
if (lower[0] == '8' && lower[1] == 0) {
35303571
return _PyUnicode_AsUTF8String(unicode, errors);
3572+
}
3573+
else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3574+
return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3575+
}
3576+
else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3577+
return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3578+
}
35313579
}
3532-
else if ((strcmp(lower, "latin-1") == 0) ||
3533-
(strcmp(lower, "latin1") == 0) ||
3534-
(strcmp(lower, "iso-8859-1") == 0) ||
3535-
(strcmp(lower, "iso8859-1") == 0))
3536-
return _PyUnicode_AsLatin1String(unicode, errors);
3580+
else {
3581+
if (strcmp(lower, "ascii") == 0
3582+
|| strcmp(lower, "us_ascii") == 0) {
3583+
return _PyUnicode_AsASCIIString(unicode, errors);
3584+
}
35373585
#ifdef HAVE_MBCS
3538-
else if (strcmp(lower, "mbcs") == 0)
3539-
return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3586+
else if (strcmp(lower, "mbcs") == 0) {
3587+
return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3588+
}
35403589
#endif
3541-
else if (strcmp(lower, "ascii") == 0)
3542-
return _PyUnicode_AsASCIIString(unicode, errors);
3590+
else if (strcmp(lower, "latin1") == 0 ||
3591+
strcmp(lower, "latin_1") == 0 ||
3592+
strcmp(lower, "iso_8859_1") == 0 ||
3593+
strcmp(lower, "iso8859_1") == 0) {
3594+
return _PyUnicode_AsLatin1String(unicode, errors);
3595+
}
3596+
}
35433597
}
35443598

35453599
/* Encode via the codec registry */

0 commit comments

Comments
 (0)