@@ -3100,9 +3100,9 @@ PyUnicode_FromEncodedObject(PyObject *obj,
31003100 return v ;
31013101}
31023102
3103- /* Convert encoding to lower case and replace '_' with '-' in order to
3104- catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
3105- 1 on success . */
3103+ /* Normalize an encoding name: C implementation of
3104+ encodings.normalize_encoding(). Return 1 on success, or 0 on error (encoding
3105+ is longer than lower_len-1) . */
31063106int
31073107_Py_normalize_encoding (const char * encoding ,
31083108 char * lower ,
@@ -3111,30 +3111,39 @@ _Py_normalize_encoding(const char *encoding,
31113111 const char * e ;
31123112 char * l ;
31133113 char * l_end ;
3114+ int punct ;
3115+
3116+ assert (encoding != NULL );
31143117
3115- if (encoding == NULL ) {
3116- /* 6 == strlen("utf-8") + 1 */
3117- if (lower_len < 6 )
3118- return 0 ;
3119- strcpy (lower , "utf-8" );
3120- return 1 ;
3121- }
31223118 e = encoding ;
31233119 l = lower ;
31243120 l_end = & lower [lower_len - 1 ];
3125- while ( * e ) {
3126- if ( l == l_end )
3127- return 0 ;
3128- if (Py_ISUPPER ( * e ) ) {
3129- * l ++ = Py_TOLOWER ( * e ++ ) ;
3121+ punct = 0 ;
3122+ while ( 1 ) {
3123+ char c = * e ;
3124+ if (c == 0 ) {
3125+ break ;
31303126 }
3131- else if (* e == '_' ) {
3132- * l ++ = '-' ;
3133- e ++ ;
3127+
3128+ if (Py_ISALNUM (c ) || c == '.' ) {
3129+ if (punct && l != lower ) {
3130+ if (l == l_end ) {
3131+ return 0 ;
3132+ }
3133+ * l ++ = '_' ;
3134+ }
3135+ punct = 0 ;
3136+
3137+ if (l == l_end ) {
3138+ return 0 ;
3139+ }
3140+ * l ++ = Py_TOLOWER (c );
31343141 }
31353142 else {
3136- * l ++ = * e ++ ;
3143+ punct = 1 ;
31373144 }
3145+
3146+ e ++ ;
31383147 }
31393148 * l = '\0' ;
31403149 return 1 ;
@@ -3148,28 +3157,51 @@ PyUnicode_Decode(const char *s,
31483157{
31493158 PyObject * buffer = NULL , * unicode ;
31503159 Py_buffer info ;
3151- char lower [11 ]; /* Enough for any encoding shortcut */
3160+ char buflower [11 ]; /* sizeof("iso_8859_1") == 11 (10 chars + NUL), longest normalized shortcut */
3161+
3162+ if (encoding == NULL ) {
3163+ return PyUnicode_DecodeUTF8Stateful (s , size , errors , NULL );
3164+ }
31523165
31533166 /* Shortcuts for common default encodings */
3154- if (_Py_normalize_encoding (encoding , lower , sizeof (lower ))) {
3155- if ((strcmp (lower , "utf-8" ) == 0 ) ||
3156- (strcmp (lower , "utf8" ) == 0 ))
3157- return PyUnicode_DecodeUTF8Stateful (s , size , errors , NULL );
3158- else if ((strcmp (lower , "latin-1" ) == 0 ) ||
3159- (strcmp (lower , "latin1" ) == 0 ) ||
3160- (strcmp (lower , "iso-8859-1" ) == 0 ) ||
3161- (strcmp (lower , "iso8859-1" ) == 0 ))
3162- return PyUnicode_DecodeLatin1 (s , size , errors );
3163- #ifdef HAVE_MBCS
3164- else if (strcmp (lower , "mbcs ") == 0 )
3165- return PyUnicode_DecodeMBCS (s , size , errors );
3166- #endif
3167- else if (strcmp (lower , "ascii ") == 0 )
3168- return PyUnicode_DecodeASCII (s , size , errors );
3169- else if (strcmp (lower , "utf -16 ") == 0 )
3170- return PyUnicode_DecodeUTF16 (s , size , errors , 0 );
3171- else if (strcmp (lower , "utf - 32 ") == 0 )
3172- return PyUnicode_DecodeUTF32 (s , size , errors , 0 );
3167+ if (_Py_normalize_encoding (encoding , buflower , sizeof (buflower ))) {
3168+ char * lower = buflower ;
3169+
3170+ /* Fast paths */
3171+ if (lower [0 ] == 'u' && lower [1 ] == 't' && lower [2 ] == 'f' ) {
3172+ lower += 3 ;
3173+ if (* lower == '_' ) {
3174+ /* Match "utf8" and "utf_8" */
3175+ lower ++ ;
3176+ }
3177+
3178+ if (lower [0 ] == '8' && lower [1 ] == 0 ) {
3179+ return PyUnicode_DecodeUTF8Stateful (s , size , errors , NULL );
3180+ }
3181+ else if (lower [0 ] == '1' && lower [1 ] == '6' && lower [2 ] == 0 ) {
3182+ return PyUnicode_DecodeUTF16 (s , size , errors , 0 );
3183+ }
3184+ else if (lower [0 ] == '3' && lower [1 ] == '2' && lower [2 ] == 0 ) {
3185+ return PyUnicode_DecodeUTF32 (s , size , errors , 0 );
3186+ }
3187+ }
3188+ else {
3189+ if (strcmp (lower , "ascii" ) == 0
3190+ || strcmp (lower , "us_ascii" ) == 0 ) {
3191+ return PyUnicode_DecodeASCII (s , size , errors );
3192+ }
3193+ #ifdef HAVE_MBCS
3194+ else if (strcmp (lower , "mbcs ") == 0 ) {
3195+ return PyUnicode_DecodeMBCS (s , size , errors );
3196+ }
3197+ #endif
3198+ else if (strcmp (lower , "latin1 ") == 0
3199+ || strcmp (lower , "latin_1 ") == 0
3200+ || strcmp (lower , "iso_8859_1 ") == 0
3201+ || strcmp (lower , "iso8859_1 ") == 0 ) {
3202+ return PyUnicode_DecodeLatin1 (s , size , errors );
3203+ }
3204+ }
31733205 }
31743206
31753207 /* Decode via the codec registry */
@@ -3512,34 +3544,56 @@ PyUnicode_AsEncodedString(PyObject *unicode,
35123544 const char * errors )
35133545{
35143546 PyObject * v ;
3515- char lower [11 ]; /* Enough for any encoding shortcut */
3547+ char buflower [11 ]; /* sizeof("iso_8859_1") == 11 (10 chars + NUL), longest normalized shortcut */
35163548
35173549 if (!PyUnicode_Check (unicode )) {
35183550 PyErr_BadArgument ();
35193551 return NULL ;
35203552 }
35213553
3554+ if (encoding == NULL ) {
3555+ return _PyUnicode_AsUTF8String (unicode , errors );
3556+ }
3557+
35223558 /* Shortcuts for common default encodings */
3523- if (_Py_normalize_encoding (encoding , lower , sizeof (lower ))) {
3524- if ((strcmp (lower , "utf-8" ) == 0 ) ||
3525- (strcmp (lower , "utf8" ) == 0 ))
3526- {
3527- if (errors == NULL || strcmp (errors , "strict" ) == 0 )
3528- return _PyUnicode_AsUTF8String (unicode , NULL );
3529- else
3559+ if (_Py_normalize_encoding (encoding , buflower , sizeof (buflower ))) {
3560+ char * lower = buflower ;
3561+
3562+ /* Fast paths */
3563+ if (lower [0 ] == 'u' && lower [1 ] == 't' && lower [2 ] == 'f' ) {
3564+ lower += 3 ;
3565+ if (* lower == '_' ) {
3566+ /* Match "utf8" and "utf_8" */
3567+ lower ++ ;
3568+ }
3569+
3570+ if (lower [0 ] == '8' && lower [1 ] == 0 ) {
35303571 return _PyUnicode_AsUTF8String (unicode , errors );
3572+ }
3573+ else if (lower [0 ] == '1' && lower [1 ] == '6' && lower [2 ] == 0 ) {
3574+ return _PyUnicode_EncodeUTF16 (unicode , errors , 0 );
3575+ }
3576+ else if (lower [0 ] == '3' && lower [1 ] == '2' && lower [2 ] == 0 ) {
3577+ return _PyUnicode_EncodeUTF32 (unicode , errors , 0 );
3578+ }
35313579 }
3532- else if (( strcmp ( lower , "latin-1" ) == 0 ) ||
3533- (strcmp (lower , "latin1 " ) == 0 ) ||
3534- ( strcmp (lower , "iso-8859-1 " ) == 0 ) ||
3535- ( strcmp ( lower , "iso8859-1" ) == 0 ))
3536- return _PyUnicode_AsLatin1String ( unicode , errors );
3580+ else {
3581+ if (strcmp (lower , "ascii " ) == 0
3582+ || strcmp (lower , "us_ascii " ) == 0 ) {
3583+ return _PyUnicode_AsASCIIString ( unicode , errors );
3584+ }
35373585#ifdef HAVE_MBCS
3538- else if (strcmp (lower , "mbcs ") == 0 )
3539- return PyUnicode_EncodeCodePage (CP_ACP , unicode , errors );
3586+ else if (strcmp (lower , "mbcs ") == 0 ) {
3587+ return PyUnicode_EncodeCodePage (CP_ACP , unicode , errors );
3588+ }
35403589#endif
3541- else if (strcmp (lower , "ascii ") == 0 )
3542- return _PyUnicode_AsASCIIString (unicode , errors );
3590+ else if (strcmp (lower , "latin1 ") == 0 ||
3591+ strcmp (lower , "latin_1 ") == 0 ||
3592+ strcmp (lower , "iso_8859_1 ") == 0 ||
3593+ strcmp (lower , "iso8859_1 ") == 0 ) {
3594+ return _PyUnicode_AsLatin1String (unicode , errors );
3595+ }
3596+ }
35433597 }
35443598
35453599 /* Encode via the codec registry */
0 commit comments