@@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
117117static PyUnicodeObject * unicode_freelist = NULL ;
118118static int unicode_freelist_size = 0 ;
119119
120+ /* Default encoding to use and assume when NULL is passed as encoding
121+ parameter; it is initialized by _PyUnicode_Init().
122+
123+ Always use the PyUnicode_SetDefaultEncoding() and
124+ PyUnicode_GetDefaultEncoding() APIs to access this global.
125+
126+ */
127+
/* Storage for the default encoding name. The capacity is fixed at 100
   bytes; PyUnicode_SetDefaultEncoding() copies into this buffer with a
   bound derived from sizeof(unicode_default_encoding). */
128+ static char unicode_default_encoding [100 ];
129+
120130/* --- Unicode Object ----------------------------------------------------- */
121131
122132static
@@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
366376 Py_INCREF (unicode_empty );
367377 return (PyObject * )unicode_empty ;
368378 }
369- return PyUnicode_DecodeUTF8 (s , len , "strict" );
379+ return PyUnicode_Decode (s , len , NULL , "strict" );
370380}
371381
372382PyObject * PyUnicode_Decode (const char * s ,
@@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
376386{
377387 PyObject * buffer = NULL , * unicode ;
378388
379- /* Shortcut for the default encoding UTF-8 */
380- if (encoding == NULL ||
381- (strcmp (encoding , "utf-8" ) == 0 ))
389+ if (encoding == NULL )
390+ encoding = PyUnicode_GetDefaultEncoding ();
391+
392+ /* Shortcuts for common default encodings */
393+ if (strcmp (encoding , "utf-8" ) == 0 )
382394 return PyUnicode_DecodeUTF8 (s , size , errors );
395+ else if (strcmp (encoding , "latin-1" ) == 0 )
396+ return PyUnicode_DecodeLatin1 (s , size , errors );
397+ else if (strcmp (encoding , "ascii" ) == 0 )
398+ return PyUnicode_DecodeASCII (s , size , errors );
383399
384400 /* Decode via the codec registry */
385401 buffer = PyBuffer_FromMemory ((void * )s , size );
@@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
428444 PyErr_BadArgument ();
429445 goto onError ;
430446 }
431- /* Shortcut for the default encoding UTF-8 */
432- if ((encoding == NULL ||
433- (strcmp (encoding , "utf-8" ) == 0 )) &&
434- errors == NULL )
447+
448+ if (encoding == NULL )
449+ encoding = PyUnicode_GetDefaultEncoding ();
450+
451+ /* Shortcuts for common default encodings */
452+ if (errors == NULL ) {
453+ if (strcmp (encoding , "utf-8" ) == 0 )
435454 return PyUnicode_AsUTF8String (unicode );
455+ else if (strcmp (encoding , "latin-1" ) == 0 )
456+ return PyUnicode_AsLatin1String (unicode );
457+ else if (strcmp (encoding , "ascii" ) == 0 )
458+ return PyUnicode_AsASCIIString (unicode );
459+ }
436460
437461 /* Encode via the codec registry */
438462 v = PyCodec_Encode (unicode , encoding , errors );
@@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
476500 return -1 ;
477501}
478502
503+ const char * PyUnicode_GetDefaultEncoding ()
504+ {
505+ return unicode_default_encoding ;
506+ }
507+
508+ int PyUnicode_SetDefaultEncoding (const char * encoding )
509+ {
510+ PyObject * v ;
511+
512+ /* Make sure the encoding is valid. As side effect, this also
513+ loads the encoding into the codec registry cache. */
514+ v = _PyCodec_Lookup (encoding );
515+ if (v == NULL )
516+ goto onError ;
517+ Py_DECREF (v );
518+ strncpy (unicode_default_encoding ,
519+ encoding ,
520+ sizeof (unicode_default_encoding ));
521+ return 0 ;
522+
523+ onError :
524+ return -1 ;
525+ }
526+
479527/* --- UTF-8 Codec -------------------------------------------------------- */
480528
481529static
@@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
772820 }
773821 else {
774822 PyErr_Format (PyExc_ValueError ,
775- "UTF-16 decoding error; unknown error handling code: %.400s" ,
823+ "UTF-16 decoding error; "
824+ "unknown error handling code: %.400s" ,
776825 errors );
777826 return -1 ;
778827 }
@@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
/* Help text for S.encode([encoding[,errors]]). In this hunk the '-' lines
   carry the previous wording (default encoding 'UTF-8') and the '+' lines
   the replacement wording (default is the current default string
   encoding). */
30573106static char encode__doc__ [] =
30583107"S.encode([encoding[,errors]]) -> string\n\
30593108\n\
3060- Return an encoded string version of S. Default encoding is 'UTF-8'. \n\
3061- errors may be given to set a different error handling scheme. Default \n\
3062- is 'strict' meaning that encoding errors raise a ValueError. Other \n\
3063- possible values are 'ignore' and 'replace'." ;
3109+ Return an encoded string version of S. Default encoding is the current \n\
3110+ default string encoding. errors may be given to set a different error\n\
3111+ handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3112+ a ValueError. Other possible values are 'ignore' and 'replace'." ;
30643113
30653114static PyObject *
30663115unicode_encode (PyUnicodeObject * self , PyObject * args )
@@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
/* Return the encoded form of self as produced by
   PyUnicode_AsEncodedString() with NULL encoding and NULL errors; a NULL
   encoding is replaced there by PyUnicode_GetDefaultEncoding(). The '-'
   line below is the previous implementation, which called
   PyUnicode_AsUTF8String() directly. */
38163865static
38173866PyObject * unicode_str (PyUnicodeObject * self )
38183867{
3819- return PyUnicode_AsUTF8String ((PyObject * )self );
3868+ return PyUnicode_AsEncodedString ((PyObject * )self , NULL , NULL );
38203869}
38213870
38223871static char strip__doc__ [] =
@@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
42464295 return NULL ;
42474296 }
42484297 uformat = PyUnicode_FromObject (format );
4298+ if (uformat == NULL )
4299+ return NULL ;
42494300 fmt = PyUnicode_AS_UNICODE (uformat );
42504301 fmtcnt = PyUnicode_GET_SIZE (uformat );
42514302
@@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
43224373 "incomplete format key" );
43234374 goto onError ;
43244375 }
4325- /* keys are converted to strings ( using UTF-8) and
4376+ /* keys are converted to strings using UTF-8 and
43264377 then looked up since Python uses strings to hold
43274378 variables names etc. in its namespaces and we
4328- wouldn't want to break common idioms. The
4329- alternative would be using Unicode objects for the
4330- lookup but u"abc" and "abc" have different hash
4331- values (on purpose). */
4379+ wouldn't want to break common idioms. */
43324380 key = PyUnicode_EncodeUTF8 (keystart ,
43334381 keylen ,
43344382 NULL );
@@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
44724520 "%s argument has non-string str()" );
44734521 goto onError ;
44744522 }
4475- unicode = PyUnicode_DecodeUTF8 (PyString_AS_STRING (temp ),
4523+ unicode = PyUnicode_Decode (PyString_AS_STRING (temp ),
44764524 PyString_GET_SIZE (temp ),
4525+ NULL ,
44774526 "strict" );
44784527 Py_DECREF (temp );
44794528 temp = unicode ;
@@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
46594708 Py_FatalError ("Unicode configuration error: "
46604709 "sizeof(Py_UNICODE) != 2 bytes" );
46614710
4711+ /* Init the implementation */
46624712 unicode_empty = _PyUnicode_New (0 );
4713+ strcpy (unicode_default_encoding , "utf-8" );
46634714}
46644715
46654716/* Finalize the Unicode implementation */
0 commit comments