Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit e4315f5

Browse files
committed
M.-A. Lemburg <[email protected]>:
Added support for user settable default encodings. The current implementation uses a per-process global which defines the value of the encoding parameter in case it is set to NULL (meaning: use the default encoding).
1 parent aff6018 commit e4315f5

1 file changed

Lines changed: 71 additions & 20 deletions

File tree

Objects/unicodeobject.c

Lines changed: 71 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,16 @@ static PyUnicodeObject *unicode_empty = NULL;
117117
static PyUnicodeObject *unicode_freelist = NULL;
118118
static int unicode_freelist_size = 0;
119119

120+
/* Default encoding to use and assume when NULL is passed as encoding
121+
parameter; it is initialized by _PyUnicode_Init().
122+
123+
Always use the PyUnicode_SetDefaultEncoding() and
124+
PyUnicode_GetDefaultEncoding() APIs to access this global.
125+
126+
*/
127+
128+
static char unicode_default_encoding[100];
129+
120130
/* --- Unicode Object ----------------------------------------------------- */
121131

122132
static
@@ -366,7 +376,7 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
366376
Py_INCREF(unicode_empty);
367377
return (PyObject *)unicode_empty;
368378
}
369-
return PyUnicode_DecodeUTF8(s, len, "strict");
379+
return PyUnicode_Decode(s, len, NULL, "strict");
370380
}
371381

372382
PyObject *PyUnicode_Decode(const char *s,
@@ -376,10 +386,16 @@ PyObject *PyUnicode_Decode(const char *s,
376386
{
377387
PyObject *buffer = NULL, *unicode;
378388

379-
/* Shortcut for the default encoding UTF-8 */
380-
if (encoding == NULL ||
381-
(strcmp(encoding, "utf-8") == 0))
389+
if (encoding == NULL)
390+
encoding = PyUnicode_GetDefaultEncoding();
391+
392+
/* Shortcuts for common default encodings */
393+
if (strcmp(encoding, "utf-8") == 0)
382394
return PyUnicode_DecodeUTF8(s, size, errors);
395+
else if (strcmp(encoding, "latin-1") == 0)
396+
return PyUnicode_DecodeLatin1(s, size, errors);
397+
else if (strcmp(encoding, "ascii") == 0)
398+
return PyUnicode_DecodeASCII(s, size, errors);
383399

384400
/* Decode via the codec registry */
385401
buffer = PyBuffer_FromMemory((void *)s, size);
@@ -428,11 +444,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
428444
PyErr_BadArgument();
429445
goto onError;
430446
}
431-
/* Shortcut for the default encoding UTF-8 */
432-
if ((encoding == NULL ||
433-
(strcmp(encoding, "utf-8") == 0)) &&
434-
errors == NULL)
447+
448+
if (encoding == NULL)
449+
encoding = PyUnicode_GetDefaultEncoding();
450+
451+
/* Shortcuts for common default encodings */
452+
if (errors == NULL) {
453+
if (strcmp(encoding, "utf-8") == 0)
435454
return PyUnicode_AsUTF8String(unicode);
455+
else if (strcmp(encoding, "latin-1") == 0)
456+
return PyUnicode_AsLatin1String(unicode);
457+
else if (strcmp(encoding, "ascii") == 0)
458+
return PyUnicode_AsASCIIString(unicode);
459+
}
436460

437461
/* Encode via the codec registry */
438462
v = PyCodec_Encode(unicode, encoding, errors);
@@ -476,6 +500,30 @@ int PyUnicode_GetSize(PyObject *unicode)
476500
return -1;
477501
}
478502

503+
const char *PyUnicode_GetDefaultEncoding()
504+
{
505+
return unicode_default_encoding;
506+
}
507+
508+
int PyUnicode_SetDefaultEncoding(const char *encoding)
509+
{
510+
PyObject *v;
511+
512+
/* Make sure the encoding is valid. As side effect, this also
513+
loads the encoding into the codec registry cache. */
514+
v = _PyCodec_Lookup(encoding);
515+
if (v == NULL)
516+
goto onError;
517+
Py_DECREF(v);
518+
strncpy(unicode_default_encoding,
519+
encoding,
520+
sizeof(unicode_default_encoding));
521+
return 0;
522+
523+
onError:
524+
return -1;
525+
}
526+
479527
/* --- UTF-8 Codec -------------------------------------------------------- */
480528

481529
static
@@ -772,7 +820,8 @@ int utf16_decoding_error(const Py_UNICODE **source,
772820
}
773821
else {
774822
PyErr_Format(PyExc_ValueError,
775-
"UTF-16 decoding error; unknown error handling code: %.400s",
823+
"UTF-16 decoding error; "
824+
"unknown error handling code: %.400s",
776825
errors);
777826
return -1;
778827
}
@@ -3057,10 +3106,10 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
30573106
static char encode__doc__[] =
30583107
"S.encode([encoding[,errors]]) -> string\n\
30593108
\n\
3060-
Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
3061-
errors may be given to set a different error handling scheme. Default\n\
3062-
is 'strict' meaning that encoding errors raise a ValueError. Other\n\
3063-
possible values are 'ignore' and 'replace'.";
3109+
Return an encoded string version of S. Default encoding is the current\n\
3110+
default string encoding. errors may be given to set a different error\n\
3111+
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3112+
a ValueError. Other possible values are 'ignore' and 'replace'.";
30643113

30653114
static PyObject *
30663115
unicode_encode(PyUnicodeObject *self, PyObject *args)
@@ -3816,7 +3865,7 @@ unicode_splitlines(PyUnicodeObject *self, PyObject *args)
38163865
static
38173866
PyObject *unicode_str(PyUnicodeObject *self)
38183867
{
3819-
return PyUnicode_AsUTF8String((PyObject *)self);
3868+
return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
38203869
}
38213870

38223871
static char strip__doc__[] =
@@ -4246,6 +4295,8 @@ PyObject *PyUnicode_Format(PyObject *format,
42464295
return NULL;
42474296
}
42484297
uformat = PyUnicode_FromObject(format);
4298+
if (uformat == NULL)
4299+
return NULL;
42494300
fmt = PyUnicode_AS_UNICODE(uformat);
42504301
fmtcnt = PyUnicode_GET_SIZE(uformat);
42514302

@@ -4322,13 +4373,10 @@ PyObject *PyUnicode_Format(PyObject *format,
43224373
"incomplete format key");
43234374
goto onError;
43244375
}
4325-
/* keys are converted to strings (using UTF-8) and
4376+
/* keys are converted to strings using UTF-8 and
43264377
then looked up since Python uses strings to hold
43274378
variables names etc. in its namespaces and we
4328-
wouldn't want to break common idioms. The
4329-
alternative would be using Unicode objects for the
4330-
lookup but u"abc" and "abc" have different hash
4331-
values (on purpose). */
4379+
wouldn't want to break common idioms. */
43324380
key = PyUnicode_EncodeUTF8(keystart,
43334381
keylen,
43344382
NULL);
@@ -4472,8 +4520,9 @@ PyObject *PyUnicode_Format(PyObject *format,
44724520
"%s argument has non-string str()");
44734521
goto onError;
44744522
}
4475-
unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4523+
unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
44764524
PyString_GET_SIZE(temp),
4525+
NULL,
44774526
"strict");
44784527
Py_DECREF(temp);
44794528
temp = unicode;
@@ -4659,7 +4708,9 @@ void _PyUnicode_Init()
46594708
Py_FatalError("Unicode configuration error: "
46604709
"sizeof(Py_UNICODE) != 2 bytes");
46614710

4711+
/* Init the implementation */
46624712
unicode_empty = _PyUnicode_New(0);
4713+
strcpy(unicode_default_encoding, "utf-8");
46634714
}
46644715

46654716
/* Finalize the Unicode implementation */

0 commit comments

Comments
 (0)