Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit af02e1c

Browse files
author
Victor Stinner
committed
Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale()
* PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() decode a string from the current locale encoding.
* _Py_char2wchar() writes an "error code" in the size argument to indicate if the function failed because of memory allocation failure or because of a decoding error. The function doesn't write the error message directly to stderr.
* Fix time.strftime() (if wcsftime() is missing): decode strftime() result from the current locale encoding, not from the filesystem encoding.
1 parent 3607e3d commit af02e1c

7 files changed

Lines changed: 174 additions & 84 deletions

File tree

Doc/c-api/unicode.rst

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python
699699
throughout the interpreter whenever coercion to Unicode is needed.
700700
701701
702+
Locale Encoding
703+
"""""""""""""""
704+
705+
The current locale encoding can be used to decode text from the operating
706+
system.
707+
708+
.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape)
709+
710+
Decode a string from the current locale encoding. The decoder is strict if
711+
*surrogateescape* is equal to zero, otherwise it uses the
712+
``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable
713+
bytes. If a byte sequence can be decoded as a surrogate character and
714+
*surrogateescape* is not equal to zero, the byte sequence is escaped using
715+
the ``'surrogateescape'`` error handler instead of being decoded. *str*
716+
must end with a null character but cannot contain embedded null characters.
717+
718+
.. seealso::
719+
720+
Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from
721+
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
722+
Python startup).
723+
724+
.. versionadded:: 3.3
725+
726+
727+
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape)
728+
729+
Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string
730+
length using :c:func:`strlen`.
731+
732+
.. versionadded:: 3.3
733+
734+
702735
File System Encoding
703736
""""""""""""""""""""
704737
@@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
739772
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
740773
locale encoding.
741774
775+
.. seealso::
776+
777+
:c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
778+
locale encoding and cannot be modified later. If you need to decode a
779+
string from the current locale encoding, use
780+
:c:func:`PyUnicode_DecodeLocaleAndSize`.
781+
742782
.. versionchanged:: 3.2
743783
Use ``'strict'`` error handler on Windows.
744784

Include/unicodeobject.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
15951595
);
15961596
#endif
15971597

1598+
/* --- Locale encoding --------------------------------------------------- */
1599+
1600+
/* Decode a string from the current locale encoding. The decoder is strict if
1601+
*surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
1602+
error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
1603+
be decoded as a surrogate character and *surrogateescape* is not equal to
1604+
zero, the byte sequence is escaped using the 'surrogateescape' error handler
1605+
instead of being decoded. *str* must end with a null character but cannot
1606+
contain embedded null characters. */
1607+
1608+
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1609+
const char *str,
1610+
Py_ssize_t len,
1611+
int surrogateescape);
1612+
1613+
/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1614+
length using strlen(). */
1615+
1616+
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1617+
const char *str,
1618+
int surrogateescape);
1619+
15981620
/* --- File system encoding ---------------------------------------------- */
15991621

16001622
/* ParseTuple converter: encode str objects to bytes using

Modules/_localemodule.c

Lines changed: 10 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales.");
4242

4343
static PyObject *Error;
4444

45-
/* Convert a char* to a Unicode object according to the current locale */
46-
static PyObject*
47-
str2uni(const char* s)
48-
{
49-
#ifdef HAVE_BROKEN_MBSTOWCS
50-
size_t needed = strlen(s);
51-
#else
52-
size_t needed = mbstowcs(NULL, s, 0);
53-
#endif
54-
size_t res1;
55-
wchar_t smallbuf[30];
56-
wchar_t *dest;
57-
PyObject *res2;
58-
if (needed == (size_t)-1) {
59-
PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string");
60-
return NULL;
61-
}
62-
if (needed*sizeof(wchar_t) < sizeof(smallbuf))
63-
dest = smallbuf;
64-
else {
65-
dest = PyMem_Malloc((needed+1)*sizeof(wchar_t));
66-
if (!dest)
67-
return PyErr_NoMemory();
68-
}
69-
/* This shouldn't fail now */
70-
res1 = mbstowcs(dest, s, needed+1);
71-
#ifdef HAVE_BROKEN_MBSTOWCS
72-
assert(res1 != (size_t)-1);
73-
#else
74-
assert(res1 == needed);
75-
#endif
76-
res2 = PyUnicode_FromWideChar(dest, res1);
77-
if (dest != smallbuf)
78-
PyMem_Free(dest);
79-
return res2;
80-
}
81-
8245
/* support functions for formatting floating point numbers */
8346

8447
PyDoc_STRVAR(setlocale__doc__,
@@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
149112
PyErr_SetString(Error, "unsupported locale setting");
150113
return NULL;
151114
}
152-
result_object = str2uni(result);
115+
result_object = PyUnicode_DecodeLocale(result, 0);
153116
if (!result_object)
154117
return NULL;
155118
} else {
@@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
159122
PyErr_SetString(Error, "locale query failed");
160123
return NULL;
161124
}
162-
result_object = str2uni(result);
125+
result_object = PyUnicode_DecodeLocale(result, 0);
163126
}
164127
return result_object;
165128
}
@@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self)
185148
involved herein */
186149

187150
#define RESULT_STRING(s)\
188-
x = str2uni(l->s); \
151+
x = PyUnicode_DecodeLocale(l->s, 0); \
189152
if (!x) goto failed;\
190153
PyDict_SetItemString(result, #s, x);\
191154
Py_XDECREF(x)
@@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args)
476439
instead of an empty string for nl_langinfo(ERA). */
477440
const char *result = nl_langinfo(item);
478441
result = result != NULL ? result : "";
479-
return str2uni(result);
442+
return PyUnicode_DecodeLocale(result, 0);
480443
}
481444
PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant");
482445
return NULL;
@@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args)
495458
char *in;
496459
if (!PyArg_ParseTuple(args, "s", &in))
497460
return 0;
498-
return str2uni(gettext(in));
461+
return PyUnicode_DecodeLocale(gettext(in), 0);
499462
}
500463

501464
PyDoc_STRVAR(dgettext__doc__,
@@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args)
508471
char *domain, *in;
509472
if (!PyArg_ParseTuple(args, "zs", &domain, &in))
510473
return 0;
511-
return str2uni(dgettext(domain, in));
474+
return PyUnicode_DecodeLocale(dgettext(domain, in), 0);
512475
}
513476

514477
PyDoc_STRVAR(dcgettext__doc__,
@@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args)
522485
int category;
523486
if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category))
524487
return 0;
525-
return str2uni(dcgettext(domain,msgid,category));
488+
return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0);
526489
}
527490

528491
PyDoc_STRVAR(textdomain__doc__,
@@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args)
540503
PyErr_SetFromErrno(PyExc_OSError);
541504
return NULL;
542505
}
543-
return str2uni(domain);
506+
return PyUnicode_DecodeLocale(domain, 0);
544507
}
545508

546509
PyDoc_STRVAR(bindtextdomain__doc__,
@@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args)
572535
PyErr_SetFromErrno(PyExc_OSError);
573536
return NULL;
574537
}
575-
result = str2uni(current_dirname);
538+
result = PyUnicode_DecodeLocale(current_dirname, 0);
576539
Py_XDECREF(dirname_bytes);
577540
return result;
578541
}
@@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
590553
return NULL;
591554
codeset = bind_textdomain_codeset(domain, codeset);
592555
if (codeset)
593-
return str2uni(codeset);
556+
return PyUnicode_DecodeLocale(codeset, 0);
594557
Py_RETURN_NONE;
595558
}
596559
#endif

Modules/main.c

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv)
495495
/* Use utf-8 on Mac OS X */
496496
unicode = PyUnicode_FromString(p);
497497
#else
498-
wchar_t *wchar;
499-
size_t len;
500-
wchar = _Py_char2wchar(p, &len);
501-
if (wchar == NULL)
502-
continue;
503-
unicode = PyUnicode_FromWideChar(wchar, len);
504-
PyMem_Free(wchar);
498+
unicode = PyUnicode_DecodeLocale(p, 1);
505499
#endif
506-
if (unicode == NULL)
500+
if (unicode == NULL) {
501+
/* ignore errors */
502+
PyErr_Clear();
507503
continue;
504+
}
508505
PySys_AddWarnOptionUnicode(unicode);
509506
Py_DECREF(unicode);
510507
}

Modules/timemodule.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args)
532532
#ifdef HAVE_WCSFTIME
533533
ret = PyUnicode_FromWideChar(outbuf, buflen);
534534
#else
535-
ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen);
535+
ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1);
536536
#endif
537537
PyMem_Free(outbuf);
538538
break;
@@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) {
764764
#endif /* PYOS_OS2 */
765765
#endif
766766
PyModule_AddIntConstant(m, "daylight", daylight);
767-
otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0]));
768-
otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1]));
767+
otz0 = PyUnicode_DecodeLocale(tzname[0], 1);
768+
otz1 = PyUnicode_DecodeLocale(tzname[1], 1);
769769
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
770770
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
771771
#ifdef HAVE_STRUCT_TM_TM_ZONE

Objects/unicodeobject.c

Lines changed: 78 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3234,6 +3234,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
32343234
return NULL;
32353235
}
32363236

3237+
PyObject*
3238+
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3239+
int surrogateescape)
3240+
{
3241+
wchar_t smallbuf[256];
3242+
size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3243+
wchar_t *wstr;
3244+
size_t wlen, wlen2;
3245+
PyObject *unicode;
3246+
3247+
if (str[len] != '\0' || len != strlen(str)) {
3248+
PyErr_SetString(PyExc_TypeError, "embedded null character");
3249+
return NULL;
3250+
}
3251+
3252+
if (surrogateescape)
3253+
{
3254+
wstr = _Py_char2wchar(str, &wlen);
3255+
if (wstr == NULL) {
3256+
if (wlen == (size_t)-1)
3257+
PyErr_NoMemory();
3258+
else
3259+
PyErr_SetFromErrno(PyExc_OSError);
3260+
return NULL;
3261+
}
3262+
3263+
unicode = PyUnicode_FromWideChar(wstr, wlen);
3264+
PyMem_Free(wstr);
3265+
}
3266+
else {
3267+
#ifndef HAVE_BROKEN_MBSTOWCS
3268+
wlen = mbstowcs(NULL, str, 0);
3269+
#else
3270+
wlen = len;
3271+
#endif
3272+
if (wlen == (size_t)-1) {
3273+
PyErr_SetFromErrno(PyExc_OSError);
3274+
return NULL;
3275+
}
3276+
if (wlen+1 <= smallbuf_len) {
3277+
wstr = smallbuf;
3278+
}
3279+
else {
3280+
if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3281+
return PyErr_NoMemory();
3282+
3283+
wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3284+
if (!wstr)
3285+
return PyErr_NoMemory();
3286+
}
3287+
3288+
/* This shouldn't fail now */
3289+
wlen2 = mbstowcs(wstr, str, wlen+1);
3290+
if (wlen2 == (size_t)-1) {
3291+
if (wstr != smallbuf)
3292+
PyMem_Free(wstr);
3293+
PyErr_SetFromErrno(PyExc_OSError);
3294+
return NULL;
3295+
}
3296+
#ifdef HAVE_BROKEN_MBSTOWCS
3297+
assert(wlen2 == wlen);
3298+
#endif
3299+
unicode = PyUnicode_FromWideChar(wstr, wlen2);
3300+
if (wstr != smallbuf)
3301+
PyMem_Free(wstr);
3302+
}
3303+
return unicode;
3304+
}
3305+
3306+
PyObject*
3307+
PyUnicode_DecodeLocale(const char *str, int surrogateescape)
3308+
{
3309+
Py_ssize_t size = (Py_ssize_t)strlen(str);
3310+
return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
3311+
}
3312+
3313+
32373314
PyObject*
32383315
PyUnicode_DecodeFSDefault(const char *s) {
32393316
Py_ssize_t size = (Py_ssize_t)strlen(s);
@@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
32643341
"surrogateescape");
32653342
}
32663343
else {
3267-
/* locale encoding with surrogateescape */
3268-
wchar_t *wchar;
3269-
PyObject *unicode;
3270-
size_t len;
3271-
3272-
if (s[size] != '\0' || size != strlen(s)) {
3273-
PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3274-
return NULL;
3275-
}
3276-
3277-
wchar = _Py_char2wchar(s, &len);
3278-
if (wchar == NULL)
3279-
return PyErr_NoMemory();
3280-
3281-
unicode = PyUnicode_FromWideChar(wchar, len);
3282-
PyMem_Free(wchar);
3283-
return unicode;
3344+
return PyUnicode_DecodeLocaleAndSize(s, size, 1);
32843345
}
32853346
#endif
32863347
}

0 commit comments

Comments
 (0)