Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit f2ea71f

Browse files
author
Victor Stinner
committed
Issue #13560: Add PyUnicode_EncodeLocale()
* Use PyUnicode_EncodeLocale() in time.strftime() if wcsftime() is not available * Document my last changes in Misc/NEWS
1 parent 9987d93 commit f2ea71f

5 files changed

Lines changed: 177 additions & 37 deletions

File tree

Doc/c-api/unicode.rst

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ system.
713713
bytes. If a byte sequence can be decoded as a surrogate character and
714714
*surrogateescape* is not equal to zero, the byte sequence is escaped using
715715
the ``'surrogateescape'`` error handler instead of being decoded. *str*
716-
must end with a null character but cannot contain embedded null character.
716+
must end with a null character but cannot contain embedded null characters.
717717
718718
.. seealso::
719719
@@ -732,6 +732,22 @@ system.
732732
.. versionadded:: 3.3
733733
734734
735+
.. c:function:: PyObject* PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
736+
737+
Encode a Unicode object to the current locale encoding. The encoder is
738+
strict if *surrogateescape* is equal to zero, otherwise it uses the
739+
``'surrogateescape'`` error handler (:pep:`383`). Return a :class:`bytes`
740+
object. *unicode* cannot contain embedded null characters.
741+
742+
.. seealso::
743+
744+
Use :c:func:`PyUnicode_EncodeFSDefault` to encode a string to
745+
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
746+
Python startup).
747+
748+
.. versionadded:: 3.3
749+
750+
735751
File System Encoding
736752
""""""""""""""""""""
737753
@@ -806,6 +822,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
806822
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
807823
locale encoding.
808824
825+
.. seealso::
826+
827+
:c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
828+
locale encoding and cannot be modified later. If you need to encode a
829+
string to the current locale encoding, use
830+
:c:func:`PyUnicode_EncodeLocale`.
831+
809832
.. versionadded:: 3.2
810833
811834

Include/unicodeobject.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1603,7 +1603,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
16031603
be decoded as a surrogate character and *surrogateescape* is not equal to
16041604
zero, the byte sequence is escaped using the 'surrogateescape' error handler
16051605
instead of being decoded. *str* must end with a null character but cannot
1606-
contain embedded null character. */
1606+
contain embedded null characters. */
16071607

16081608
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
16091609
const char *str,
@@ -1617,6 +1617,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
16171617
const char *str,
16181618
int surrogateescape);
16191619

1620+
/* Encode a Unicode object to the current locale encoding. The encoder is
1621+
strict if *surrogateescape* is equal to zero, otherwise the
1622+
"surrogateescape" error handler is used. Return a bytes object. The string
1623+
cannot contain embedded null characters. */
1624+
1625+
PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1626+
PyObject *unicode,
1627+
int surrogateescape
1628+
);
1629+
16201630
/* --- File system encoding ---------------------------------------------- */
16211631

16221632
/* ParseTuple converter: encode str objects to bytes using

Misc/NEWS

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,10 @@ Core and Builtins
419419
Library
420420
-------
421421

422+
- Issue #13560: Add PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize()
423+
and PyUnicode_EncodeLocale() functions to the C API to decode/encode from/to
424+
the current locale encoding.
425+
422426
- Issue #8373: The filesystem path of AF_UNIX sockets now uses the filesystem
423427
encoding and the surrogateescape error handler, rather than UTF-8. Patch
424428
by David Watson.
@@ -451,8 +455,8 @@ Library
451455
'importlib.abc.PyPycLoader', 'nntplib.NNTP.xgtitle', 'nntplib.NNTP.xpath',
452456
and private attributes of 'smtpd.SMTPChannel'.
453457

454-
- Issue #5905: time.strftime() is now using the locale encoding, instead of
455-
UTF-8, if the wcsftime() function is not available.
458+
- Issue #5905, #13560: time.strftime() is now using the current locale
459+
encoding, instead of UTF-8, if the wcsftime() function is not available.
456460

457461
- Issue #8641: Update IDLE 3 syntax coloring to recognize b".." and not u"..".
458462
Patch by Tal Einat.

Modules/timemodule.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ time_strftime(PyObject *self, PyObject *args)
486486
fmt = format;
487487
#else
488488
/* Encode the unicode format string to bytes using the current locale encoding */
489-
format = PyUnicode_EncodeFSDefault(format_arg);
489+
format = PyUnicode_EncodeLocale(format_arg, 1);
490490
if (format == NULL)
491491
return NULL;
492492
fmt = PyBytes_AS_STRING(format);

Objects/unicodeobject.c

Lines changed: 135 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3073,6 +3073,140 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
30733073
return NULL;
30743074
}
30753075

3076+
static size_t
3077+
wcstombs_errorpos(const wchar_t *wstr)
3078+
{
3079+
size_t len;
3080+
#if SIZEOF_WCHAR_T == 2
3081+
wchar_t buf[3];
3082+
#else
3083+
wchar_t buf[2];
3084+
#endif
3085+
char outbuf[MB_LEN_MAX];
3086+
const wchar_t *start, *previous;
3087+
int save_errno;
3088+
3089+
save_errno = errno;
3090+
#if SIZEOF_WCHAR_T == 2
3091+
buf[2] = 0;
3092+
#else
3093+
buf[1] = 0;
3094+
#endif
3095+
start = wstr;
3096+
while (*wstr != L'\0')
3097+
{
3098+
previous = wstr;
3099+
#if SIZEOF_WCHAR_T == 2
3100+
if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
3101+
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
3102+
{
3103+
buf[0] = wstr[0];
3104+
buf[1] = wstr[1];
3105+
wstr += 2;
3106+
}
3107+
else {
3108+
buf[0] = *wstr;
3109+
buf[1] = 0;
3110+
wstr++;
3111+
}
3112+
#else
3113+
buf[0] = *wstr;
3114+
wstr++;
3115+
#endif
3116+
len = wcstombs(outbuf, buf, sizeof(outbuf));
3117+
if (len == (size_t)-1) {
3118+
errno = save_errno;
3119+
return previous - start;
3120+
}
3121+
}
3122+
3123+
/* failed to find the unencodable character */
3124+
errno = save_errno;
3125+
return 0;
3126+
}
3127+
3128+
PyObject *
3129+
PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
3130+
{
3131+
Py_ssize_t wlen, wlen2;
3132+
wchar_t *wstr;
3133+
PyObject *bytes = NULL;
3134+
char *errmsg;
3135+
PyObject *exc;
3136+
size_t error_pos;
3137+
3138+
wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3139+
if (wstr == NULL)
3140+
return NULL;
3141+
3142+
wlen2 = wcslen(wstr);
3143+
if (wlen2 != wlen) {
3144+
PyMem_Free(wstr);
3145+
PyErr_SetString(PyExc_TypeError, "embedded null character");
3146+
return NULL;
3147+
}
3148+
3149+
if (surrogateescape) {
3150+
/* locale encoding with surrogateescape */
3151+
char *str;
3152+
3153+
str = _Py_wchar2char(wstr, &error_pos);
3154+
if (str == NULL) {
3155+
if (error_pos == (size_t)-1) {
3156+
PyErr_NoMemory();
3157+
PyMem_Free(wstr);
3158+
return NULL;
3159+
}
3160+
else {
3161+
goto encode_error;
3162+
}
3163+
}
3164+
PyMem_Free(wstr);
3165+
3166+
bytes = PyBytes_FromString(str);
3167+
PyMem_Free(str);
3168+
}
3169+
else {
3170+
size_t len, len2;
3171+
3172+
len = wcstombs(NULL, wstr, 0);
3173+
if (len == (size_t)-1) {
3174+
error_pos = wcstombs_errorpos(wstr);
3175+
goto encode_error;
3176+
}
3177+
3178+
bytes = PyBytes_FromStringAndSize(NULL, len);
3179+
if (bytes == NULL) {
3180+
PyMem_Free(wstr);
3181+
return NULL;
3182+
}
3183+
3184+
len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3185+
if (len2 == (size_t)-1 || len2 > len) {
3186+
error_pos = wcstombs_errorpos(wstr);
3187+
goto encode_error;
3188+
}
3189+
PyMem_Free(wstr);
3190+
}
3191+
return bytes;
3192+
3193+
encode_error:
3194+
errmsg = strerror(errno);
3195+
assert(errmsg != NULL);
3196+
if (errmsg == NULL)
3197+
errmsg = "wcstombs() encountered an unencodable wide character";
3198+
PyMem_Free(wstr);
3199+
Py_XDECREF(bytes);
3200+
3201+
exc = NULL;
3202+
raise_encode_exception(&exc,
3203+
"locale", unicode,
3204+
error_pos, error_pos+1,
3205+
errmsg);
3206+
Py_XDECREF(exc);
3207+
return NULL;
3208+
}
3209+
30763210
PyObject *
30773211
PyUnicode_EncodeFSDefault(PyObject *unicode)
30783212
{
@@ -3097,38 +3231,7 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
30973231
"surrogateescape");
30983232
}
30993233
else {
3100-
/* locale encoding with surrogateescape */
3101-
wchar_t *wchar;
3102-
char *bytes;
3103-
PyObject *bytes_obj;
3104-
size_t error_pos;
3105-
3106-
wchar = PyUnicode_AsWideCharString(unicode, NULL);
3107-
if (wchar == NULL)
3108-
return NULL;
3109-
bytes = _Py_wchar2char(wchar, &error_pos);
3110-
if (bytes == NULL) {
3111-
if (error_pos != (size_t)-1) {
3112-
char *errmsg = strerror(errno);
3113-
PyObject *exc = NULL;
3114-
if (errmsg == NULL)
3115-
errmsg = "Py_wchar2char() failed";
3116-
raise_encode_exception(&exc,
3117-
"filesystemencoding", unicode,
3118-
error_pos, error_pos+1,
3119-
errmsg);
3120-
Py_XDECREF(exc);
3121-
}
3122-
else
3123-
PyErr_NoMemory();
3124-
PyMem_Free(wchar);
3125-
return NULL;
3126-
}
3127-
PyMem_Free(wchar);
3128-
3129-
bytes_obj = PyBytes_FromString(bytes);
3130-
PyMem_Free(bytes);
3131-
return bytes_obj;
3234+
return PyUnicode_EncodeLocale(unicode, 1);
31323235
}
31333236
#endif
31343237
}

0 commit comments

Comments
 (0)