Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 081dfee

Browse files
committed
Issue 4474: On platforms with sizeof(wchar_t) == 4 and
sizeof(Py_UNICODE) == 2, PyUnicode_FromWideChar now converts each character outside the BMP to the appropriate surrogate pair. Thanks Victor Stinner for the patch.
1 parent ecdfd51 commit 081dfee

3 files changed

Lines changed: 113 additions & 0 deletions

File tree

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 2?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #4474: PyUnicode_FromWideChar now converts characters outside
16+
the BMP to surrogate pairs, on systems with sizeof(wchar_t) == 4
17+
and sizeof(Py_UNICODE) == 2.
18+
1519
- Issue #5237: Allow auto-numbered fields in str.format(). For
1620
example: '{} {}'.format(1, 2) == '1 2'.
1721

Modules/_testcapimodule.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,50 @@ test_Z_code(PyObject *self)
707707
Py_RETURN_NONE;
708708
}
709709

710+
/* Self-test for PyUnicode_FromWideChar().

   The code point U+10ABCD is handed to PyUnicode_FromWideChar() as a
   wchar_t buffer and, independently, to PyUnicode_FromString() as its
   UTF-8 byte sequence.  The two resulting objects must have the same
   size and must compare equal.

   Returns None on success.  Returns NULL if either constructor fails or
   if PyUnicode_Compare() leaves an error pending; otherwise a mismatch is
   reported through raiseTestError(). */
static PyObject *
test_widechar(PyObject *self)
{
#if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
    /* A 4-byte wchar_t holds U+10ABCD in a single unit. */
    const wchar_t wtext[2] = {(wchar_t)0x10ABCDu};
    size_t wtextlen = 1;
#else
    /* Otherwise spell U+10ABCD as its surrogate pair. */
    const wchar_t wtext[3] = {(wchar_t)0xDBEAu, (wchar_t)0xDFCDu};
    size_t wtextlen = 2;
#endif
    PyObject *from_wchar;
    PyObject *from_utf8;
    int same_length;
    int differs;

    from_wchar = PyUnicode_FromWideChar(wtext, wtextlen);
    if (from_wchar == NULL) {
        return NULL;
    }

    /* "\xf4\x8a\xaf\x8d" is the UTF-8 encoding of U+10ABCD. */
    from_utf8 = PyUnicode_FromString("\xf4\x8a\xaf\x8d");
    if (from_utf8 == NULL) {
        Py_DECREF(from_wchar);
        return NULL;
    }

    /* Only compare contents when the sizes already agree. */
    same_length = (PyUnicode_GET_SIZE(from_wchar) ==
                   PyUnicode_GET_SIZE(from_utf8));
    differs = same_length ? PyUnicode_Compare(from_wchar, from_utf8) : 0;

    /* Both verdicts are in hand; the objects are no longer needed. */
    Py_DECREF(from_wchar);
    Py_DECREF(from_utf8);

    if (!same_length) {
        return raiseTestError("test_widechar",
                              "wide string and utf8 string "
                              "have different length");
    }
    if (differs) {
        /* A non-zero result may also signal a failed comparison. */
        if (PyErr_Occurred()) {
            return NULL;
        }
        return raiseTestError("test_widechar",
                              "wide string and utf8 string "
                              "are different");
    }

    Py_RETURN_NONE;
}
753+
710754
static PyObject *
711755
test_empty_argparse(PyObject *self)
712756
{
@@ -1206,6 +1250,7 @@ static PyMethodDef TestMethods[] = {
12061250
{"test_s_code", (PyCFunction)test_s_code, METH_NOARGS},
12071251
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
12081252
{"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
1253+
{"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
12091254
#ifdef WITH_THREAD
12101255
{"_test_thread_state", test_thread_state, METH_VARARGS},
12111256
{"_pending_threadfunc", pending_threadfunc, METH_VARARGS},

Objects/unicodeobject.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,66 @@ PyObject *PyUnicode_FromString(const char *u)
561561

562562
#ifdef HAVE_WCHAR_H
563563

564+
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
565+
# define CONVERT_WCHAR_TO_SURROGATES
566+
#endif
567+
568+
#ifdef CONVERT_WCHAR_TO_SURROGATES
569+
570+
/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
571+
to convert from UTF32 to UTF16. */
572+
573+
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
574+
Py_ssize_t size)
575+
{
576+
PyUnicodeObject *unicode;
577+
register Py_ssize_t i;
578+
Py_ssize_t alloc;
579+
const wchar_t *orig_w;
580+
581+
if (w == NULL) {
582+
if (size == 0)
583+
return PyUnicode_FromStringAndSize(NULL, 0);
584+
PyErr_BadInternalCall();
585+
return NULL;
586+
}
587+
588+
if (size == -1) {
589+
size = wcslen(w);
590+
}
591+
592+
alloc = size;
593+
orig_w = w;
594+
for (i = size; i > 0; i--) {
595+
if (*w > 0xFFFF)
596+
alloc++;
597+
w++;
598+
}
599+
w = orig_w;
600+
unicode = _PyUnicode_New(alloc);
601+
if (!unicode)
602+
return NULL;
603+
604+
/* Copy the wchar_t data into the new object */
605+
{
606+
register Py_UNICODE *u;
607+
u = PyUnicode_AS_UNICODE(unicode);
608+
for (i = size; i > 0; i--) {
609+
if (*w > 0xFFFF) {
610+
wchar_t ordinal = *w++;
611+
ordinal -= 0x10000;
612+
*u++ = 0xD800 | (ordinal >> 10);
613+
*u++ = 0xDC00 | (ordinal & 0x3FF);
614+
}
615+
else
616+
*u++ = *w++;
617+
}
618+
}
619+
return (PyObject *)unicode;
620+
}
621+
622+
#else
623+
564624
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
565625
Py_ssize_t size)
566626
{
@@ -597,6 +657,10 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
597657
return (PyObject *)unicode;
598658
}
599659

660+
#endif /* CONVERT_WCHAR_TO_SURROGATES */
661+
662+
#undef CONVERT_WCHAR_TO_SURROGATES
663+
600664
static void
601665
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602666
{

0 commit comments

Comments
 (0)