Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5593d8a

Browse files
author
Victor Stinner
committed
Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() replace
UTF-16 surrogate pairs with single non-BMP characters for 16-bit Py_UNICODE and 32-bit wchar_t (e.g. Linux in narrow build).
1 parent 1c24bd0 commit 5593d8a

3 files changed

Lines changed: 130 additions & 23 deletions

File tree

Lib/test/test_unicode.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,6 +1419,17 @@ def test_aswidechar(self):
14191419
self.assertEquals(size, 7)
14201420
self.assertEquals(wchar, 'abc\0def\0')
14211421

1422+
nonbmp = chr(0x10ffff)
1423+
if sizeof(c_wchar) == 2:
1424+
buflen = 3
1425+
nchar = 2
1426+
else: # sizeof(c_wchar) == 4
1427+
buflen = 2
1428+
nchar = 1
1429+
wchar, size = test_aswidechar(nonbmp, buflen)
1430+
self.assertEquals(size, nchar)
1431+
self.assertEquals(wchar, nonbmp + '\0')
1432+
14221433
# Test PyUnicode_AsWideCharString()
14231434
def test_aswidecharstring(self):
14241435
from _testcapi import test_aswidecharstring
@@ -1432,6 +1443,15 @@ def test_aswidecharstring(self):
14321443
self.assertEquals(size, 7)
14331444
self.assertEquals(wchar, 'abc\0def\0')
14341445

1446+
nonbmp = chr(0x10ffff)
1447+
if sizeof(c_wchar) == 2:
1448+
nchar = 2
1449+
else: # sizeof(c_wchar) == 4
1450+
nchar = 1
1451+
wchar, size = test_aswidecharstring(nonbmp)
1452+
self.assertEquals(size, nchar)
1453+
self.assertEquals(wchar, nonbmp + '\0')
1454+
14351455

14361456
def test_main():
14371457
support.run_unittest(__name__)

Misc/NEWS

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,14 @@ What's New in Python 3.2 Alpha 3?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #8670: PyUnicode_AsWideChar() and PyUnicode_AsWideCharString() replace
14+
UTF-16 surrogate pairs with single non-BMP characters for 16-bit Py_UNICODE
15+
and 32-bit wchar_t (e.g. Linux in narrow build).
16+
1317
- Issue #10006: type.__abstractmethods__ now raises an AttributeError.
1418

1519
- Issue #10003: Allow handling of SIGBREAK on Windows. Fixes a regression
16-
introduced by issue #9324.
20+
introduced by issue #9324.
1721

1822
- Issue #9979: Create function PyUnicode_AsWideCharString().
1923

Objects/unicodeobject.c

Lines changed: 105 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,19 +1153,112 @@ PyUnicode_FromFormat(const char *format, ...)
11531153
return ret;
11541154
}
11551155

1156-
static void
1156+
/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1157+
convert a Unicode object to a wide character string.
1158+
1159+
- If w is NULL: return the number of wide characters (including the nul
1160+
character) required to convert the unicode object. Ignore size argument.
1161+
1162+
- Otherwise: return the number of wide characters (excluding the nul
1163+
character) written into w. Write at most size wide characters (including
1164+
the nul character). */
1165+
static Py_ssize_t
11571166
unicode_aswidechar(PyUnicodeObject *unicode,
11581167
wchar_t *w,
11591168
Py_ssize_t size)
11601169
{
11611170
#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1162-
memcpy(w, unicode->str, size * sizeof(wchar_t));
1163-
#else
1164-
register Py_UNICODE *u;
1171+
Py_ssize_t res;
1172+
if (w != NULL) {
1173+
res = PyUnicode_GET_SIZE(unicode);
1174+
if (size > res)
1175+
size = res + 1;
1176+
else
1177+
res = size;
1178+
memcpy(w, unicode->str, size * sizeof(wchar_t));
1179+
return res;
1180+
}
1181+
else
1182+
return PyUnicode_GET_SIZE(unicode) + 1;
1183+
#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1184+
register const Py_UNICODE *u;
1185+
const Py_UNICODE *uend;
1186+
const wchar_t *worig, *wend;
1187+
Py_ssize_t nchar;
1188+
1189+
u = PyUnicode_AS_UNICODE(unicode);
1190+
uend = u + PyUnicode_GET_SIZE(unicode);
1191+
if (w != NULL) {
1192+
worig = w;
1193+
wend = w + size;
1194+
while (u != uend && w != wend) {
1195+
if (0xD800 <= u[0] && u[0] <= 0xDBFF
1196+
&& 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1197+
{
1198+
*w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1199+
u += 2;
1200+
}
1201+
else {
1202+
*w = *u;
1203+
u++;
1204+
}
1205+
w++;
1206+
}
1207+
if (w != wend)
1208+
*w = L'\0';
1209+
return w - worig;
1210+
}
1211+
else {
1212+
nchar = 1; /* nul character at the end */
1213+
while (u != uend) {
1214+
if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215+
&& 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216+
u += 2;
1217+
else
1218+
u++;
1219+
nchar++;
1220+
}
1221+
}
1222+
return nchar;
1223+
#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1224+
register Py_UNICODE *u, *uend, ordinal;
11651225
register Py_ssize_t i;
1226+
wchar_t *worig, *wend;
1227+
Py_ssize_t nchar;
1228+
11661229
u = PyUnicode_AS_UNICODE(unicode);
1167-
for (i = size; i > 0; i--)
1168-
*w++ = *u++;
1230+
uend = u + PyUnicode_GET_SIZE(u);
1231+
if (w != NULL) {
1232+
worig = w;
1233+
wend = w + size;
1234+
while (u != uend && w != wend) {
1235+
ordinal = *u;
1236+
if (ordinal > 0xffff) {
1237+
ordinal -= 0x10000;
1238+
*w++ = 0xD800 | (ordinal >> 10);
1239+
*w++ = 0xDC00 | (ordinal & 0x3FF);
1240+
}
1241+
else
1242+
*w++ = ordinal;
1243+
u++;
1244+
}
1245+
if (w != wend)
1246+
*w = 0;
1247+
return w - worig;
1248+
}
1249+
else {
1250+
nchar = 1; /* nul character */
1251+
while (u != uend) {
1252+
if (*u > 0xffff)
1253+
nchar += 2;
1254+
else
1255+
nchar++;
1256+
u++;
1257+
}
1258+
return nchar;
1259+
}
1260+
#else
1261+
# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
11691262
#endif
11701263
}
11711264

@@ -1178,17 +1271,7 @@ PyUnicode_AsWideChar(PyUnicodeObject *unicode,
11781271
PyErr_BadInternalCall();
11791272
return -1;
11801273
}
1181-
1182-
/* If possible, try to copy the 0-termination as well */
1183-
if (size > PyUnicode_GET_SIZE(unicode))
1184-
size = PyUnicode_GET_SIZE(unicode) + 1;
1185-
1186-
unicode_aswidechar(unicode, w, size);
1187-
1188-
if (size > PyUnicode_GET_SIZE(unicode))
1189-
return PyUnicode_GET_SIZE(unicode);
1190-
else
1191-
return size;
1274+
return unicode_aswidechar(unicode, w, size);
11921275
}
11931276

11941277
wchar_t*
@@ -1203,20 +1286,20 @@ PyUnicode_AsWideCharString(PyUnicodeObject *unicode,
12031286
return NULL;
12041287
}
12051288

1206-
if ((PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) < PyUnicode_GET_SIZE(unicode)) {
1289+
buflen = unicode_aswidechar(unicode, NULL, 0);
1290+
if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
12071291
PyErr_NoMemory();
12081292
return NULL;
12091293
}
12101294

1211-
buflen = PyUnicode_GET_SIZE(unicode) + 1; /* copy L'\0' */
12121295
buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
12131296
if (buffer == NULL) {
12141297
PyErr_NoMemory();
12151298
return NULL;
12161299
}
1217-
unicode_aswidechar(unicode, buffer, buflen);
1218-
if (size)
1219-
*size = buflen - 1;
1300+
buflen = unicode_aswidechar(unicode, buffer, buflen);
1301+
if (size != NULL)
1302+
*size = buflen;
12201303
return buffer;
12211304
}
12221305

0 commit comments

Comments
 (0)