@@ -1153,19 +1153,112 @@ PyUnicode_FromFormat(const char *format, ...)
11531153 return ret ;
11541154}
11551155
1156- static void
1156+ /* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1157+ convert a Unicode object to a wide character string.
1158+
1159+ - If w is NULL: return the number of wide characters (including the nul
1160+ character) required to convert the unicode object. Ignore size argument.
1161+
1162+ - Otherwise: return the number of wide characters (excluding the nul
1163+ character) written into w. Write at most size wide characters (including
1164+ the nul character). */
1165+ static Py_ssize_t
11571166unicode_aswidechar (PyUnicodeObject * unicode ,
11581167 wchar_t * w ,
11591168 Py_ssize_t size )
11601169{
11611170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1162- memcpy (w , unicode -> str , size * sizeof (wchar_t ));
1163- #else
1164- register Py_UNICODE * u ;
1171+ Py_ssize_t res ;
1172+ if (w != NULL ) {
1173+ res = PyUnicode_GET_SIZE (unicode );
1174+ if (size > res )
1175+ size = res + 1 ;
1176+ else
1177+ res = size ;
1178+ memcpy (w , unicode -> str , size * sizeof (wchar_t ));
1179+ return res ;
1180+ }
1181+ else
1182+ return PyUnicode_GET_SIZE (unicode ) + 1 ;
1183+ #elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1184+ register const Py_UNICODE * u ;
1185+ const Py_UNICODE * uend ;
1186+ const wchar_t * worig , * wend ;
1187+ Py_ssize_t nchar ;
1188+
1189+ u = PyUnicode_AS_UNICODE (unicode );
1190+ uend = u + PyUnicode_GET_SIZE (unicode );
1191+ if (w != NULL ) {
1192+ worig = w ;
1193+ wend = w + size ;
1194+ while (u != uend && w != wend ) {
1195+ if (0xD800 <= u [0 ] && u [0 ] <= 0xDBFF
1196+ && 0xDC00 <= u [1 ] && u [1 ] <= 0xDFFF )
1197+ {
1198+ * w = (((u [0 ] & 0x3FF ) << 10 ) | (u [1 ] & 0x3FF )) + 0x10000 ;
1199+ u += 2 ;
1200+ }
1201+ else {
1202+ * w = * u ;
1203+ u ++ ;
1204+ }
1205+ w ++ ;
1206+ }
1207+ if (w != wend )
1208+ * w = L'\0' ;
1209+ return w - worig ;
1210+ }
1211+ else {
1212+ nchar = 1 ; /* nul character at the end */
1213+ while (u != uend ) {
1214+ if (0xD800 <= u [0 ] && u [0 ] <= 0xDBFF
1215+ && 0xDC00 <= u [1 ] && u [1 ] <= 0xDFFF )
1216+ u += 2 ;
1217+ else
1218+ u ++ ;
1219+ nchar ++ ;
1220+ }
1221+ }
1222+ return nchar ;
1223+ #elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1224+ register Py_UNICODE * u , * uend , ordinal ;
11651225 register Py_ssize_t i ;
1226+ wchar_t * worig , * wend ;
1227+ Py_ssize_t nchar ;
1228+
11661229 u = PyUnicode_AS_UNICODE (unicode );
1167- for (i = size ; i > 0 ; i -- )
1168- * w ++ = * u ++ ;
1230+ uend = u + PyUnicode_GET_SIZE (u );
1231+ if (w != NULL ) {
1232+ worig = w ;
1233+ wend = w + size ;
1234+ while (u != uend && w != wend ) {
1235+ ordinal = * u ;
1236+ if (ordinal > 0xffff ) {
1237+ ordinal -= 0x10000 ;
1238+ * w ++ = 0xD800 | (ordinal >> 10 );
1239+ * w ++ = 0xDC00 | (ordinal & 0x3FF );
1240+ }
1241+ else
1242+ * w ++ = ordinal ;
1243+ u ++ ;
1244+ }
1245+ if (w != wend )
1246+ * w = 0 ;
1247+ return w - worig ;
1248+ }
1249+ else {
1250+ nchar = 1 ; /* nul character */
1251+ while (u != uend ) {
1252+ if (* u > 0xffff )
1253+ nchar += 2 ;
1254+ else
1255+ nchar ++ ;
1256+ u ++ ;
1257+ }
1258+ return nchar ;
1259+ }
1260+ #else
1261+ # error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
11691262#endif
11701263}
11711264
@@ -1178,17 +1271,7 @@ PyUnicode_AsWideChar(PyUnicodeObject *unicode,
11781271 PyErr_BadInternalCall ();
11791272 return -1 ;
11801273 }
1181-
1182- /* If possible, try to copy the 0-termination as well */
1183- if (size > PyUnicode_GET_SIZE (unicode ))
1184- size = PyUnicode_GET_SIZE (unicode ) + 1 ;
1185-
1186- unicode_aswidechar (unicode , w , size );
1187-
1188- if (size > PyUnicode_GET_SIZE (unicode ))
1189- return PyUnicode_GET_SIZE (unicode );
1190- else
1191- return size ;
1274+ return unicode_aswidechar (unicode , w , size );
11921275}
11931276
11941277wchar_t *
@@ -1203,20 +1286,20 @@ PyUnicode_AsWideCharString(PyUnicodeObject *unicode,
12031286 return NULL ;
12041287 }
12051288
1206- if ((PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) < PyUnicode_GET_SIZE (unicode )) {
1289+ buflen = unicode_aswidechar (unicode , NULL , 0 );
1290+ if (PY_SSIZE_T_MAX / sizeof (wchar_t ) < buflen ) {
12071291 PyErr_NoMemory ();
12081292 return NULL ;
12091293 }
12101294
1211- buflen = PyUnicode_GET_SIZE (unicode ) + 1 ; /* copy L'\0' */
12121295 buffer = PyMem_MALLOC (buflen * sizeof (wchar_t ));
12131296 if (buffer == NULL ) {
12141297 PyErr_NoMemory ();
12151298 return NULL ;
12161299 }
1217- unicode_aswidechar (unicode , buffer , buflen );
1218- if (size )
1219- * size = buflen - 1 ;
1300+ buflen = unicode_aswidechar (unicode , buffer , buflen );
1301+ if (size != NULL )
1302+ * size = buflen ;
12201303 return buffer ;
12211304}
12221305
0 commit comments