@@ -2921,6 +2921,83 @@ PyUnicode_FromFormat(const char *format, ...)
29212921 return ret ;
29222922}
29232923
2924+ static Py_ssize_t
2925+ unicode_get_widechar_size (PyObject * unicode )
2926+ {
2927+ Py_ssize_t res ;
2928+
2929+ assert (unicode != NULL );
2930+ assert (_PyUnicode_CHECK (unicode ));
2931+
2932+ if (_PyUnicode_WSTR (unicode ) != NULL ) {
2933+ return PyUnicode_WSTR_LENGTH (unicode );
2934+ }
2935+ assert (PyUnicode_IS_READY (unicode ));
2936+
2937+ res = _PyUnicode_LENGTH (unicode );
2938+ #if SIZEOF_WCHAR_T == 2
2939+ if (PyUnicode_KIND (unicode ) == PyUnicode_4BYTE_KIND ) {
2940+ const Py_UCS4 * s = PyUnicode_4BYTE_DATA (unicode );
2941+ const Py_UCS4 * end = s + res ;
2942+ for (; s < end ; ++ s ) {
2943+ if (* s > 0xFFFF ) {
2944+ ++ res ;
2945+ }
2946+ }
2947+ }
2948+ #endif
2949+ return res ;
2950+ }
2951+
2952+ static void
2953+ unicode_copy_as_widechar (PyObject * unicode , wchar_t * w , Py_ssize_t size )
2954+ {
2955+ const wchar_t * wstr ;
2956+
2957+ assert (unicode != NULL );
2958+ assert (_PyUnicode_CHECK (unicode ));
2959+
2960+ wstr = _PyUnicode_WSTR (unicode );
2961+ if (wstr != NULL ) {
2962+ memcpy (w , wstr , size * sizeof (wchar_t ));
2963+ return ;
2964+ }
2965+ assert (PyUnicode_IS_READY (unicode ));
2966+
2967+ if (PyUnicode_KIND (unicode ) == PyUnicode_1BYTE_KIND ) {
2968+ const Py_UCS1 * s = PyUnicode_1BYTE_DATA (unicode );
2969+ for (; size -- ; ++ s , ++ w ) {
2970+ * w = * s ;
2971+ }
2972+ }
2973+ else {
2974+ #if SIZEOF_WCHAR_T == 4
2975+ assert (PyUnicode_KIND (unicode ) == PyUnicode_2BYTE_KIND );
2976+ const Py_UCS2 * s = PyUnicode_2BYTE_DATA (unicode );
2977+ for (; size -- ; ++ s , ++ w ) {
2978+ * w = * s ;
2979+ }
2980+ #else
2981+ assert (PyUnicode_KIND (unicode ) == PyUnicode_4BYTE_KIND );
2982+ const Py_UCS4 * s = PyUnicode_4BYTE_DATA (unicode );
2983+ for (; size -- ; ++ s , ++ w ) {
2984+ Py_UCS4 ch = * s ;
2985+ if (ch > 0xFFFF ) {
2986+ assert (ch <= MAX_UNICODE );
2987+ /* encode surrogate pair in this case */
2988+ * w ++ = Py_UNICODE_HIGH_SURROGATE (ch );
2989+ if (!size -- )
2990+ break ;
2991+ * w = Py_UNICODE_LOW_SURROGATE (ch );
2992+ }
2993+ else {
2994+ * w = ch ;
2995+ }
2996+ }
2997+ #endif
2998+ }
2999+ }
3000+
29243001#ifdef HAVE_WCHAR_H
29253002
29263003/* Convert a Unicode object to a wide character string.
@@ -2937,59 +3014,63 @@ PyUnicode_AsWideChar(PyObject *unicode,
29373014 Py_ssize_t size )
29383015{
29393016 Py_ssize_t res ;
2940- const wchar_t * wstr ;
29413017
29423018 if (unicode == NULL ) {
29433019 PyErr_BadInternalCall ();
29443020 return -1 ;
29453021 }
2946- wstr = PyUnicode_AsUnicodeAndSize ( unicode , & res );
2947- if ( wstr == NULL )
3022+ if (! PyUnicode_Check ( unicode )) {
3023+ PyErr_BadArgument ();
29483024 return -1 ;
2949-
2950- if (w != NULL ) {
2951- if (size > res )
2952- size = res + 1 ;
2953- else
2954- res = size ;
2955- memcpy (w , wstr , size * sizeof (wchar_t ));
2956- return res ;
29573025 }
2958- else
3026+
3027+ res = unicode_get_widechar_size (unicode );
3028+ if (w == NULL ) {
29593029 return res + 1 ;
3030+ }
3031+
3032+ if (size > res ) {
3033+ size = res + 1 ;
3034+ }
3035+ else {
3036+ res = size ;
3037+ }
3038+ unicode_copy_as_widechar (unicode , w , size );
3039+ return res ;
29603040}
29613041
/* PyUnicode_AsWideCharString, shown here as a diff hunk: a line number
   followed by '-' marks text being removed, one followed by '+' marks its
   replacement, and two fused numbers mark an unchanged line.

   Resulting behaviour: returns a buffer allocated with PyMem_NEW holding the
   wide-character form of `unicode`, copied together with one extra trailing
   unit.  When `size` is not NULL the length (without that extra unit) is
   stored in *size.  When `size` is NULL, a buffer whose wcslen() differs from
   that length is freed and ValueError("embedded null character") is raised.

   Returns NULL with an exception set when `unicode` is NULL
   (PyErr_BadInternalCall), fails PyUnicode_Check() (PyErr_BadArgument), or
   the allocation fails (PyErr_NoMemory).

   NOTE(review): the error path releases the buffer with PyMem_FREE, so the
   caller presumably owns the result and frees it the same way -- confirm
   against the public API documentation. */
29623042wchar_t *
29633043PyUnicode_AsWideCharString (PyObject * unicode ,
29643044 Py_ssize_t * size )
29653045{
2966- const wchar_t * wstr ;
29673046 wchar_t * buffer ;
29683047 Py_ssize_t buflen ;
29693048
29703049 if (unicode == NULL ) {
29713050 PyErr_BadInternalCall ();
29723051 return NULL ;
29733052 }
/* Removed: the old code obtained the text through PyUnicode_AsUnicodeAndSize()
   and checked for an embedded null before allocating.  The replacement does
   its own type check instead. */
2974-
2975- wstr = PyUnicode_AsUnicodeAndSize (unicode , & buflen );
2976- if (wstr == NULL ) {
2977- return NULL ;
2978- }
2979- if (size == NULL && wcslen (wstr ) != (size_t )buflen ) {
2980- PyErr_SetString (PyExc_ValueError ,
2981- "embedded null character" );
3053+ if (!PyUnicode_Check (unicode )) {
3054+ PyErr_BadArgument ();
29823055 return NULL ;
29833056 }
29843057
/* Replacement: size the buffer with unicode_get_widechar_size() and reserve
   one extra unit beyond that length. */
2985- buffer = PyMem_NEW (wchar_t , buflen + 1 );
3058+ buflen = unicode_get_widechar_size (unicode );
3059+ buffer = (wchar_t * ) PyMem_NEW (wchar_t , (buflen + 1 ));
29863060 if (buffer == NULL ) {
29873061 PyErr_NoMemory ();
29883062 return NULL ;
29893063 }
/* Replacement: fill the buffer with unicode_copy_as_widechar() (buflen + 1
   units); the embedded-null check now runs on the copy, so the buffer must be
   freed before reporting the error. */
2990- memcpy ( buffer , wstr , ( buflen + 1 ) * sizeof ( wchar_t ) );
2991- if (size != NULL )
3064+ unicode_copy_as_widechar ( unicode , buffer , buflen + 1 );
3065+ if (size != NULL ) {
29923066 * size = buflen ;
3067+ }
3068+ else if (wcslen (buffer ) != (size_t )buflen ) {
3069+ PyMem_FREE (buffer );
3070+ PyErr_SetString (PyExc_ValueError ,
3071+ "embedded null character" );
3072+ return NULL ;
3073+ }
29933074 return buffer ;
29943075}
29953076
@@ -3781,118 +3862,35 @@ PyUnicode_AsUTF8(PyObject *unicode)
/* PyUnicode_AsUnicodeAndSize, shown here as a diff hunk: a line number
   followed by '-' marks text being removed, one followed by '+' marks its
   replacement, and two fused numbers mark an unchanged line.

   Resulting behaviour: returns the object's wstr buffer.  If the object has
   none yet, a buffer is allocated with PyObject_MALLOC, filled by
   unicode_copy_as_widechar() and stored on the object before being returned.
   When `size` is not NULL, PyUnicode_WSTR_LENGTH(unicode) is stored in *size.

   Returns NULL with an exception set when `unicode` fails PyUnicode_Check()
   (PyErr_BadArgument) or when the buffer is too large or cannot be allocated
   (PyErr_NoMemory). */
37813862Py_UNICODE *
37823863PyUnicode_AsUnicodeAndSize (PyObject * unicode , Py_ssize_t * size )
37833864{
/* Removed: cursor variables that only the old inline conversion loops used. */
3784- const unsigned char * one_byte ;
3785- #if SIZEOF_WCHAR_T == 4
3786- const Py_UCS2 * two_bytes ;
3787- #else
3788- const Py_UCS4 * four_bytes ;
3789- const Py_UCS4 * ucs4_end ;
3790- Py_ssize_t num_surrogates ;
3791- #endif
3792- wchar_t * w ;
3793- wchar_t * wchar_end ;
3794-
37953865 if (!PyUnicode_Check (unicode )) {
37963866 PyErr_BadArgument ();
37973867 return NULL ;
37983868 }
3799- if (_PyUnicode_WSTR (unicode ) == NULL ) {
3869+ Py_UNICODE * w = _PyUnicode_WSTR (unicode );
3870+ if (w == NULL ) {
38003871 /* Non-ASCII compact unicode object */
3801- assert (_PyUnicode_KIND (unicode ) != 0 );
3872+ assert (_PyUnicode_KIND (unicode ) != PyUnicode_WCHAR_KIND );
38023873 assert (PyUnicode_IS_READY (unicode ));
38033874
/* Removed (down to the matching '-' lines below): hand-written conversion,
   with separate branches per string kind and per SIZEOF_WCHAR_T. */
3804- if (PyUnicode_KIND (unicode ) == PyUnicode_4BYTE_KIND ) {
3805- #if SIZEOF_WCHAR_T == 2
3806- four_bytes = PyUnicode_4BYTE_DATA (unicode );
3807- ucs4_end = four_bytes + _PyUnicode_LENGTH (unicode );
3808- num_surrogates = 0 ;
3809-
3810- for (; four_bytes < ucs4_end ; ++ four_bytes ) {
3811- if (* four_bytes > 0xFFFF )
3812- ++ num_surrogates ;
3813- }
3814-
3815- _PyUnicode_WSTR (unicode ) = (wchar_t * ) PyObject_MALLOC (
3816- sizeof (wchar_t ) * (_PyUnicode_LENGTH (unicode ) + 1 + num_surrogates ));
3817- if (!_PyUnicode_WSTR (unicode )) {
3818- PyErr_NoMemory ();
3819- return NULL ;
3820- }
3821- _PyUnicode_WSTR_LENGTH (unicode ) = _PyUnicode_LENGTH (unicode ) + num_surrogates ;
3822-
3823- w = _PyUnicode_WSTR (unicode );
3824- wchar_end = w + _PyUnicode_WSTR_LENGTH (unicode );
3825- four_bytes = PyUnicode_4BYTE_DATA (unicode );
3826- for (; four_bytes < ucs4_end ; ++ four_bytes , ++ w ) {
3827- if (* four_bytes > 0xFFFF ) {
3828- assert (* four_bytes <= MAX_UNICODE );
3829- /* encode surrogate pair in this case */
3830- * w ++ = Py_UNICODE_HIGH_SURROGATE (* four_bytes );
3831- * w = Py_UNICODE_LOW_SURROGATE (* four_bytes );
3832- }
3833- else
3834- * w = * four_bytes ;
3835-
3836- if (w > wchar_end ) {
3837- Py_UNREACHABLE ();
3838- }
3839- }
3840- * w = 0 ;
3841- #else
3842- /* sizeof(wchar_t) == 4 */
3843- Py_FatalError ("Impossible unicode object state, wstr and str "
3844- "should share memory already." );
/* Replacement: take the unit count from unicode_get_widechar_size() and
   refuse sizes for which (wlen + 1) * sizeof(wchar_t) would exceed
   PY_SSIZE_T_MAX. */
3875+ Py_ssize_t wlen = unicode_get_widechar_size (unicode );
3876+ if ((size_t )wlen > PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) {
3877+ PyErr_NoMemory ();
38453878 return NULL ;
3846- #endif
38473879 }
3848- else {
3849- if ((size_t )_PyUnicode_LENGTH (unicode ) >
3850- PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) {
3851- PyErr_NoMemory ();
3852- return NULL ;
3853- }
3854- _PyUnicode_WSTR (unicode ) = (wchar_t * ) PyObject_MALLOC (sizeof (wchar_t ) *
3855- (_PyUnicode_LENGTH (unicode ) + 1 ));
3856- if (!_PyUnicode_WSTR (unicode )) {
3857- PyErr_NoMemory ();
3858- return NULL ;
3859- }
3860- if (!PyUnicode_IS_COMPACT_ASCII (unicode ))
3861- _PyUnicode_WSTR_LENGTH (unicode ) = _PyUnicode_LENGTH (unicode );
3862- w = _PyUnicode_WSTR (unicode );
3863- wchar_end = w + _PyUnicode_LENGTH (unicode );
3864-
3865- if (PyUnicode_KIND (unicode ) == PyUnicode_1BYTE_KIND ) {
3866- one_byte = PyUnicode_1BYTE_DATA (unicode );
3867- for (; w < wchar_end ; ++ one_byte , ++ w )
3868- * w = * one_byte ;
3869- /* null-terminate the wstr */
3870- * w = 0 ;
3871- }
3872- else if (PyUnicode_KIND (unicode ) == PyUnicode_2BYTE_KIND ) {
3873- #if SIZEOF_WCHAR_T == 4
3874- two_bytes = PyUnicode_2BYTE_DATA (unicode );
3875- for (; w < wchar_end ; ++ two_bytes , ++ w )
3876- * w = * two_bytes ;
3877- /* null-terminate the wstr */
3878- * w = 0 ;
3879- #else
3880- /* sizeof(wchar_t) == 2 */
3881- PyObject_FREE (_PyUnicode_WSTR (unicode ));
3882- _PyUnicode_WSTR (unicode ) = NULL ;
3883- Py_FatalError ("Impossible unicode object state, wstr "
3884- "and str should share memory already." );
3885- return NULL ;
3886- #endif
3887- }
3888- else {
3889- Py_UNREACHABLE ();
3890- }
/* Replacement: allocate wlen + 1 units, fill the buffer, and only then store
   it on the object.  The wstr length is recorded only for objects that are
   not compact ASCII -- presumably because those have no separate wstr length
   field; confirm against the object layout. */
3880+ w = (wchar_t * ) PyObject_MALLOC (sizeof (wchar_t ) * (wlen + 1 ));
3881+ if (w == NULL ) {
3882+ PyErr_NoMemory ();
3883+ return NULL ;
3884+ }
3885+ unicode_copy_as_widechar (unicode , w , wlen + 1 );
3886+ _PyUnicode_WSTR (unicode ) = w ;
3887+ if (!PyUnicode_IS_COMPACT_ASCII (unicode )) {
3888+ _PyUnicode_WSTR_LENGTH (unicode ) = wlen ;
38913889 }
38923890 }
38933891 if (size != NULL )
38943892 * size = PyUnicode_WSTR_LENGTH (unicode );
3895- return _PyUnicode_WSTR ( unicode ) ;
3893+ return w ;
38963894}
38973895
38983896Py_UNICODE *
0 commit comments