@@ -3991,11 +3991,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
39913991}
39923992
39933993
3994+ static int unicode_fill_utf8 (PyObject * unicode );
3995+
39943996const char *
39953997PyUnicode_AsUTF8AndSize (PyObject * unicode , Py_ssize_t * psize )
39963998{
3997- PyObject * bytes ;
3998-
39993999 if (!PyUnicode_Check (unicode )) {
40004000 PyErr_BadArgument ();
40014001 return NULL ;
@@ -4004,21 +4004,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
40044004 return NULL ;
40054005
40064006 if (PyUnicode_UTF8 (unicode ) == NULL ) {
4007- assert (!PyUnicode_IS_COMPACT_ASCII (unicode ));
4008- bytes = _PyUnicode_AsUTF8String (unicode , NULL );
4009- if (bytes == NULL )
4010- return NULL ;
4011- _PyUnicode_UTF8 (unicode ) = PyObject_MALLOC (PyBytes_GET_SIZE (bytes ) + 1 );
4012- if (_PyUnicode_UTF8 (unicode ) == NULL ) {
4013- PyErr_NoMemory ();
4014- Py_DECREF (bytes );
4007+ if (unicode_fill_utf8 (unicode ) == -1 ) {
40154008 return NULL ;
40164009 }
4017- _PyUnicode_UTF8_LENGTH (unicode ) = PyBytes_GET_SIZE (bytes );
4018- memcpy (_PyUnicode_UTF8 (unicode ),
4019- PyBytes_AS_STRING (bytes ),
4020- _PyUnicode_UTF8_LENGTH (unicode ) + 1 );
4021- Py_DECREF (bytes );
40224010 }
40234011
40244012 if (psize )
@@ -5381,10 +5369,6 @@ static PyObject *
53815369unicode_encode_utf8 (PyObject * unicode , _Py_error_handler error_handler ,
53825370 const char * errors )
53835371{
5384- enum PyUnicode_Kind kind ;
5385- void * data ;
5386- Py_ssize_t size ;
5387-
53885372 if (!PyUnicode_Check (unicode )) {
53895373 PyErr_BadArgument ();
53905374 return NULL ;
@@ -5397,22 +5381,86 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
53975381 return PyBytes_FromStringAndSize (PyUnicode_UTF8 (unicode ),
53985382 PyUnicode_UTF8_LENGTH (unicode ));
53995383
5400- kind = PyUnicode_KIND (unicode );
5401- data = PyUnicode_DATA (unicode );
5402- size = PyUnicode_GET_LENGTH (unicode );
5384+ enum PyUnicode_Kind kind = PyUnicode_KIND (unicode );
5385+ void * data = PyUnicode_DATA (unicode );
5386+ Py_ssize_t size = PyUnicode_GET_LENGTH (unicode );
5387+
5388+ _PyBytesWriter writer ;
5389+ char * end ;
54035390
54045391 switch (kind ) {
54055392 default :
54065393 Py_UNREACHABLE ();
54075394 case PyUnicode_1BYTE_KIND :
54085395 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
54095396 assert (!PyUnicode_IS_ASCII (unicode ));
5410- return ucs1lib_utf8_encoder (unicode , data , size , error_handler , errors );
5397+ end = ucs1lib_utf8_encoder (& writer , unicode , data , size , error_handler , errors );
5398+ break ;
5399+ case PyUnicode_2BYTE_KIND :
5400+ end = ucs2lib_utf8_encoder (& writer , unicode , data , size , error_handler , errors );
5401+ break ;
5402+ case PyUnicode_4BYTE_KIND :
5403+ end = ucs4lib_utf8_encoder (& writer , unicode , data , size , error_handler , errors );
5404+ break ;
5405+ }
5406+
5407+ if (end == NULL ) {
5408+ _PyBytesWriter_Dealloc (& writer );
5409+ return NULL ;
5410+ }
5411+ return _PyBytesWriter_Finish (& writer , end );
5412+ }
5413+
5414+ static int
5415+ unicode_fill_utf8 (PyObject * unicode )
5416+ {
5417+ /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5418+ assert (!PyUnicode_IS_ASCII (unicode ));
5419+
5420+ enum PyUnicode_Kind kind = PyUnicode_KIND (unicode );
5421+ void * data = PyUnicode_DATA (unicode );
5422+ Py_ssize_t size = PyUnicode_GET_LENGTH (unicode );
5423+
5424+ _PyBytesWriter writer ;
5425+ char * end ;
5426+
5427+ switch (kind ) {
5428+ default :
5429+ Py_UNREACHABLE ();
5430+ case PyUnicode_1BYTE_KIND :
5431+ end = ucs1lib_utf8_encoder (& writer , unicode , data , size ,
5432+ _Py_ERROR_STRICT , NULL );
5433+ break ;
54115434 case PyUnicode_2BYTE_KIND :
5412- return ucs2lib_utf8_encoder (unicode , data , size , error_handler , errors );
5435+ end = ucs2lib_utf8_encoder (& writer , unicode , data , size ,
5436+ _Py_ERROR_STRICT , NULL );
5437+ break ;
54135438 case PyUnicode_4BYTE_KIND :
5414- return ucs4lib_utf8_encoder (unicode , data , size , error_handler , errors );
5439+ end = ucs4lib_utf8_encoder (& writer , unicode , data , size ,
5440+ _Py_ERROR_STRICT , NULL );
5441+ break ;
5442+ }
5443+ if (end == NULL ) {
5444+ _PyBytesWriter_Dealloc (& writer );
5445+ return -1 ;
5446+ }
5447+
5448+ char * start = writer .use_small_buffer ? writer .small_buffer :
5449+ PyBytes_AS_STRING (writer .buffer );
5450+ Py_ssize_t len = end - start ;
5451+
5452+ char * cache = PyObject_MALLOC (len + 1 );
5453+ if (cache == NULL ) {
5454+ _PyBytesWriter_Dealloc (& writer );
5455+ PyErr_NoMemory ();
5456+ return -1 ;
54155457 }
5458+ _PyUnicode_UTF8 (unicode ) = cache ;
5459+ _PyUnicode_UTF8_LENGTH (unicode ) = len ;
5460+ memcpy (cache , start , len );
5461+ cache [len ] = '\0' ;
5462+ _PyBytesWriter_Dealloc (& writer );
5463+ return 0 ;
54165464}
54175465
54185466PyObject *
0 commit comments