@@ -4750,35 +4750,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47504750
47514751
47524752static int
4753- unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4754- const char * s , Py_ssize_t size ,
4755- _Py_error_handler error_handler , const char * errors ,
4756- Py_ssize_t * consumed )
4753+ unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
4754+ const char * starts , const char * s , const char * end ,
4755+ _Py_error_handler error_handler ,
4756+ const char * errors ,
4757+ Py_ssize_t * consumed )
47574758{
4758- const char * starts = s ;
4759- const char * end = s + size ;
4760-
4761- // fast path: try ASCII string.
4762- if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4763- return -1 ;
4764- }
4765-
4766- Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4767- if (writer -> kind == PyUnicode_1BYTE_KIND
4768- && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4769- {
4770- Py_ssize_t decoded = ascii_decode (s , end , dest );
4771- writer -> pos += decoded ;
4772-
4773- if (decoded == size ) {
4774- if (consumed ) {
4775- * consumed = size ;
4776- }
4777- return 0 ;
4778- }
4779- s += decoded ;
4780- }
4781-
47824759 Py_ssize_t startinpos , endinpos ;
47834760 const char * errmsg = "" ;
47844761 PyObject * error_handler_obj = NULL ;
@@ -4828,6 +4805,8 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
48284805 endinpos = startinpos + ch - 1 ;
48294806 break ;
48304807 default :
4808+ // ch doesn't fit into kind, so change the buffer kind to write
4809+ // the character
48314810 if (_PyUnicodeWriter_WriteCharInline (writer , ch ) < 0 )
48324811 goto onError ;
48334812 continue ;
@@ -4899,8 +4878,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48994878 Py_ssize_t * consumed )
49004879{
49014880 if (size == 0 ) {
4902- if (consumed )
4881+ if (consumed ) {
49034882 * consumed = 0 ;
4883+ }
49044884 _Py_RETURN_UNICODE_EMPTY ();
49054885 }
49064886
@@ -4912,19 +4892,81 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
49124892 return get_latin1_char ((unsigned char )s [0 ]);
49134893 }
49144894
4895+ // fast path: try ASCII string.
4896+ const char * starts = s ;
4897+ const char * end = s + size ;
4898+ PyObject * u = PyUnicode_New (size , 127 );
4899+ if (u == NULL ) {
4900+ return NULL ;
4901+ }
4902+ Py_ssize_t decoded = ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4903+ if (decoded == size ) {
4904+ if (consumed ) {
4905+ * consumed = size ;
4906+ }
4907+ return u ;
4908+ }
4909+ s += decoded ;
4910+ size -= decoded ;
4911+
4912+ // Use _PyUnicodeWriter after the fast path has failed.
49154913 _PyUnicodeWriter writer ;
4916- _PyUnicodeWriter_Init (& writer );
4914+ _PyUnicodeWriter_InitWithBuffer (& writer , u );
4915+ writer .pos = decoded ;
49174916
4918- if (unicode_decode_utf8_writer (& writer , s , size ,
4919- error_handler , errors ,
4920- consumed ) < 0 ) {
4917+ if (unicode_decode_utf8_impl (& writer , starts , s , end ,
4918+ error_handler , errors ,
4919+ consumed ) < 0 ) {
49214920 _PyUnicodeWriter_Dealloc (& writer );
49224921 return NULL ;
49234922 }
49244923 return _PyUnicodeWriter_Finish (& writer );
49254924}
49264925
49274926
4927+ static int
4928+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4929+ const char * s , Py_ssize_t size ,
4930+ _Py_error_handler error_handler , const char * errors ,
4931+ Py_ssize_t * consumed )
4932+ {
4933+ if (size == 0 ) {
4934+ if (consumed ) {
4935+ * consumed = 0 ;
4936+ }
4937+ return 0 ;
4938+ }
4939+
4940+ // fast path: try ASCII string.
4941+ if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4942+ return -1 ;
4943+ }
4944+
4945+ const char * starts = s ;
4946+ const char * end = s + size ;
4947+ Py_ssize_t decoded = 0 ;
4948+ Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4949+ if (writer -> kind == PyUnicode_1BYTE_KIND
4950+ && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4951+ {
4952+ decoded = ascii_decode (s , end , dest );
4953+ writer -> pos += decoded ;
4954+
4955+ if (decoded == size ) {
4956+ if (consumed ) {
4957+ * consumed = size ;
4958+ }
4959+ return 0 ;
4960+ }
4961+ s += decoded ;
4962+ size -= decoded ;
4963+ }
4964+
4965+ return unicode_decode_utf8_impl (writer , starts , s , end ,
4966+ error_handler , errors , consumed );
4967+ }
4968+
4969+
49284970PyObject *
49294971PyUnicode_DecodeUTF8Stateful (const char * s ,
49304972 Py_ssize_t size ,
0 commit comments