@@ -43,18 +43,90 @@ PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
4343 }
4444}
4545
46- PyObject * CPyStr_Build (int len , ...) {
47- int i ;
46+ // A simplification of _PyUnicode_JoinArray() from CPython 3.9.6
47+ PyObject * CPyStr_Build (Py_ssize_t len , ...) {
48+ Py_ssize_t i ;
4849 va_list args ;
50+
51+ // Calculate the total amount of space and check
52+ // whether all components have the same kind.
53+ Py_ssize_t sz = 0 ;
54+ Py_UCS4 maxchar = 0 ;
55+ int use_memcpy = 1 ; // Use memcpy by default
56+ PyObject * last_obj = NULL ;
57+
4958 va_start (args , len );
59+ for (i = 0 ; i < len ; i ++ ) {
60+ PyObject * item = va_arg (args , PyObject * );
61+ if (!PyUnicode_Check (item )) {
62+ PyErr_Format (PyExc_TypeError ,
63+ "sequence item %zd: expected str instance,"
64+ " %.80s found" ,
65+ i , Py_TYPE (item )-> tp_name );
66+ return NULL ;
67+ }
68+ if (PyUnicode_READY (item ) == -1 )
69+ return NULL ;
5070
51- PyObject * res = PyUnicode_FromObject (va_arg (args , PyObject * ));
52- for (i = 1 ; i < len ; i ++ ) {
53- PyObject * str = va_arg (args , PyObject * );
54- PyUnicode_Append (& res , str );
55- }
71+ size_t add_sz = PyUnicode_GET_LENGTH (item );
72+ Py_UCS4 item_maxchar = PyUnicode_MAX_CHAR_VALUE (item );
73+ maxchar = Py_MAX (maxchar , item_maxchar );
5674
75+ // Using size_t to avoid overflow during arithmetic calculation
76+ if (add_sz > (size_t )(PY_SSIZE_T_MAX - sz )) {
77+ PyErr_SetString (PyExc_OverflowError ,
78+ "join() result is too long for a Python string" );
79+ return NULL ;
80+ }
81+ sz += add_sz ;
82+
83+ // If these strings have different kind, we would call
84+ // _PyUnicode_FastCopyCharacters() in the following part.
85+ if (use_memcpy && last_obj != NULL ) {
86+ if (PyUnicode_KIND (last_obj ) != PyUnicode_KIND (item ))
87+ use_memcpy = 0 ;
88+ }
89+ last_obj = item ;
90+ }
5791 va_end (args );
92+
93+ // Construct the string
94+ PyObject * res = PyUnicode_New (sz , maxchar );
95+ if (res == NULL )
96+ return NULL ;
97+
98+ if (use_memcpy ) {
99+ unsigned char * res_data = PyUnicode_1BYTE_DATA (res );
100+ unsigned int kind = PyUnicode_KIND (res );
101+
102+ va_start (args , len );
103+ for (i = 0 ; i < len ; ++ i ) {
104+ PyObject * item = va_arg (args , PyObject * );
105+ Py_ssize_t itemlen = PyUnicode_GET_LENGTH (item );
106+ if (itemlen != 0 ) {
107+ memcpy (res_data , PyUnicode_DATA (item ), kind * itemlen );
108+ res_data += kind * itemlen ;
109+ }
110+ }
111+ va_end (args );
112+ assert (res_data == PyUnicode_1BYTE_DATA (res ) + kind * PyUnicode_GET_LENGTH (res ));
113+ } else {
114+ Py_ssize_t res_offset = 0 ;
115+
116+ va_start (args , len );
117+ for (i = 0 ; i < len ; ++ i ) {
118+ PyObject * item = va_arg (args , PyObject * );
119+ Py_ssize_t itemlen = PyUnicode_GET_LENGTH (item );
120+ if (itemlen != 0 ) {
121+ _PyUnicode_FastCopyCharacters (res , res_offset , item , 0 , itemlen );
122+ res_offset += itemlen ;
123+ }
124+ }
125+ va_end (args );
126+ assert (res_offset == PyUnicode_GET_LENGTH (res ));
127+ }
128+
129+ assert (_PyUnicode_CheckConsistency (res , 1 ));
58130 return res ;
59131}
60132
0 commit comments