@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
427427{
428428 PyUnicodeObject * unicode ;
429429 /* If the Unicode data is known at construction time, we can apply
430- some optimizations which share commonly used objects. */
430+ some optimizations which share commonly used objects.
431+ Also, this means the input must be UTF-8, so fall back to the
432+ UTF-8 decoder at the end. */
431433 if (u != NULL ) {
432434
433435 /* Optimization for empty strings */
@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
436438 return (PyObject * )unicode_empty ;
437439 }
438440
439- /* Single characters are shared when using this constructor */
440- if (size == 1 ) {
441+ /* Single characters are shared when using this constructor.
442+ Restrict to ASCII, since the input must be UTF-8. */
443+ if (size == 1 && Py_CHARMASK (* u ) < 128 ) {
441444 unicode = unicode_latin1 [Py_CHARMASK (* u )];
442445 if (!unicode ) {
443446 unicode = _PyUnicode_New (1 );
@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
449452 Py_INCREF (unicode );
450453 return (PyObject * )unicode ;
451454 }
455+
456+ return PyUnicode_DecodeUTF8 (u , size , NULL );
452457 }
453458
454459 unicode = _PyUnicode_New (size );
455460 if (!unicode )
456461 return NULL ;
457462
458- /* Copy the Unicode data into the new object */
459- if (u != NULL ) {
460- Py_UNICODE * p = unicode -> str ;
461- while (size -- )
462- * p ++ = Py_CHARMASK (* u ++ );
463- /* Don't need to write trailing 0 because
464- that's already done by _PyUnicode_New */
465- }
466-
467463 return (PyObject * )unicode ;
468464}
469465
0 commit comments