@@ -606,13 +606,13 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
606606}
607607#endif
608608
609- int
609+ Py_ssize_t
610610PyUnicode_CopyCharacters (PyObject * to , Py_ssize_t to_start ,
611611 PyObject * from , Py_ssize_t from_start ,
612612 Py_ssize_t how_many )
613613{
614- int from_kind ;
615- int to_kind ;
614+ unsigned int from_kind ;
615+ unsigned int to_kind ;
616616
617617 assert (PyUnicode_Check (from ));
618618 assert (PyUnicode_Check (to ));
@@ -622,94 +622,89 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
622622 if (PyUnicode_READY (to ))
623623 return -1 ;
624624
625+ how_many = PY_MIN (PyUnicode_GET_LENGTH (from ), how_many );
626+ if (to_start + how_many > PyUnicode_GET_LENGTH (to )) {
627+ PyErr_Format (PyExc_ValueError ,
628+ "Cannot write %zi characters at %zi "
629+ "in a string of %zi characters" ,
630+ how_many , to_start , PyUnicode_GET_LENGTH (to ));
631+ return -1 ;
632+ }
633+
625634 from_kind = PyUnicode_KIND (from );
626635 to_kind = PyUnicode_KIND (to );
627636
628637 if (from_kind == to_kind ) {
629- const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE (to );
630- Py_MEMCPY (PyUnicode_1BYTE_DATA (to ) + (to_start * char_size ),
631- PyUnicode_1BYTE_DATA (from ) + (from_start * char_size ),
632- how_many * char_size );
633- return 0 ;
634- }
638+ /* fast path */
639+ Py_MEMCPY ((char * )PyUnicode_DATA (to )
640+ + PyUnicode_KIND_SIZE (to_kind , to_start ),
641+ (char * )PyUnicode_DATA (from )
642+ + PyUnicode_KIND_SIZE (from_kind , from_start ),
643+ PyUnicode_KIND_SIZE (to_kind , how_many ));
644+ return how_many ;
645+ }
646+
647+ if (from_kind > to_kind ) {
648+ /* slow path to check for character overflow */
649+ const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE (to );
650+ void * from_data = PyUnicode_DATA (from );
651+ void * to_data = PyUnicode_DATA (to );
652+ Py_UCS4 ch , maxchar ;
653+ Py_ssize_t i ;
654+ int overflow ;
635655
636- switch (from_kind ) {
637- case PyUnicode_1BYTE_KIND :
638- switch (to_kind ) {
639- case PyUnicode_2BYTE_KIND :
640- _PyUnicode_CONVERT_BYTES (
641- unsigned char , Py_UCS2 ,
642- PyUnicode_1BYTE_DATA (from ) + from_start ,
643- PyUnicode_1BYTE_DATA (from ) + from_start + how_many ,
644- PyUnicode_2BYTE_DATA (to ) + to_start
645- );
646- break ;
647- case PyUnicode_4BYTE_KIND :
648- _PyUnicode_CONVERT_BYTES (
649- unsigned char , Py_UCS4 ,
650- PyUnicode_1BYTE_DATA (from ) + from_start ,
651- PyUnicode_1BYTE_DATA (from ) + from_start + how_many ,
652- PyUnicode_4BYTE_DATA (to ) + to_start
653- );
654- break ;
655- default :
656- goto invalid_state ;
657- }
658- break ;
659- case PyUnicode_2BYTE_KIND :
660- switch (to_kind ) {
661- case PyUnicode_1BYTE_KIND :
662- _PyUnicode_CONVERT_BYTES (
663- Py_UCS2 , unsigned char ,
664- PyUnicode_2BYTE_DATA (from ) + from_start ,
665- PyUnicode_2BYTE_DATA (from ) + from_start + how_many ,
666- PyUnicode_1BYTE_DATA (to ) + to_start
667- );
668- break ;
669- case PyUnicode_4BYTE_KIND :
670- _PyUnicode_CONVERT_BYTES (
671- Py_UCS2 , Py_UCS4 ,
672- PyUnicode_2BYTE_DATA (from ) + from_start ,
673- PyUnicode_2BYTE_DATA (from ) + from_start + how_many ,
674- PyUnicode_4BYTE_DATA (to ) + to_start
675- );
676- break ;
677- default :
678- goto invalid_state ;
679- }
680- break ;
681- case PyUnicode_4BYTE_KIND :
682- switch (to_kind ) {
683- case PyUnicode_1BYTE_KIND :
684- _PyUnicode_CONVERT_BYTES (
685- Py_UCS4 , unsigned char ,
686- PyUnicode_4BYTE_DATA (from ) + from_start ,
687- PyUnicode_4BYTE_DATA (from ) + from_start + how_many ,
688- PyUnicode_1BYTE_DATA (to ) + to_start
689- );
690- break ;
691- case PyUnicode_2BYTE_KIND :
692- _PyUnicode_CONVERT_BYTES (
693- Py_UCS4 , Py_UCS2 ,
694- PyUnicode_4BYTE_DATA (from ) + from_start ,
695- PyUnicode_4BYTE_DATA (from ) + from_start + how_many ,
696- PyUnicode_2BYTE_DATA (to ) + to_start
697- );
656+ maxchar = 0 ;
657+ for (i = 0 ; i < how_many ; i ++ ) {
658+ ch = PyUnicode_READ (from_kind , from_data , from_start + i );
659+ if (ch > maxchar ) {
660+ maxchar = ch ;
661+ if (maxchar > to_maxchar ) {
662+ overflow = 1 ;
698663 break ;
699- default :
700- goto invalid_state ;
664+ }
701665 }
702- break ;
703- default :
704- goto invalid_state ;
666+ PyUnicode_WRITE (to_kind , to_data , to_start + i , ch );
667+ }
668+ if (!overflow )
669+ return how_many ;
670+ }
671+ else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND )
672+ {
673+ _PyUnicode_CONVERT_BYTES (
674+ Py_UCS1 , Py_UCS2 ,
675+ PyUnicode_1BYTE_DATA (from ) + from_start ,
676+ PyUnicode_1BYTE_DATA (from ) + from_start + how_many ,
677+ PyUnicode_2BYTE_DATA (to ) + to_start
678+ );
679+ return how_many ;
680+ }
681+ else if (from_kind == PyUnicode_1BYTE_KIND
682+ && to_kind == PyUnicode_4BYTE_KIND )
683+ {
684+ _PyUnicode_CONVERT_BYTES (
685+ Py_UCS1 , Py_UCS4 ,
686+ PyUnicode_1BYTE_DATA (from ) + from_start ,
687+ PyUnicode_1BYTE_DATA (from ) + from_start + how_many ,
688+ PyUnicode_4BYTE_DATA (to ) + to_start
689+ );
690+ return how_many ;
691+ }
692+ else if (from_kind == PyUnicode_2BYTE_KIND
693+ && to_kind == PyUnicode_4BYTE_KIND )
694+ {
695+ _PyUnicode_CONVERT_BYTES (
696+ Py_UCS2 , Py_UCS4 ,
697+ PyUnicode_2BYTE_DATA (from ) + from_start ,
698+ PyUnicode_2BYTE_DATA (from ) + from_start + how_many ,
699+ PyUnicode_4BYTE_DATA (to ) + to_start
700+ );
701+ return how_many ;
705702 }
706- return 0 ;
707-
708- invalid_state :
709703 PyErr_Format (PyExc_ValueError ,
710- "Impossible kind state (from=%i, to=%i) "
711- "in PyUnicode_CopyCharacters" ,
712- from_kind , to_kind );
704+ "Cannot copy UCS%u characters "
705+ "into a string of UCS%u characters" ,
706+ 1 << (from_kind - 1 ),
707+ 1 << (to_kind - 1 ));
713708 return -1 ;
714709}
715710
0 commit comments