@@ -2001,6 +2001,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
20012001 return PyUnicode_DecodeUTF8Stateful (s , size , errors , NULL );
20022002}
20032003
/* Bit mask selecting the low-order address bits that must all be zero for
   a pointer to sit on a C 'long' boundary.  AND a pointer (cast to size_t)
   with it to test alignment, or with its complement to round the pointer
   down to the previous boundary.
   The expansion is fully parenthesized so that the macro behaves as a
   single operand whatever operator it is written next to. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
2006+
2007+ /* Mask to quickly check whether a C 'long' contains a
2008+ non-ASCII, UTF8-encoded char. */
2009+ #if (SIZEOF_LONG == 8 )
2010+ # define ASCII_CHAR_MASK 0x8080808080808080L
2011+ #elif (SIZEOF_LONG == 4 )
2012+ # define ASCII_CHAR_MASK 0x80808080L
2013+ #else
2014+ # error C 'long' size should be either 4 or 8!
2015+ #endif
2016+
20042017PyObject * PyUnicode_DecodeUTF8Stateful (const char * s ,
20052018 Py_ssize_t size ,
20062019 const char * errors ,
@@ -2011,7 +2024,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
20112024 Py_ssize_t startinpos ;
20122025 Py_ssize_t endinpos ;
20132026 Py_ssize_t outpos ;
2014- const char * e ;
2027+ const char * e , * aligned_end ;
20152028 PyUnicodeObject * unicode ;
20162029 Py_UNICODE * p ;
20172030 const char * errmsg = "" ;
@@ -2032,10 +2045,51 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
20322045 /* Unpack UTF-8 encoded data */
20332046 p = unicode -> str ;
20342047 e = s + size ;
2048+ aligned_end = (const char * ) ((size_t ) e & ~LONG_PTR_MASK );
20352049
20362050 while (s < e ) {
20372051 Py_UCS4 ch = (unsigned char )* s ;
20382052
2053+ if (ch < 0x80 ) {
2054+ /* Fast path for runs of ASCII characters. Given that common UTF-8
2055+ input will consist of an overwhelming majority of ASCII
2056+ characters, we try to optimize for this case by checking
2057+ as many characters as a C 'long' can contain.
2058+ First, check if we can do an aligned read, as most CPUs have
2059+ a penalty for unaligned reads.
2060+ */
2061+ if (!((size_t ) s & LONG_PTR_MASK )) {
2062+ /* Help register allocation */
2063+ register const char * _s = s ;
2064+ register Py_UNICODE * _p = p ;
2065+ while (_s < aligned_end ) {
2066+ /* Read a whole long at a time (either 4 or 8 bytes),
2067+ and do a fast unrolled copy if it only contains ASCII
2068+ characters. */
2069+ unsigned long data = * (unsigned long * ) _s ;
2070+ if (data & ASCII_CHAR_MASK )
2071+ break ;
2072+ _p [0 ] = (unsigned char ) _s [0 ];
2073+ _p [1 ] = (unsigned char ) _s [1 ];
2074+ _p [2 ] = (unsigned char ) _s [2 ];
2075+ _p [3 ] = (unsigned char ) _s [3 ];
2076+ #if (SIZEOF_LONG == 8 )
2077+ _p [4 ] = (unsigned char ) _s [4 ];
2078+ _p [5 ] = (unsigned char ) _s [5 ];
2079+ _p [6 ] = (unsigned char ) _s [6 ];
2080+ _p [7 ] = (unsigned char ) _s [7 ];
2081+ #endif
2082+ _s += SIZEOF_LONG ;
2083+ _p += SIZEOF_LONG ;
2084+ }
2085+ s = _s ;
2086+ p = _p ;
2087+ if (s == e )
2088+ break ;
2089+ ch = (unsigned char )* s ;
2090+ }
2091+ }
2092+
20392093 if (ch < 0x80 ) {
20402094 * p ++ = (Py_UNICODE )ch ;
20412095 s ++ ;
@@ -2169,6 +2223,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
21692223 & starts , & e , & startinpos , & endinpos , & exc , & s ,
21702224 & unicode , & outpos , & p ))
21712225 goto onError ;
2226+ aligned_end = (const char * ) ((size_t ) e & ~LONG_PTR_MASK );
21722227 }
21732228 if (consumed )
21742229 * consumed = s - starts ;
@@ -2188,6 +2243,9 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
21882243 return NULL ;
21892244}
21902245
2246+ #undef ASCII_CHAR_MASK
2247+
2248+
21912249/* Allocation strategy: if the string is short, convert into a stack buffer
21922250 and allocate exactly as much space needed at the end. Else allocate the
21932251 maximum possible needed (4 result bytes per Unicode character), and return
@@ -2582,6 +2640,23 @@ PyUnicode_DecodeUTF16(const char *s,
25822640 return PyUnicode_DecodeUTF16Stateful (s , size , errors , byteorder , NULL );
25832641}
25842642
2643+ /* Two masks for fast checking of whether a C 'long' may contain
2644+ UTF16-encoded surrogate characters. This is an efficient heuristic,
2645+ assuming that non-surrogate characters with a code point >= 0x8000 are
2646+ rare in most input.
2647+ FAST_CHAR_MASK is used when the input is in native byte ordering,
2648+ SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
2649+ */
2650+ #if (SIZEOF_LONG == 8 )
2651+ # define FAST_CHAR_MASK 0x8000800080008000L
2652+ # define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2653+ #elif (SIZEOF_LONG == 4 )
2654+ # define FAST_CHAR_MASK 0x80008000L
2655+ # define SWAPPED_FAST_CHAR_MASK 0x00800080L
2656+ #else
2657+ # error C 'long' size should be either 4 or 8!
2658+ #endif
2659+
25852660PyObject *
25862661PyUnicode_DecodeUTF16Stateful (const char * s ,
25872662 Py_ssize_t size ,
@@ -2595,8 +2670,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
25952670 Py_ssize_t outpos ;
25962671 PyUnicodeObject * unicode ;
25972672 Py_UNICODE * p ;
2598- const unsigned char * q , * e ;
2673+ const unsigned char * q , * e , * aligned_end ;
25992674 int bo = 0 ; /* assume native ordering by default */
2675+ int native_ordering = 0 ;
26002676 const char * errmsg = "" ;
26012677 /* Offsets from q for retrieving byte pairs in the right order. */
26022678#ifdef BYTEORDER_IS_LITTLE_ENDIAN
@@ -2618,7 +2694,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
26182694 /* Unpack UTF-16 encoded data */
26192695 p = unicode -> str ;
26202696 q = (unsigned char * )s ;
2621- e = q + size ;
2697+ e = q + size - 1 ;
26222698
26232699 if (byteorder )
26242700 bo = * byteorder ;
@@ -2662,20 +2738,78 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
26622738 ihi = 0 ;
26632739 ilo = 1 ;
26642740 }
2741+ #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742+ native_ordering = ilo < ihi ;
2743+ #else
2744+ native_ordering = ilo > ihi ;
2745+ #endif
26652746
2747+ aligned_end = (const unsigned char * ) ((size_t ) e & ~LONG_PTR_MASK );
26662748 while (q < e ) {
26672749 Py_UNICODE ch ;
2668- /* remaining bytes at the end? (size should be even) */
2669- if (e - q < 2 ) {
2670- if (consumed )
2671- break ;
2672- errmsg = "truncated data" ;
2673- startinpos = ((const char * )q )- starts ;
2674- endinpos = ((const char * )e )- starts ;
2675- goto utf16Error ;
2676- /* The remaining input chars are ignored if the callback
2677- chooses to skip the input */
2678- }
2750+ /* First check for possible aligned read of a C 'long'. Unaligned
2751+ reads are more expensive, better to defer to another iteration. */
2752+ if (!((size_t ) q & LONG_PTR_MASK )) {
2753+ /* Fast path for runs of non-surrogate chars. */
2754+ register const unsigned char * _q = q ;
2755+ Py_UNICODE * _p = p ;
2756+ if (native_ordering ) {
2757+ /* Native ordering is simple: as long as the input cannot
2758+ possibly contain a surrogate char, do an unrolled copy
2759+ of several 16-bit code points to the target object.
2760+ The non-surrogate check is done on several input bytes
2761+ at a time (as many as a C 'long' can contain). */
2762+ while (_q < aligned_end ) {
2763+ unsigned long data = * (unsigned long * ) _q ;
2764+ if (data & FAST_CHAR_MASK )
2765+ break ;
2766+ _p [0 ] = ((unsigned short * ) _q )[0 ];
2767+ _p [1 ] = ((unsigned short * ) _q )[1 ];
2768+ #if (SIZEOF_LONG == 8 )
2769+ _p [2 ] = ((unsigned short * ) _q )[2 ];
2770+ _p [3 ] = ((unsigned short * ) _q )[3 ];
2771+ #endif
2772+ _q += SIZEOF_LONG ;
2773+ _p += SIZEOF_LONG / 2 ;
2774+ }
2775+ }
2776+ else {
2777+ /* Byteswapped ordering is similar, but we must decompose
2778+ the copy bytewise, and take care of zero'ing out the
2779+ upper bytes if the target object is in 32-bit units
2780+ (that is, in UCS-4 builds). */
2781+ while (_q < aligned_end ) {
2782+ unsigned long data = * (unsigned long * ) _q ;
2783+ if (data & SWAPPED_FAST_CHAR_MASK )
2784+ break ;
2785+ /* Zero upper bytes in UCS-4 builds */
2786+ #if (Py_UNICODE_SIZE > 2 )
2787+ _p [0 ] = 0 ;
2788+ _p [1 ] = 0 ;
2789+ #if (SIZEOF_LONG == 8 )
2790+ _p [2 ] = 0 ;
2791+ _p [3 ] = 0 ;
2792+ #endif
2793+ #endif
2794+ ((unsigned char * ) _p )[1 ] = _q [0 ];
2795+ ((unsigned char * ) _p )[0 ] = _q [1 ];
2796+ ((unsigned char * ) _p )[1 + Py_UNICODE_SIZE ] = _q [2 ];
2797+ ((unsigned char * ) _p )[0 + Py_UNICODE_SIZE ] = _q [3 ];
2798+ #if (SIZEOF_LONG == 8 )
2799+ ((unsigned char * ) _p )[1 + 2 * Py_UNICODE_SIZE ] = _q [4 ];
2800+ ((unsigned char * ) _p )[0 + 2 * Py_UNICODE_SIZE ] = _q [5 ];
2801+ ((unsigned char * ) _p )[1 + 3 * Py_UNICODE_SIZE ] = _q [6 ];
2802+ ((unsigned char * ) _p )[0 + 3 * Py_UNICODE_SIZE ] = _q [7 ];
2803+ #endif
2804+ _q += SIZEOF_LONG ;
2805+ _p += SIZEOF_LONG / 2 ;
2806+ }
2807+ }
2808+ p = _p ;
2809+ q = _q ;
2810+ if (q >= e )
2811+ break ;
2812+ }
26792813 ch = (q [ihi ] << 8 ) | q [ilo ];
26802814
26812815 q += 2 ;
@@ -2686,10 +2820,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
26862820 }
26872821
26882822 /* UTF-16 code pair: */
2689- if (q >= e ) {
2823+ if (q > e ) {
26902824 errmsg = "unexpected end of data" ;
2691- startinpos = (((const char * )q )- 2 ) - starts ;
2692- endinpos = ((const char * )e )- starts ;
2825+ startinpos = (((const char * )q ) - 2 ) - starts ;
2826+ endinpos = ((const char * )e ) + 1 - starts ;
26932827 goto utf16Error ;
26942828 }
26952829 if (0xD800 <= ch && ch <= 0xDBFF ) {
@@ -2718,14 +2852,47 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
27182852 /* Fall through to report the error */
27192853
27202854 utf16Error :
2721- outpos = p - PyUnicode_AS_UNICODE (unicode );
2855+ outpos = p - PyUnicode_AS_UNICODE (unicode );
27222856 if (unicode_decode_call_errorhandler (
2723- errors , & errorHandler ,
2724- "utf16" , errmsg ,
2725- & starts , (const char * * )& e , & startinpos , & endinpos , & exc , (const char * * )& q ,
2726- & unicode , & outpos , & p ))
2857+ errors ,
2858+ & errorHandler ,
2859+ "utf16" , errmsg ,
2860+ & starts ,
2861+ (const char * * )& e ,
2862+ & startinpos ,
2863+ & endinpos ,
2864+ & exc ,
2865+ (const char * * )& q ,
2866+ & unicode ,
2867+ & outpos ,
2868+ & p ))
27272869 goto onError ;
27282870 }
2871+ /* remaining byte at the end? (size should be even) */
2872+ if (e == q ) {
2873+ if (!consumed ) {
2874+ errmsg = "truncated data" ;
2875+ startinpos = ((const char * )q ) - starts ;
2876+ endinpos = ((const char * )e ) + 1 - starts ;
2877+ outpos = p - PyUnicode_AS_UNICODE (unicode );
2878+ if (unicode_decode_call_errorhandler (
2879+ errors ,
2880+ & errorHandler ,
2881+ "utf16" , errmsg ,
2882+ & starts ,
2883+ (const char * * )& e ,
2884+ & startinpos ,
2885+ & endinpos ,
2886+ & exc ,
2887+ (const char * * )& q ,
2888+ & unicode ,
2889+ & outpos ,
2890+ & p ))
2891+ goto onError ;
2892+ /* The remaining input chars are ignored if the callback
2893+ chooses to skip the input */
2894+ }
2895+ }
27292896
27302897 if (byteorder )
27312898 * byteorder = bo ;
@@ -2748,6 +2915,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
27482915 return NULL ;
27492916}
27502917
2918+ #undef FAST_CHAR_MASK
2919+ #undef SWAPPED_FAST_CHAR_MASK
2920+
27512921PyObject *
27522922PyUnicode_EncodeUTF16 (const Py_UNICODE * s ,
27532923 Py_ssize_t size ,
@@ -3571,6 +3741,7 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
35713741{
35723742 PyUnicodeObject * v ;
35733743 Py_UNICODE * p ;
3744+ const char * e , * unrolled_end ;
35743745
35753746 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
35763747 if (size == 1 ) {
@@ -3584,8 +3755,20 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
35843755 if (size == 0 )
35853756 return (PyObject * )v ;
35863757 p = PyUnicode_AS_UNICODE (v );
3587- while (size -- > 0 )
3588- * p ++ = (unsigned char )* s ++ ;
3758+ e = s + size ;
3759+ /* Unrolling the copy makes it much faster by reducing the looping
3760+ overhead. This is similar to what many memcpy() implementations do. */
3761+ unrolled_end = e - 4 ;
3762+ while (s < unrolled_end ) {
3763+ p [0 ] = (unsigned char ) s [0 ];
3764+ p [1 ] = (unsigned char ) s [1 ];
3765+ p [2 ] = (unsigned char ) s [2 ];
3766+ p [3 ] = (unsigned char ) s [3 ];
3767+ s += 4 ;
3768+ p += 4 ;
3769+ }
3770+ while (s < e )
3771+ * p ++ = (unsigned char ) * s ++ ;
35893772 return (PyObject * )v ;
35903773
35913774 onError :
0 commit comments