Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ab86831

Browse files
committed
Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
common cases are optimized thanks to a dedicated fast path and a moderate amount of loop unrolling. This will especially help text I/O (we already register a 30% speedup on large reads on the io-c branch).
1 parent dd6351e commit ab86831

2 files changed

Lines changed: 211 additions & 24 deletions

File tree

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 0
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
16+
common cases are optimized thanks to a dedicated fast path and a moderate
17+
amount of loop unrolling.
18+
1519
- Issue #4074: Change the criteria for doing a full garbage collection (i.e.
1620
collecting the oldest generation) so that allocating lots of objects without
1721
destroying them does not show quadratic performance. Based on a proposal by

Objects/unicodeobject.c

Lines changed: 207 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2001,6 +2001,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
20012001
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
20022002
}
20032003

2004+
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2005+
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2006+
2007+
/* Mask to quickly check whether a C 'long' contains a
2008+
non-ASCII, UTF8-encoded char. */
2009+
#if (SIZEOF_LONG == 8)
2010+
# define ASCII_CHAR_MASK 0x8080808080808080L
2011+
#elif (SIZEOF_LONG == 4)
2012+
# define ASCII_CHAR_MASK 0x80808080L
2013+
#else
2014+
# error C 'long' size should be either 4 or 8!
2015+
#endif
2016+
20042017
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
20052018
Py_ssize_t size,
20062019
const char *errors,
@@ -2011,7 +2024,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
20112024
Py_ssize_t startinpos;
20122025
Py_ssize_t endinpos;
20132026
Py_ssize_t outpos;
2014-
const char *e;
2027+
const char *e, *aligned_end;
20152028
PyUnicodeObject *unicode;
20162029
Py_UNICODE *p;
20172030
const char *errmsg = "";
@@ -2032,10 +2045,51 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
20322045
/* Unpack UTF-8 encoded data */
20332046
p = unicode->str;
20342047
e = s + size;
2048+
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
20352049

20362050
while (s < e) {
20372051
Py_UCS4 ch = (unsigned char)*s;
20382052

2053+
if (ch < 0x80) {
2054+
/* Fast path for runs of ASCII characters. Given that common UTF-8
2055+
input will consist of an overwhelming majority of ASCII
2056+
characters, we try to optimize for this case by checking
2057+
as many characters as a C 'long' can contain.
2058+
First, check if we can do an aligned read, as most CPUs have
2059+
a penalty for unaligned reads.
2060+
*/
2061+
if (!((size_t) s & LONG_PTR_MASK)) {
2062+
/* Help register allocation */
2063+
register const char *_s = s;
2064+
register Py_UNICODE *_p = p;
2065+
while (_s < aligned_end) {
2066+
/* Read a whole long at a time (either 4 or 8 bytes),
2067+
and do a fast unrolled copy if it only contains ASCII
2068+
characters. */
2069+
unsigned long data = *(unsigned long *) _s;
2070+
if (data & ASCII_CHAR_MASK)
2071+
break;
2072+
_p[0] = (unsigned char) _s[0];
2073+
_p[1] = (unsigned char) _s[1];
2074+
_p[2] = (unsigned char) _s[2];
2075+
_p[3] = (unsigned char) _s[3];
2076+
#if (SIZEOF_LONG == 8)
2077+
_p[4] = (unsigned char) _s[4];
2078+
_p[5] = (unsigned char) _s[5];
2079+
_p[6] = (unsigned char) _s[6];
2080+
_p[7] = (unsigned char) _s[7];
2081+
#endif
2082+
_s += SIZEOF_LONG;
2083+
_p += SIZEOF_LONG;
2084+
}
2085+
s = _s;
2086+
p = _p;
2087+
if (s == e)
2088+
break;
2089+
ch = (unsigned char)*s;
2090+
}
2091+
}
2092+
20392093
if (ch < 0x80) {
20402094
*p++ = (Py_UNICODE)ch;
20412095
s++;
@@ -2169,6 +2223,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
21692223
&starts, &e, &startinpos, &endinpos, &exc, &s,
21702224
&unicode, &outpos, &p))
21712225
goto onError;
2226+
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
21722227
}
21732228
if (consumed)
21742229
*consumed = s-starts;
@@ -2188,6 +2243,9 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
21882243
return NULL;
21892244
}
21902245

2246+
#undef ASCII_CHAR_MASK
2247+
2248+
21912249
/* Allocation strategy: if the string is short, convert into a stack buffer
21922250
and allocate exactly as much space needed at the end. Else allocate the
21932251
maximum possible needed (4 result bytes per Unicode character), and return
@@ -2582,6 +2640,23 @@ PyUnicode_DecodeUTF16(const char *s,
25822640
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
25832641
}
25842642

2643+
/* Two masks for fast checking of whether a C 'long' may contain
2644+
UTF16-encoded surrogate characters. This is an efficient heuristic,
2645+
assuming that non-surrogate characters with a code point >= 0x8000 are
2646+
rare in most input.
2647+
FAST_CHAR_MASK is used when the input is in native byte ordering,
2648+
SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
2649+
*/
2650+
#if (SIZEOF_LONG == 8)
2651+
# define FAST_CHAR_MASK 0x8000800080008000L
2652+
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2653+
#elif (SIZEOF_LONG == 4)
2654+
# define FAST_CHAR_MASK 0x80008000L
2655+
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2656+
#else
2657+
# error C 'long' size should be either 4 or 8!
2658+
#endif
2659+
25852660
PyObject *
25862661
PyUnicode_DecodeUTF16Stateful(const char *s,
25872662
Py_ssize_t size,
@@ -2595,8 +2670,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
25952670
Py_ssize_t outpos;
25962671
PyUnicodeObject *unicode;
25972672
Py_UNICODE *p;
2598-
const unsigned char *q, *e;
2673+
const unsigned char *q, *e, *aligned_end;
25992674
int bo = 0; /* assume native ordering by default */
2675+
int native_ordering = 0;
26002676
const char *errmsg = "";
26012677
/* Offsets from q for retrieving byte pairs in the right order. */
26022678
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
@@ -2618,7 +2694,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
26182694
/* Unpack UTF-16 encoded data */
26192695
p = unicode->str;
26202696
q = (unsigned char *)s;
2621-
e = q + size;
2697+
e = q + size - 1;
26222698

26232699
if (byteorder)
26242700
bo = *byteorder;
@@ -2662,20 +2738,78 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
26622738
ihi = 0;
26632739
ilo = 1;
26642740
}
2741+
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742+
native_ordering = ilo < ihi;
2743+
#else
2744+
native_ordering = ilo > ihi;
2745+
#endif
26652746

2747+
aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
26662748
while (q < e) {
26672749
Py_UNICODE ch;
2668-
/* remaining bytes at the end? (size should be even) */
2669-
if (e-q<2) {
2670-
if (consumed)
2671-
break;
2672-
errmsg = "truncated data";
2673-
startinpos = ((const char *)q)-starts;
2674-
endinpos = ((const char *)e)-starts;
2675-
goto utf16Error;
2676-
/* The remaining input chars are ignored if the callback
2677-
chooses to skip the input */
2678-
}
2750+
/* First check for possible aligned read of a C 'long'. Unaligned
2751+
reads are more expensive, better to defer to another iteration. */
2752+
if (!((size_t) q & LONG_PTR_MASK)) {
2753+
/* Fast path for runs of non-surrogate chars. */
2754+
register const unsigned char *_q = q;
2755+
Py_UNICODE *_p = p;
2756+
if (native_ordering) {
2757+
/* Native ordering is simple: as long as the input cannot
2758+
possibly contain a surrogate char, do an unrolled copy
2759+
of several 16-bit code points to the target object.
2760+
The non-surrogate check is done on several input bytes
2761+
at a time (as many as a C 'long' can contain). */
2762+
while (_q < aligned_end) {
2763+
unsigned long data = * (unsigned long *) _q;
2764+
if (data & FAST_CHAR_MASK)
2765+
break;
2766+
_p[0] = ((unsigned short *) _q)[0];
2767+
_p[1] = ((unsigned short *) _q)[1];
2768+
#if (SIZEOF_LONG == 8)
2769+
_p[2] = ((unsigned short *) _q)[2];
2770+
_p[3] = ((unsigned short *) _q)[3];
2771+
#endif
2772+
_q += SIZEOF_LONG;
2773+
_p += SIZEOF_LONG / 2;
2774+
}
2775+
}
2776+
else {
2777+
/* Byteswapped ordering is similar, but we must decompose
2778+
the copy bytewise, and take care of zero'ing out the
2779+
upper bytes if the target object is in 32-bit units
2780+
(that is, in UCS-4 builds). */
2781+
while (_q < aligned_end) {
2782+
unsigned long data = * (unsigned long *) _q;
2783+
if (data & SWAPPED_FAST_CHAR_MASK)
2784+
break;
2785+
/* Zero upper bytes in UCS-4 builds */
2786+
#if (Py_UNICODE_SIZE > 2)
2787+
_p[0] = 0;
2788+
_p[1] = 0;
2789+
#if (SIZEOF_LONG == 8)
2790+
_p[2] = 0;
2791+
_p[3] = 0;
2792+
#endif
2793+
#endif
2794+
((unsigned char *) _p)[1] = _q[0];
2795+
((unsigned char *) _p)[0] = _q[1];
2796+
((unsigned char *) _p)[1 + Py_UNICODE_SIZE] = _q[2];
2797+
((unsigned char *) _p)[0 + Py_UNICODE_SIZE] = _q[3];
2798+
#if (SIZEOF_LONG == 8)
2799+
((unsigned char *) _p)[1 + 2 * Py_UNICODE_SIZE] = _q[4];
2800+
((unsigned char *) _p)[0 + 2 * Py_UNICODE_SIZE] = _q[5];
2801+
((unsigned char *) _p)[1 + 3 * Py_UNICODE_SIZE] = _q[6];
2802+
((unsigned char *) _p)[0 + 3 * Py_UNICODE_SIZE] = _q[7];
2803+
#endif
2804+
_q += SIZEOF_LONG;
2805+
_p += SIZEOF_LONG / 2;
2806+
}
2807+
}
2808+
p = _p;
2809+
q = _q;
2810+
if (q >= e)
2811+
break;
2812+
}
26792813
ch = (q[ihi] << 8) | q[ilo];
26802814

26812815
q += 2;
@@ -2686,10 +2820,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
26862820
}
26872821

26882822
/* UTF-16 code pair: */
2689-
if (q >= e) {
2823+
if (q > e) {
26902824
errmsg = "unexpected end of data";
2691-
startinpos = (((const char *)q)-2)-starts;
2692-
endinpos = ((const char *)e)-starts;
2825+
startinpos = (((const char *)q) - 2) - starts;
2826+
endinpos = ((const char *)e) + 1 - starts;
26932827
goto utf16Error;
26942828
}
26952829
if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -2718,14 +2852,47 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
27182852
/* Fall through to report the error */
27192853

27202854
utf16Error:
2721-
outpos = p-PyUnicode_AS_UNICODE(unicode);
2855+
outpos = p - PyUnicode_AS_UNICODE(unicode);
27222856
if (unicode_decode_call_errorhandler(
2723-
errors, &errorHandler,
2724-
"utf16", errmsg,
2725-
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2726-
&unicode, &outpos, &p))
2857+
errors,
2858+
&errorHandler,
2859+
"utf16", errmsg,
2860+
&starts,
2861+
(const char **)&e,
2862+
&startinpos,
2863+
&endinpos,
2864+
&exc,
2865+
(const char **)&q,
2866+
&unicode,
2867+
&outpos,
2868+
&p))
27272869
goto onError;
27282870
}
2871+
/* remaining byte at the end? (size should be even) */
2872+
if (e == q) {
2873+
if (!consumed) {
2874+
errmsg = "truncated data";
2875+
startinpos = ((const char *)q) - starts;
2876+
endinpos = ((const char *)e) + 1 - starts;
2877+
outpos = p - PyUnicode_AS_UNICODE(unicode);
2878+
if (unicode_decode_call_errorhandler(
2879+
errors,
2880+
&errorHandler,
2881+
"utf16", errmsg,
2882+
&starts,
2883+
(const char **)&e,
2884+
&startinpos,
2885+
&endinpos,
2886+
&exc,
2887+
(const char **)&q,
2888+
&unicode,
2889+
&outpos,
2890+
&p))
2891+
goto onError;
2892+
/* The remaining input chars are ignored if the callback
2893+
chooses to skip the input */
2894+
}
2895+
}
27292896

27302897
if (byteorder)
27312898
*byteorder = bo;
@@ -2748,6 +2915,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
27482915
return NULL;
27492916
}
27502917

2918+
#undef FAST_CHAR_MASK
2919+
#undef SWAPPED_FAST_CHAR_MASK
2920+
27512921
PyObject *
27522922
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
27532923
Py_ssize_t size,
@@ -3571,6 +3741,7 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
35713741
{
35723742
PyUnicodeObject *v;
35733743
Py_UNICODE *p;
3744+
const char *e, *unrolled_end;
35743745

35753746
/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
35763747
if (size == 1) {
@@ -3584,8 +3755,20 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
35843755
if (size == 0)
35853756
return (PyObject *)v;
35863757
p = PyUnicode_AS_UNICODE(v);
3587-
while (size-- > 0)
3588-
*p++ = (unsigned char)*s++;
3758+
e = s + size;
3759+
/* Unrolling the copy makes it much faster by reducing the looping
3760+
overhead. This is similar to what many memcpy() implementations do. */
3761+
unrolled_end = e - 4;
3762+
while (s < unrolled_end) {
3763+
p[0] = (unsigned char) s[0];
3764+
p[1] = (unsigned char) s[1];
3765+
p[2] = (unsigned char) s[2];
3766+
p[3] = (unsigned char) s[3];
3767+
s += 4;
3768+
p += 4;
3769+
}
3770+
while (s < e)
3771+
*p++ = (unsigned char) *s++;
35893772
return (PyObject *)v;
35903773

35913774
onError:

0 commit comments

Comments
 (0)