Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 44531cb

Browse files
committed
Optimize built-in unicode codecs by avoiding unnecessary copying.
The approach used is similar to what is currently used in the version of unicodeobject.c in Python 2.x. The only difference is that we use _PyBytes_Resize instead of _PyString_Resize.
1 parent 9cb6f7f commit 44531cb

1 file changed

Lines changed: 58 additions & 50 deletions

File tree

Objects/unicodeobject.c

Lines changed: 58 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1873,7 +1873,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
18731873
int encodeWhiteSpace,
18741874
const char *errors)
18751875
{
1876-
PyObject *v, *result;
1876+
PyObject *v;
18771877
/* It might be possible to tighten this worst case */
18781878
Py_ssize_t cbAllocated = 5 * size;
18791879
int inShift = 0;
@@ -1889,11 +1889,11 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
18891889
if (cbAllocated / 5 != size)
18901890
return PyErr_NoMemory();
18911891

1892-
v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
1892+
v = PyBytes_FromStringAndSize(NULL, cbAllocated);
18931893
if (v == NULL)
18941894
return NULL;
18951895

1896-
start = out = PyByteArray_AS_STRING(v);
1896+
start = out = PyBytes_AS_STRING(v);
18971897
for (;i < size; ++i) {
18981898
Py_UNICODE ch = s[i];
18991899

@@ -1958,10 +1958,9 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
19581958
*out++= B64(charsleft << (6-bitsleft) );
19591959
*out++ = '-';
19601960
}
1961-
1962-
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
1963-
Py_DECREF(v);
1964-
return result;
1961+
if (_PyBytes_Resize(&v, out - start) < 0)
1962+
return NULL;
1963+
return v;
19651964
}
19661965

19671966
#undef SPECIAL
@@ -2479,7 +2478,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
24792478
const char *errors,
24802479
int byteorder)
24812480
{
2482-
PyObject *v, *result;
2481+
PyObject *v;
24832482
unsigned char *p;
24842483
Py_ssize_t nsize, bytesize;
24852484
#ifndef Py_UNICODE_WIDE
@@ -2515,11 +2514,11 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
25152514
bytesize = nsize * 4;
25162515
if (bytesize / 4 != nsize)
25172516
return PyErr_NoMemory();
2518-
v = PyByteArray_FromStringAndSize(NULL, bytesize);
2517+
v = PyBytes_FromStringAndSize(NULL, bytesize);
25192518
if (v == NULL)
25202519
return NULL;
25212520

2522-
p = (unsigned char *)PyByteArray_AS_STRING(v);
2521+
p = (unsigned char *)PyBytes_AS_STRING(v);
25232522
if (byteorder == 0)
25242523
STORECHAR(0xFEFF);
25252524
if (size == 0)
@@ -2556,9 +2555,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
25562555
}
25572556

25582557
done:
2559-
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2560-
Py_DECREF(v);
2561-
return result;
2558+
return v;
25622559
#undef STORECHAR
25632560
}
25642561

@@ -2757,7 +2754,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
27572754
const char *errors,
27582755
int byteorder)
27592756
{
2760-
PyObject *v, *result;
2757+
PyObject *v;
27612758
unsigned char *p;
27622759
Py_ssize_t nsize, bytesize;
27632760
#ifdef Py_UNICODE_WIDE
@@ -2792,11 +2789,11 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
27922789
bytesize = nsize * 2;
27932790
if (bytesize / 2 != nsize)
27942791
return PyErr_NoMemory();
2795-
v = PyByteArray_FromStringAndSize(NULL, bytesize);
2792+
v = PyBytes_FromStringAndSize(NULL, bytesize);
27962793
if (v == NULL)
27972794
return NULL;
27982795

2799-
p = (unsigned char *)PyByteArray_AS_STRING(v);
2796+
p = (unsigned char *)PyBytes_AS_STRING(v);
28002797
if (byteorder == 0)
28012798
STORECHAR(0xFEFF);
28022799
if (size == 0)
@@ -2828,9 +2825,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
28282825
}
28292826

28302827
done:
2831-
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2832-
Py_DECREF(v);
2833-
return result;
2828+
return v;
28342829
#undef STORECHAR
28352830
}
28362831

@@ -3120,7 +3115,7 @@ static const char *hexdigits = "0123456789abcdef";
31203115
PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
31213116
Py_ssize_t size)
31223117
{
3123-
PyObject *repr, *result;
3118+
PyObject *repr;
31243119
char *p;
31253120

31263121
#ifdef Py_UNICODE_WIDE
@@ -3147,17 +3142,20 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
31473142
escape.
31483143
*/
31493144

3145+
if (size == 0)
3146+
return PyBytes_FromStringAndSize(NULL, 0);
3147+
31503148
if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
31513149
return PyErr_NoMemory();
31523150

3153-
repr = PyByteArray_FromStringAndSize(NULL,
3151+
repr = PyBytes_FromStringAndSize(NULL,
31543152
2
31553153
+ expandsize*size
31563154
+ 1);
31573155
if (repr == NULL)
31583156
return NULL;
31593157

3160-
p = PyByteArray_AS_STRING(repr);
3158+
p = PyBytes_AS_STRING(repr);
31613159

31623160
while (size-- > 0) {
31633161
Py_UNICODE ch = *s++;
@@ -3249,13 +3247,13 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
32493247
*p++ = (char) ch;
32503248
}
32513249

3252-
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
3253-
p - PyByteArray_AS_STRING(repr));
3254-
Py_DECREF(repr);
3255-
return result;
3250+
assert(p - PyBytes_AS_STRING(repr) > 0);
3251+
if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3252+
return NULL;
3253+
return repr;
32563254
}
32573255

3258-
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3256+
PyObject *PyUnicodeAsUnicodeEscapeString(PyObject *unicode)
32593257
{
32603258
PyObject *s;
32613259
if (!PyUnicode_Check(unicode)) {
@@ -3389,7 +3387,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
33893387
PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
33903388
Py_ssize_t size)
33913389
{
3392-
PyObject *repr, *result;
3390+
PyObject *repr;
33933391
char *p;
33943392
char *q;
33953393

@@ -3402,13 +3400,13 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
34023400
if (size > PY_SSIZE_T_MAX / expandsize)
34033401
return PyErr_NoMemory();
34043402

3405-
repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
3403+
repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
34063404
if (repr == NULL)
34073405
return NULL;
34083406
if (size == 0)
3409-
goto done;
3407+
return repr;
34103408

3411-
p = q = PyByteArray_AS_STRING(repr);
3409+
p = q = PyBytes_AS_STRING(repr);
34123410
while (size-- > 0) {
34133411
Py_UNICODE ch = *s++;
34143412
#ifdef Py_UNICODE_WIDE
@@ -3468,10 +3466,10 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
34683466
}
34693467
size = p - q;
34703468

3471-
done:
3472-
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
3473-
Py_DECREF(repr);
3474-
return result;
3469+
assert(size > 0);
3470+
if (_PyBytes_Resize(&repr, size) < 0)
3471+
return NULL;
3472+
return repr;
34753473
}
34763474

34773475
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@@ -3706,7 +3704,6 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
37063704
const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
37073705
PyObject *errorHandler = NULL;
37083706
PyObject *exc = NULL;
3709-
PyObject *result = NULL;
37103707
/* the following variable is used for caching string comparisons
37113708
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
37123709
int known_errorHandler = -1;
@@ -3715,10 +3712,10 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
37153712
replacements, if we need more, we'll resize */
37163713
if (size == 0)
37173714
return PyBytes_FromStringAndSize(NULL, 0);
3718-
res = PyByteArray_FromStringAndSize(NULL, size);
3715+
res = PyBytes_FromStringAndSize(NULL, size);
37193716
if (res == NULL)
37203717
return NULL;
3721-
str = PyByteArray_AS_STRING(res);
3718+
str = PyBytes_AS_STRING(res);
37223719
ressize = size;
37233720

37243721
while (p<endp) {
@@ -3768,7 +3765,7 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
37683765
p = collend;
37693766
break;
37703767
case 4: /* xmlcharrefreplace */
3771-
respos = str - PyByteArray_AS_STRING(res);
3768+
respos = str - PyBytes_AS_STRING(res);
37723769
/* determine replacement size (temporarily (mis)uses p) */
37733770
for (p = collstart, repsize = 0; p < collend; ++p) {
37743771
if (*p<10)
@@ -3795,9 +3792,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
37953792
if (requiredsize > ressize) {
37963793
if (requiredsize<2*ressize)
37973794
requiredsize = 2*ressize;
3798-
if (PyByteArray_Resize(res, requiredsize))
3795+
if (_PyBytes_Resize(&res, requiredsize))
37993796
goto onError;
3800-
str = PyByteArray_AS_STRING(res) + respos;
3797+
str = PyBytes_AS_STRING(res) + respos;
38013798
ressize = requiredsize;
38023799
}
38033800
/* generate replacement (temporarily (mis)uses p) */
@@ -3815,17 +3812,17 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
38153812
/* need more space? (at least enough for what we
38163813
have+the replacement+the rest of the string, so
38173814
we won't have to check space for encodable characters) */
3818-
respos = str - PyByteArray_AS_STRING(res);
3815+
respos = str - PyBytes_AS_STRING(res);
38193816
repsize = PyUnicode_GET_SIZE(repunicode);
38203817
requiredsize = respos+repsize+(endp-collend);
38213818
if (requiredsize > ressize) {
38223819
if (requiredsize<2*ressize)
38233820
requiredsize = 2*ressize;
3824-
if (PyByteArray_Resize(res, requiredsize)) {
3821+
if (_PyBytes_Resize(&res, requiredsize)) {
38253822
Py_DECREF(repunicode);
38263823
goto onError;
38273824
}
3828-
str = PyByteArray_AS_STRING(res) + respos;
3825+
str = PyBytes_AS_STRING(res) + respos;
38293826
ressize = requiredsize;
38303827
}
38313828
/* check if there is anything unencodable in the replacement
@@ -3845,13 +3842,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
38453842
}
38463843
}
38473844
}
3848-
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
3849-
str - PyByteArray_AS_STRING(res));
3845+
/* Resize if we allocated too much */
3846+
size = str - PyBytes_AS_STRING(res);
3847+
if (size < ressize) { /* If this falls res will be NULL */
3848+
assert(size > 0);
3849+
if (_PyBytes_Resize(&res, size) < 0)
3850+
goto onError;
3851+
}
3852+
3853+
Py_XDECREF(errorHandler);
3854+
Py_XDECREF(exc);
3855+
return res;
3856+
38503857
onError:
3851-
Py_DECREF(res);
3858+
Py_XDECREF(res);
38523859
Py_XDECREF(errorHandler);
38533860
Py_XDECREF(exc);
3854-
return result;
3861+
return NULL;
38553862
}
38563863

38573864
PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
@@ -4104,7 +4111,7 @@ static int encode_mbcs(PyObject **repr,
41044111
else {
41054112
/* Extend string object */
41064113
n = PyBytes_Size(*repr);
4107-
if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4114+
if (_PyBytes_Resize(&repr, n + mbcssize) < 0)
41084115
return -1;
41094116
}
41104117

@@ -4834,7 +4841,8 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
48344841

48354842
/* Resize if we allocated too much */
48364843
if (respos<PyBytes_GET_SIZE(res))
4837-
_PyBytes_Resize(&res, respos);
4844+
if (_PyBytes_Resize(&res, respos) < 0)
4845+
goto onError;
48384846

48394847
Py_XDECREF(exc);
48404848
Py_XDECREF(errorHandler);

0 commit comments

Comments
 (0)