Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit eeb719e

Browse files
bpo-35365: Use a wchar_t* buffer in the code page decoder. (GH-10837)
1 parent 7fc633f commit eeb719e

1 file changed

Lines changed: 52 additions & 60 deletions

File tree

Objects/unicodeobject.c

Lines changed: 52 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4059,6 +4059,21 @@ make_decode_exception(PyObject **exceptionObject,
40594059
}
40604060

40614061
#ifdef MS_WINDOWS
4062+
static int
4063+
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4064+
{
4065+
if (newsize > *size) {
4066+
wchar_t *newbuf = *buf;
4067+
if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4068+
PyErr_NoMemory();
4069+
return -1;
4070+
}
4071+
*buf = newbuf;
4072+
}
4073+
*size = newsize;
4074+
return 0;
4075+
}
4076+
40624077
/* error handling callback helper:
40634078
build arguments, call the callback and check the arguments,
40644079
if no exception occurred, copy the replacement to the output
@@ -4072,7 +4087,7 @@ unicode_decode_call_errorhandler_wchar(
40724087
const char *encoding, const char *reason,
40734088
const char **input, const char **inend, Py_ssize_t *startinpos,
40744089
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4075-
PyObject **output, Py_ssize_t *outpos)
4090+
wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
40764091
{
40774092
static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
40784093

@@ -4086,9 +4101,6 @@ unicode_decode_call_errorhandler_wchar(
40864101
wchar_t *repwstr;
40874102
Py_ssize_t repwlen;
40884103

4089-
assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4090-
outsize = _PyUnicode_WSTR_LENGTH(*output);
4091-
40924104
if (*errorHandler == NULL) {
40934105
*errorHandler = PyCodec_LookupError(errors);
40944106
if (*errorHandler == NULL)
@@ -4146,13 +4158,15 @@ unicode_decode_call_errorhandler_wchar(
41464158
if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
41474159
goto overflow;
41484160
requiredsize += insize - newpos;
4161+
outsize = *bufsize;
41494162
if (requiredsize > outsize) {
41504163
if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
41514164
requiredsize = 2*outsize;
4152-
if (unicode_resize(output, requiredsize) < 0)
4165+
if (widechar_resize(buf, bufsize, requiredsize) < 0) {
41534166
goto onError;
4167+
}
41544168
}
4155-
wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4169+
wcsncpy(*buf + *outpos, repwstr, repwlen);
41564170
*outpos += repwlen;
41574171
*endinpos = newpos;
41584172
*inptr = *input + newpos;
@@ -7146,7 +7160,8 @@ decode_code_page_flags(UINT code_page)
71467160
*/
71477161
static int
71487162
decode_code_page_strict(UINT code_page,
7149-
PyObject **v,
7163+
wchar_t **buf,
7164+
Py_ssize_t *bufsize,
71507165
const char *in,
71517166
int insize)
71527167
{
@@ -7160,21 +7175,12 @@ decode_code_page_strict(UINT code_page,
71607175
if (outsize <= 0)
71617176
goto error;
71627177

7163-
if (*v == NULL) {
7164-
/* Create unicode object */
7165-
/* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7166-
*v = (PyObject*)_PyUnicode_New(outsize);
7167-
if (*v == NULL)
7168-
return -1;
7169-
out = PyUnicode_AS_UNICODE(*v);
7170-
}
7171-
else {
7172-
/* Extend unicode object */
7173-
Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7174-
if (unicode_resize(v, n + outsize) < 0)
7175-
return -1;
7176-
out = PyUnicode_AS_UNICODE(*v) + n;
7178+
/* Extend a wchar_t* buffer */
7179+
Py_ssize_t n = *bufsize; /* Get the current length */
7180+
if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7181+
return -1;
71777182
}
7183+
out = *buf + n;
71787184

71797185
/* Do the conversion */
71807186
outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
@@ -7198,7 +7204,8 @@ decode_code_page_strict(UINT code_page,
71987204
*/
71997205
static int
72007206
decode_code_page_errors(UINT code_page,
7201-
PyObject **v,
7207+
wchar_t **buf,
7208+
Py_ssize_t *bufsize,
72027209
const char *in, const int size,
72037210
const char *errors, int final)
72047211
{
@@ -7238,29 +7245,16 @@ decode_code_page_errors(UINT code_page,
72387245
goto error;
72397246
}
72407247

7241-
if (*v == NULL) {
7242-
/* Create unicode object */
7243-
if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7244-
PyErr_NoMemory();
7245-
goto error;
7246-
}
7247-
/* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7248-
*v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7249-
if (*v == NULL)
7250-
goto error;
7251-
out = PyUnicode_AS_UNICODE(*v);
7248+
/* Extend a wchar_t* buffer */
7249+
Py_ssize_t n = *bufsize; /* Get the current length */
7250+
if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7251+
PyErr_NoMemory();
7252+
goto error;
72527253
}
7253-
else {
7254-
/* Extend unicode object */
7255-
Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7256-
if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7257-
PyErr_NoMemory();
7258-
goto error;
7259-
}
7260-
if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7261-
goto error;
7262-
out = PyUnicode_AS_UNICODE(*v) + n;
7254+
if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7255+
goto error;
72637256
}
7257+
out = *buf + n;
72647258

72657259
/* Decode the byte string character per character */
72667260
while (in < endin)
@@ -7295,16 +7289,16 @@ decode_code_page_errors(UINT code_page,
72957289

72967290
startinpos = in - startin;
72977291
endinpos = startinpos + 1;
7298-
outpos = out - PyUnicode_AS_UNICODE(*v);
7292+
outpos = out - *buf;
72997293
if (unicode_decode_call_errorhandler_wchar(
73007294
errors, &errorHandler,
73017295
encoding, reason,
73027296
&startin, &endin, &startinpos, &endinpos, &exc, &in,
7303-
v, &outpos))
7297+
buf, bufsize, &outpos))
73047298
{
73057299
goto error;
73067300
}
7307-
out = PyUnicode_AS_UNICODE(*v) + outpos;
7301+
out = *buf + outpos;
73087302
}
73097303
else {
73107304
in += insize;
@@ -7313,14 +7307,9 @@ decode_code_page_errors(UINT code_page,
73137307
}
73147308
}
73157309

7316-
/* write a NUL character at the end */
7317-
*out = 0;
7318-
7319-
/* Extend unicode object */
7320-
outsize = out - PyUnicode_AS_UNICODE(*v);
7321-
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7322-
if (unicode_resize(v, outsize) < 0)
7323-
goto error;
7310+
/* Shrink the buffer */
7311+
assert(out - *buf <= *bufsize);
7312+
*bufsize = out - *buf;
73247313
/* (in - startin) <= size and size is an int */
73257314
ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
73267315

@@ -7336,7 +7325,8 @@ decode_code_page_stateful(int code_page,
73367325
const char *s, Py_ssize_t size,
73377326
const char *errors, Py_ssize_t *consumed)
73387327
{
7339-
PyObject *v = NULL;
7328+
wchar_t *buf = NULL;
7329+
Py_ssize_t bufsize = 0;
73407330
int chunk_size, final, converted, done;
73417331

73427332
if (code_page < 0) {
@@ -7368,21 +7358,21 @@ decode_code_page_stateful(int code_page,
73687358
}
73697359

73707360
if (chunk_size == 0 && done) {
7371-
if (v != NULL)
7361+
if (buf != NULL)
73727362
break;
73737363
_Py_RETURN_UNICODE_EMPTY();
73747364
}
73757365

7376-
converted = decode_code_page_strict(code_page, &v,
7366+
converted = decode_code_page_strict(code_page, &buf, &bufsize,
73777367
s, chunk_size);
73787368
if (converted == -2)
7379-
converted = decode_code_page_errors(code_page, &v,
7369+
converted = decode_code_page_errors(code_page, &buf, &bufsize,
73807370
s, chunk_size,
73817371
errors, final);
73827372
assert(converted != 0 || done);
73837373

73847374
if (converted < 0) {
7385-
Py_XDECREF(v);
7375+
PyMem_Free(buf);
73867376
return NULL;
73877377
}
73887378

@@ -7393,7 +7383,9 @@ decode_code_page_stateful(int code_page,
73937383
size -= converted;
73947384
} while (!done);
73957385

7396-
return unicode_result(v);
7386+
PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7387+
PyMem_Free(buf);
7388+
return v;
73977389
}
73987390

73997391
PyObject *

0 commit comments

Comments
 (0)