Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit b2750b5

Browse files
committed
Move the codec decode type checks to bytes/bytearray.decode().
Use faster PyUnicode_FromEncodedObject() for bytes/bytearray.decode(). Add new PyCodec_KnownEncoding() API. Add new PyUnicode_AsDecodedUnicode() and PyUnicode_AsEncodedUnicode() APIs. Add missing PyUnicode_AsDecodedObject() to unicodeobject.h. Fix punycode codec to also work on memoryviews.
1 parent 4efb518 commit b2750b5

8 files changed

Lines changed: 171 additions & 41 deletions

File tree

Include/codecs.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ PyAPI_FUNC(int) PyCodec_Register(
2727
PyObject *search_function
2828
);
2929

30-
/* Codec register lookup API.
30+
/* Codec registry lookup API.
3131
3232
Looks up the given encoding and returns a CodecInfo object with
3333
function attributes which implement the different aspects of
@@ -49,6 +49,17 @@ PyAPI_FUNC(PyObject *) _PyCodec_Lookup(
4949
const char *encoding
5050
);
5151

52+
/* Codec registry encoding check API.
53+
54+
Returns 1/0 depending on whether there is a registered codec for
55+
the given encoding.
56+
57+
*/
58+
59+
PyAPI_FUNC(int) PyCodec_KnownEncoding(
60+
const char *encoding
61+
);
62+
5263
/* Generic codec based encoding API.
5364
5465
object is passed through the encoder function found for the given

Include/unicodeobject.h

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
139139

140140
# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
141141
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
142+
# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
143+
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
142144
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
143145
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
146+
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
144147
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
145148
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
146149
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
@@ -233,8 +236,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
233236

234237
# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
235238
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
239+
# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
240+
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
236241
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
237242
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
243+
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
238244
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
239245
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
240246
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
@@ -744,6 +750,24 @@ PyAPI_FUNC(PyObject*) PyUnicode_Decode(
744750
const char *errors /* error handling */
745751
);
746752

753+
/* Decodes a Unicode object and returns the result as a Python
754+
object. */
755+
756+
PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
757+
PyObject *unicode, /* Unicode object */
758+
const char *encoding, /* encoding */
759+
const char *errors /* error handling */
760+
);
761+
762+
/* Decodes a Unicode object and returns the result as a Unicode
763+
object. */
764+
765+
PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
766+
PyObject *unicode, /* Unicode object */
767+
const char *encoding, /* encoding */
768+
const char *errors /* error handling */
769+
);
770+
747771
/* Encodes a Py_UNICODE buffer of the given size and returns a
748772
Python string object. */
749773

@@ -772,11 +796,21 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
772796
const char *errors /* error handling */
773797
);
774798

799+
/* Encodes a Unicode object and returns the result as a Unicode
800+
object. */
801+
802+
PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
803+
PyObject *unicode, /* Unicode object */
804+
const char *encoding, /* encoding */
805+
const char *errors /* error handling */
806+
);
807+
808+
/* Build an encoding map. */
809+
775810
PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
776811
PyObject* string /* 256 character map */
777812
);
778813

779-
780814
/* --- UTF-7 Codecs ------------------------------------------------------- */
781815

782816
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(

Lib/encodings/punycode.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ def insertion_sort(base, extended, errors):
183183
def punycode_decode(text, errors):
184184
if isinstance(text, str):
185185
text = text.encode("ascii")
186+
if isinstance(text, memoryview):
187+
text = bytes(text)
186188
pos = text.rfind(b"-")
187189
if pos == -1:
188190
base = ""

Objects/bytearrayobject.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,7 @@ bytes_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
725725
"string argument without an encoding");
726726
return -1;
727727
}
728-
encoded = PyCodec_Encode(arg, encoding, errors);
728+
encoded = PyUnicode_AsEncodedString(arg, encoding, errors);
729729
if (encoded == NULL)
730730
return -1;
731731
assert(PyBytes_Check(encoded));
@@ -2854,7 +2854,7 @@ bytes_decode(PyObject *self, PyObject *args)
28542854
return NULL;
28552855
if (encoding == NULL)
28562856
encoding = PyUnicode_GetDefaultEncoding();
2857-
return PyCodec_Decode(self, encoding, errors);
2857+
return PyUnicode_FromEncodedObject(self, encoding, errors);
28582858
}
28592859

28602860
PyDoc_STRVAR(alloc_doc,

Objects/bytesobject.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2713,7 +2713,7 @@ string_decode(PyObject *self, PyObject *args)
27132713
return NULL;
27142714
if (encoding == NULL)
27152715
encoding = PyUnicode_GetDefaultEncoding();
2716-
return PyCodec_Decode(self, encoding, errors);
2716+
return PyUnicode_FromEncodedObject(self, encoding, errors);
27172717
}
27182718

27192719

@@ -2899,7 +2899,7 @@ string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
28992899
"string argument without an encoding");
29002900
return NULL;
29012901
}
2902-
new = PyCodec_Encode(x, encoding, errors);
2902+
new = PyUnicode_AsEncodedString(x, encoding, errors);
29032903
if (new == NULL)
29042904
return NULL;
29052905
assert(PyBytes_Check(new));

Objects/unicodeobject.c

Lines changed: 89 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,14 +1099,18 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
10991099

11001100
/* Coerce object */
11011101
if (PyBytes_Check(obj)) {
1102-
s = PyBytes_AS_STRING(obj);
1103-
len = PyBytes_GET_SIZE(obj);
1104-
}
1102+
s = PyBytes_AS_STRING(obj);
1103+
len = PyBytes_GET_SIZE(obj);
1104+
}
1105+
else if (PyByteArray_Check(obj)) {
1106+
s = PyByteArray_AS_STRING(obj);
1107+
len = PyByteArray_GET_SIZE(obj);
1108+
}
11051109
else if (PyObject_AsCharBuffer(obj, &s, &len)) {
11061110
/* Overwrite the error message with something more useful in
11071111
case of a TypeError. */
11081112
if (PyErr_ExceptionMatches(PyExc_TypeError))
1109-
PyErr_Format(PyExc_TypeError,
1113+
PyErr_Format(PyExc_TypeError,
11101114
"coercing to Unicode: need string or buffer, "
11111115
"%.80s found",
11121116
Py_TYPE(obj)->tp_name);
@@ -1188,7 +1192,7 @@ PyObject *PyUnicode_Decode(const char *s,
11881192
goto onError;
11891193
if (!PyUnicode_Check(unicode)) {
11901194
PyErr_Format(PyExc_TypeError,
1191-
"decoder did not return an unicode object (type=%.400s)",
1195+
"decoder did not return a unicode object (type=%.400s)",
11921196
Py_TYPE(unicode)->tp_name);
11931197
Py_DECREF(unicode);
11941198
goto onError;
@@ -1225,6 +1229,37 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
12251229
return NULL;
12261230
}
12271231

1232+
PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1233+
const char *encoding,
1234+
const char *errors)
1235+
{
1236+
PyObject *v;
1237+
1238+
if (!PyUnicode_Check(unicode)) {
1239+
PyErr_BadArgument();
1240+
goto onError;
1241+
}
1242+
1243+
if (encoding == NULL)
1244+
encoding = PyUnicode_GetDefaultEncoding();
1245+
1246+
/* Decode via the codec registry */
1247+
v = PyCodec_Decode(unicode, encoding, errors);
1248+
if (v == NULL)
1249+
goto onError;
1250+
if (!PyUnicode_Check(v)) {
1251+
PyErr_Format(PyExc_TypeError,
1252+
"decoder did not return a unicode object (type=%.400s)",
1253+
Py_TYPE(v)->tp_name);
1254+
Py_DECREF(v);
1255+
goto onError;
1256+
}
1257+
return v;
1258+
1259+
onError:
1260+
return NULL;
1261+
}
1262+
12281263
PyObject *PyUnicode_Encode(const Py_UNICODE *s,
12291264
Py_ssize_t size,
12301265
const char *encoding,
@@ -1296,7 +1331,54 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
12961331
v = PyCodec_Encode(unicode, encoding, errors);
12971332
if (v == NULL)
12981333
goto onError;
1299-
assert(PyBytes_Check(v));
1334+
if (PyByteArray_Check(v)) {
1335+
char msg[100];
1336+
PyOS_snprintf(msg, sizeof(msg),
1337+
"encoder %s returned buffer instead of bytes",
1338+
encoding);
1339+
if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1340+
v = NULL;
1341+
goto onError;
1342+
}
1343+
v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1344+
}
1345+
else if (!PyBytes_Check(v)) {
1346+
PyErr_Format(PyExc_TypeError,
1347+
"encoder did not return a bytes object (type=%.400s)",
1348+
Py_TYPE(v)->tp_name);
1349+
v = NULL;
1350+
}
1351+
return v;
1352+
1353+
onError:
1354+
return NULL;
1355+
}
1356+
1357+
PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1358+
const char *encoding,
1359+
const char *errors)
1360+
{
1361+
PyObject *v;
1362+
1363+
if (!PyUnicode_Check(unicode)) {
1364+
PyErr_BadArgument();
1365+
goto onError;
1366+
}
1367+
1368+
if (encoding == NULL)
1369+
encoding = PyUnicode_GetDefaultEncoding();
1370+
1371+
/* Encode via the codec registry */
1372+
v = PyCodec_Encode(unicode, encoding, errors);
1373+
if (v == NULL)
1374+
goto onError;
1375+
if (!PyUnicode_Check(v)) {
1376+
PyErr_Format(PyExc_TypeError,
1377+
"encoder did not return an unicode object (type=%.400s)",
1378+
Py_TYPE(v)->tp_name);
1379+
Py_DECREF(v);
1380+
goto onError;
1381+
}
13001382
return v;
13011383

13021384
onError:
@@ -6617,7 +6699,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
66176699

66186700
if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
66196701
return NULL;
6620-
v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6702+
v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
66216703
if (v == NULL)
66226704
goto onError;
66236705
if (!PyBytes_Check(v)) {

Python/codecs.c

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,23 @@ PyObject *_PyCodec_Lookup(const char *encoding)
183183
return NULL;
184184
}
185185

186+
/* Codec registry encoding check API. */
187+
188+
int PyCodec_KnownEncoding(const char *encoding)
189+
{
190+
PyObject *codecs;
191+
192+
codecs = _PyCodec_Lookup(encoding);
193+
if (!codecs) {
194+
PyErr_Clear();
195+
return 0;
196+
}
197+
else {
198+
Py_DECREF(codecs);
199+
return 1;
200+
}
201+
}
202+
186203
static
187204
PyObject *args_tuple(PyObject *object,
188205
const char *errors)
@@ -344,32 +361,20 @@ PyObject *PyCodec_Encode(PyObject *object,
344361
"encoder must return a tuple (object, integer)");
345362
goto onError;
346363
}
347-
v = PyTuple_GET_ITEM(result, 0);
348-
if (PyByteArray_Check(v)) {
349-
char msg[100];
350-
PyOS_snprintf(msg, sizeof(msg),
351-
"encoder %s returned buffer instead of bytes",
352-
encoding);
353-
if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
354-
v = NULL;
355-
goto onError;
356-
}
357-
v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
358-
}
359-
else if (PyBytes_Check(v))
360-
Py_INCREF(v);
361-
else {
362-
PyErr_SetString(PyExc_TypeError,
363-
"encoding must return a tuple(bytes, integer)");
364-
v = NULL;
365-
}
364+
v = PyTuple_GET_ITEM(result,0);
365+
Py_INCREF(v);
366366
/* We don't check or use the second (integer) entry. */
367367

368+
Py_DECREF(args);
369+
Py_DECREF(encoder);
370+
Py_DECREF(result);
371+
return v;
372+
368373
onError:
369374
Py_XDECREF(result);
370375
Py_XDECREF(args);
371376
Py_XDECREF(encoder);
372-
return v;
377+
return NULL;
373378
}
374379

375380
/* Decode an object (usually a Python string) using the given encoding

Python/pythonrun.c

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -261,14 +261,10 @@ Py_InitializeEx(int install_sigs)
261261

262262
codeset = nl_langinfo(CODESET);
263263
if (codeset && *codeset) {
264-
PyObject *enc = PyCodec_Encoder(codeset);
265-
if (enc) {
266-
codeset = strdup(codeset);
267-
Py_DECREF(enc);
268-
} else {
269-
codeset = NULL;
270-
PyErr_Clear();
271-
}
264+
if (PyCodec_KnownEncoding(codeset))
265+
codeset = strdup(codeset);
266+
else
267+
codeset = NULL;
272268
} else
273269
codeset = NULL;
274270

0 commit comments

Comments
 (0)