Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 6dc32b3

Browse files
committed
Issue #13612: handle unknown encodings without a buffer overflow.
This affects pyexpat and _elementtree. PyExpat_CAPI now exposes a new function - DefaultUnknownEncodingHandler. Based on a patch by Serhiy Storchaka.
1 parent 6b5a38c commit 6dc32b3

4 files changed

Lines changed: 123 additions & 74 deletions

File tree

Include/pyexpat.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#define PyExpat_CAPI_MAGIC "pyexpat.expat_CAPI 1.0"
77
#define PyExpat_CAPSULE_NAME "pyexpat.expat_CAPI"
88

9-
struct PyExpat_CAPI
9+
struct PyExpat_CAPI
1010
{
1111
char* magic; /* set to PyExpat_CAPI_MAGIC */
1212
int size; /* set to sizeof(struct PyExpat_CAPI) */
@@ -46,6 +46,8 @@ struct PyExpat_CAPI
4646
void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
4747
XML_StartDoctypeDeclHandler start);
4848
enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
49+
int (*DefaultUnknownEncodingHandler)(
50+
void *encodingHandlerData, const XML_Char *name, XML_Encoding *info);
4951
/* always add new stuff to the end! */
5052
};
5153

Lib/test/test_xml_etree.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,98 @@ def check(encoding, body=''):
690690
check("cp437", '\u221a')
691691
check("mac-roman", '\u02da')
692692

693+
def xml(encoding):
694+
return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
695+
def bxml(encoding):
696+
return xml(encoding).encode(encoding)
697+
supported_encodings = [
698+
'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
699+
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
700+
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
701+
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
702+
'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
703+
'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
704+
'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
705+
'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
706+
'cp1257', 'cp1258',
707+
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
708+
'mac-roman', 'mac-turkish',
709+
'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
710+
'iso2022-jp-3', 'iso2022-jp-ext',
711+
'koi8-r', 'koi8-u',
712+
'hz', 'ptcp154',
713+
]
714+
for encoding in supported_encodings:
715+
self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
716+
717+
unsupported_ascii_compatible_encodings = [
718+
'big5', 'big5hkscs',
719+
'cp932', 'cp949', 'cp950',
720+
'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
721+
'gb2312', 'gbk', 'gb18030',
722+
'iso2022-kr', 'johab',
723+
'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
724+
'utf-7',
725+
]
726+
for encoding in unsupported_ascii_compatible_encodings:
727+
self.assertRaises(ValueError, ET.XML, bxml(encoding))
728+
729+
unsupported_ascii_incompatible_encodings = [
730+
'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
731+
'utf_32', 'utf_32_be', 'utf_32_le',
732+
]
733+
for encoding in unsupported_ascii_incompatible_encodings:
734+
self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
735+
736+
self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
737+
self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
738+
739+
def xml(encoding):
740+
return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
741+
def bxml(encoding):
742+
return xml(encoding).encode(encoding)
743+
supported_encodings = [
744+
'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
745+
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
746+
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
747+
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
748+
'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
749+
'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
750+
'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
751+
'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
752+
'cp1257', 'cp1258',
753+
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
754+
'mac-roman', 'mac-turkish',
755+
'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
756+
'iso2022-jp-3', 'iso2022-jp-ext',
757+
'koi8-r', 'koi8-u',
758+
'hz', 'ptcp154',
759+
]
760+
for encoding in supported_encodings:
761+
self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
762+
763+
unsupported_ascii_compatible_encodings = [
764+
'big5', 'big5hkscs',
765+
'cp932', 'cp949', 'cp950',
766+
'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
767+
'gb2312', 'gbk', 'gb18030',
768+
'iso2022-kr', 'johab',
769+
'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
770+
'utf-7',
771+
]
772+
for encoding in unsupported_ascii_compatible_encodings:
773+
self.assertRaises(ValueError, ET.XML, bxml(encoding))
774+
775+
unsupported_ascii_incompatible_encodings = [
776+
'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
777+
'utf_32', 'utf_32_be', 'utf_32_le',
778+
]
779+
for encoding in unsupported_ascii_incompatible_encodings:
780+
self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
781+
782+
self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
783+
self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
784+
693785
def test_methods(self):
694786
# Test serialization methods.
695787

Modules/_elementtree.c

Lines changed: 1 addition & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -3136,47 +3136,6 @@ expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
31363136
}
31373137
}
31383138

3139-
static int
3140-
expat_unknown_encoding_handler(XMLParserObject *self, const XML_Char *name,
3141-
XML_Encoding *info)
3142-
{
3143-
PyObject* u;
3144-
unsigned char s[256];
3145-
int i;
3146-
void *data;
3147-
unsigned int kind;
3148-
3149-
memset(info, 0, sizeof(XML_Encoding));
3150-
3151-
for (i = 0; i < 256; i++)
3152-
s[i] = i;
3153-
3154-
u = PyUnicode_Decode((char*) s, 256, name, "replace");
3155-
if (!u)
3156-
return XML_STATUS_ERROR;
3157-
if (PyUnicode_READY(u))
3158-
return XML_STATUS_ERROR;
3159-
3160-
if (PyUnicode_GET_LENGTH(u) != 256) {
3161-
Py_DECREF(u);
3162-
return XML_STATUS_ERROR;
3163-
}
3164-
3165-
kind = PyUnicode_KIND(u);
3166-
data = PyUnicode_DATA(u);
3167-
for (i = 0; i < 256; i++) {
3168-
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
3169-
if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
3170-
info->map[i] = ch;
3171-
else
3172-
info->map[i] = -1;
3173-
}
3174-
3175-
Py_DECREF(u);
3176-
3177-
return XML_STATUS_OK;
3178-
}
3179-
31803139
/* -------------------------------------------------------------------- */
31813140

31823141
static PyObject *
@@ -3278,7 +3237,7 @@ xmlparser_init(PyObject *self, PyObject *args, PyObject *kwds)
32783237
);
32793238
EXPAT(SetUnknownEncodingHandler)(
32803239
self_xp->parser,
3281-
(XML_UnknownEncodingHandler) expat_unknown_encoding_handler, NULL
3240+
EXPAT(DefaultUnknownEncodingHandler), NULL
32823241
);
32833242

32843243
return 0;

Modules/pyexpat.c

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1111,53 +1111,49 @@ static struct PyMethodDef xmlparse_methods[] = {
11111111
Make it as simple as possible.
11121112
*/
11131113

1114-
static char template_buffer[257];
1115-
1116-
static void
1117-
init_template_buffer(void)
1118-
{
1119-
int i;
1120-
for (i = 0; i < 256; i++) {
1121-
template_buffer[i] = i;
1122-
}
1123-
template_buffer[256] = 0;
1124-
}
1125-
11261114
static int
11271115
PyUnknownEncodingHandler(void *encodingHandlerData,
11281116
const XML_Char *name,
11291117
XML_Encoding *info)
11301118
{
1131-
PyUnicodeObject *_u_string = NULL;
1132-
int result = 0;
1119+
static unsigned char template_buffer[256] = {0};
1120+
PyObject* u;
11331121
int i;
1134-
int kind;
11351122
void *data;
1123+
unsigned int kind;
11361124

1137-
/* Yes, supports only 8bit encodings */
1138-
_u_string = (PyUnicodeObject *)
1139-
PyUnicode_Decode(template_buffer, 256, name, "replace");
1125+
if (template_buffer[1] == 0) {
1126+
for (i = 0; i < 256; i++)
1127+
template_buffer[i] = i;
1128+
}
11401129

1141-
if (_u_string == NULL || PyUnicode_READY(_u_string) == -1)
1142-
return result;
1130+
u = PyUnicode_Decode((char*) template_buffer, 256, name, "replace");
1131+
if (u == NULL || PyUnicode_READY(u))
1132+
return XML_STATUS_ERROR;
11431133

1144-
kind = PyUnicode_KIND(_u_string);
1145-
data = PyUnicode_DATA(_u_string);
1134+
if (PyUnicode_GET_LENGTH(u) != 256) {
1135+
Py_DECREF(u);
1136+
PyErr_SetString(PyExc_ValueError,
1137+
"multi-byte encodings are not supported");
1138+
return XML_STATUS_ERROR;
1139+
}
11461140

1141+
kind = PyUnicode_KIND(u);
1142+
data = PyUnicode_DATA(u);
11471143
for (i = 0; i < 256; i++) {
1148-
/* Stupid to access directly, but fast */
1149-
Py_UCS4 c = PyUnicode_READ(kind, data, i);
1150-
if (c == Py_UNICODE_REPLACEMENT_CHARACTER)
1151-
info->map[i] = -1;
1144+
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
1145+
if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
1146+
info->map[i] = ch;
11521147
else
1153-
info->map[i] = c;
1148+
info->map[i] = -1;
11541149
}
1150+
11551151
info->data = NULL;
11561152
info->convert = NULL;
11571153
info->release = NULL;
1158-
result = 1;
1159-
Py_DECREF(_u_string);
1160-
return result;
1154+
Py_DECREF(u);
1155+
1156+
return XML_STATUS_OK;
11611157
}
11621158

11631159

@@ -1752,7 +1748,6 @@ MODULE_INITFUNC(void)
17521748
Py_BuildValue("(iii)", info.major,
17531749
info.minor, info.micro));
17541750
}
1755-
init_template_buffer();
17561751
/* XXX When Expat supports some way of figuring out how it was
17571752
compiled, this should check and set native_encoding
17581753
appropriately.
@@ -1938,6 +1933,7 @@ MODULE_INITFUNC(void)
19381933
capi.SetUserData = XML_SetUserData;
19391934
capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
19401935
capi.SetEncoding = XML_SetEncoding;
1936+
capi.DefaultUnknownEncodingHandler = PyUnknownEncodingHandler;
19411937

19421938
/* export using capsule */
19431939
capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);

0 commit comments

Comments
 (0)