Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 447b6e3

Browse files
Issue #16986: ElementTree now correctly parses a string input not only when
an internal XML encoding is UTF-8 or US-ASCII.
2 parents 43e145b + 66d53fa commit 447b6e3

5 files changed

Lines changed: 65 additions & 24 deletions

File tree

Include/pyexpat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ struct PyExpat_CAPI
4545
void (*SetUserData)(XML_Parser parser, void *userData);
4646
void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
4747
XML_StartDoctypeDeclHandler start);
48+
enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
4849
/* always add new stuff to the end! */
4950
};
5051

Lib/test/test_xml_etree.py

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -668,15 +668,18 @@ def test_writestring(self):
668668
elem = ET.fromstring("<html><body>text</body></html>")
669669
self.assertEqual(ET.tostring(elem), b'<html><body>text</body></html>')
670670

671-
def test_encoding(encoding):
672-
def check(encoding):
673-
ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding)
674-
check("ascii")
675-
check("us-ascii")
676-
check("iso-8859-1")
677-
check("iso-8859-15")
678-
check("cp437")
679-
check("mac-roman")
671+
def test_encoding(self):
672+
def check(encoding, body=''):
673+
xml = ("<?xml version='1.0' encoding='%s'?><xml>%s</xml>" %
674+
(encoding, body))
675+
self.assertEqual(ET.XML(xml.encode(encoding)).text, body)
676+
self.assertEqual(ET.XML(xml).text, body)
677+
check("ascii", 'a')
678+
check("us-ascii", 'a')
679+
check("iso-8859-1", '\xbd')
680+
check("iso-8859-15", '\u20ac')
681+
check("cp437", '\u221a')
682+
check("mac-roman", '\u02da')
680683

681684
def test_methods(self):
682685
# Test serialization methods.
@@ -2002,11 +2005,13 @@ def close(self):
20022005

20032006

20042007
class XMLParserTest(unittest.TestCase):
2005-
sample1 = '<file><line>22</line></file>'
2006-
sample2 = ('<!DOCTYPE html PUBLIC'
2007-
' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
2008-
' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
2009-
'<html>text</html>')
2008+
sample1 = b'<file><line>22</line></file>'
2009+
sample2 = (b'<!DOCTYPE html PUBLIC'
2010+
b' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
2011+
b' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
2012+
b'<html>text</html>')
2013+
sample3 = ('<?xml version="1.0" encoding="iso-8859-1"?>\n'
2014+
'<money value="$\xa3\u20ac\U0001017b">$\xa3\u20ac\U0001017b</money>')
20102015

20112016
def _check_sample_element(self, e):
20122017
self.assertEqual(e.tag, 'file')
@@ -2042,12 +2047,21 @@ def doctype(self, name, pubid, system):
20422047
_doctype = (name, pubid, system)
20432048

20442049
parser = MyParserWithDoctype()
2045-
parser.feed(self.sample2)
2050+
with self.assertWarns(DeprecationWarning):
2051+
parser.feed(self.sample2)
20462052
parser.close()
20472053
self.assertEqual(_doctype,
20482054
('html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
20492055
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'))
20502056

2057+
def test_parse_string(self):
2058+
parser = ET.XMLParser(target=ET.TreeBuilder())
2059+
parser.feed(self.sample3)
2060+
e = parser.close()
2061+
self.assertEqual(e.tag, 'money')
2062+
self.assertEqual(e.attrib['value'], '$\xa3\u20ac\U0001017b')
2063+
self.assertEqual(e.text, '$\xa3\u20ac\U0001017b')
2064+
20512065

20522066
class NamespaceParseTest(unittest.TestCase):
20532067
def test_find_with_namespace(self):
@@ -2473,6 +2487,7 @@ def test_main(module=None):
24732487
ElementFindTest,
24742488
ElementIterTest,
24752489
TreeBuilderTest,
2490+
XMLParserTest,
24762491
BugsTest,
24772492
]
24782493

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ Core and Builtins
9696
Library
9797
-------
9898

99+
- Issue #16986: ElementTree now correctly parses a string input not only when
100+
an internal XML encoding is UTF-8 or US-ASCII.
101+
99102
- Issue #17996: socket module now exposes AF_LINK constant on BSD and OSX.
100103

101104
- Issue #17900: Allowed pickling of recursive OrderedDicts. Decreased pickled

Modules/_elementtree.c

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3288,7 +3288,7 @@ xmlparser_dealloc(XMLParserObject* self)
32883288
}
32893289

32903290
LOCAL(PyObject*)
3291-
expat_parse(XMLParserObject* self, char* data, int data_len, int final)
3291+
expat_parse(XMLParserObject* self, const char* data, int data_len, int final)
32923292
{
32933293
int ok;
32943294

@@ -3334,16 +3334,37 @@ xmlparser_close(XMLParserObject* self, PyObject* args)
33343334
}
33353335

33363336
static PyObject*
3337-
xmlparser_feed(XMLParserObject* self, PyObject* args)
3337+
xmlparser_feed(XMLParserObject* self, PyObject* arg)
33383338
{
33393339
/* feed data to parser */
33403340

3341-
char* data;
3342-
int data_len;
3343-
if (!PyArg_ParseTuple(args, "s#:feed", &data, &data_len))
3344-
return NULL;
3345-
3346-
return expat_parse(self, data, data_len, 0);
3341+
if (PyUnicode_Check(arg)) {
3342+
Py_ssize_t data_len;
3343+
const char *data = PyUnicode_AsUTF8AndSize(arg, &data_len);
3344+
if (data == NULL)
3345+
return NULL;
3346+
if (data_len > INT_MAX) {
3347+
PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
3348+
return NULL;
3349+
}
3350+
/* Explicitly set UTF-8 encoding. Return code ignored. */
3351+
(void)EXPAT(SetEncoding)(self->parser, "utf-8");
3352+
return expat_parse(self, data, (int)data_len, 0);
3353+
}
3354+
else {
3355+
Py_buffer view;
3356+
PyObject *res;
3357+
if (PyObject_GetBuffer(arg, &view, PyBUF_SIMPLE) < 0)
3358+
return NULL;
3359+
if (view.len > INT_MAX) {
3360+
PyBuffer_Release(&view);
3361+
PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
3362+
return NULL;
3363+
}
3364+
res = expat_parse(self, view.buf, (int)view.len, 0);
3365+
PyBuffer_Release(&view);
3366+
return res;
3367+
}
33473368
}
33483369

33493370
static PyObject*
@@ -3523,7 +3544,7 @@ xmlparser_setevents(XMLParserObject *self, PyObject* args)
35233544
}
35243545

35253546
static PyMethodDef xmlparser_methods[] = {
3526-
{"feed", (PyCFunction) xmlparser_feed, METH_VARARGS},
3547+
{"feed", (PyCFunction) xmlparser_feed, METH_O},
35273548
{"close", (PyCFunction) xmlparser_close, METH_VARARGS},
35283549
{"_parse_whole", (PyCFunction) xmlparser_parse_whole, METH_VARARGS},
35293550
{"_setevents", (PyCFunction) xmlparser_setevents, METH_VARARGS},

Modules/pyexpat.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1937,6 +1937,7 @@ MODULE_INITFUNC(void)
19371937
capi.SetUnknownEncodingHandler = XML_SetUnknownEncodingHandler;
19381938
capi.SetUserData = XML_SetUserData;
19391939
capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
1940+
capi.SetEncoding = XML_SetEncoding;
19401941

19411942
/* export using capsule */
19421943
capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);

0 commit comments

Comments
 (0)