Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 66d53fa

Browse files
Issue #16986: ElementTree now correctly parses string input even when the
internal XML encoding is something other than UTF-8 or US-ASCII.
1 parent 9e62d35 commit 66d53fa

5 files changed

Lines changed: 65 additions & 24 deletions

File tree

Include/pyexpat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ struct PyExpat_CAPI
4545
void (*SetUserData)(XML_Parser parser, void *userData);
4646
void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
4747
XML_StartDoctypeDeclHandler start);
48+
enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
4849
/* always add new stuff to the end! */
4950
};
5051

Lib/test/test_xml_etree.py

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -677,15 +677,18 @@ def test_writestring(self):
677677
elem = ET.fromstring("<html><body>text</body></html>")
678678
self.assertEqual(ET.tostring(elem), b'<html><body>text</body></html>')
679679

680-
def test_encoding(encoding):
681-
def check(encoding):
682-
ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding)
683-
check("ascii")
684-
check("us-ascii")
685-
check("iso-8859-1")
686-
check("iso-8859-15")
687-
check("cp437")
688-
check("mac-roman")
680+
def test_encoding(self):
681+
def check(encoding, body=''):
682+
xml = ("<?xml version='1.0' encoding='%s'?><xml>%s</xml>" %
683+
(encoding, body))
684+
self.assertEqual(ET.XML(xml.encode(encoding)).text, body)
685+
self.assertEqual(ET.XML(xml).text, body)
686+
check("ascii", 'a')
687+
check("us-ascii", 'a')
688+
check("iso-8859-1", '\xbd')
689+
check("iso-8859-15", '\u20ac')
690+
check("cp437", '\u221a')
691+
check("mac-roman", '\u02da')
689692

690693
def test_methods(self):
691694
# Test serialization methods.
@@ -1842,11 +1845,13 @@ def close(self):
18421845

18431846

18441847
class XMLParserTest(unittest.TestCase):
1845-
sample1 = '<file><line>22</line></file>'
1846-
sample2 = ('<!DOCTYPE html PUBLIC'
1847-
' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
1848-
' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
1849-
'<html>text</html>')
1848+
sample1 = b'<file><line>22</line></file>'
1849+
sample2 = (b'<!DOCTYPE html PUBLIC'
1850+
b' "-//W3C//DTD XHTML 1.0 Transitional//EN"'
1851+
b' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
1852+
b'<html>text</html>')
1853+
sample3 = ('<?xml version="1.0" encoding="iso-8859-1"?>\n'
1854+
'<money value="$\xa3\u20ac\U0001017b">$\xa3\u20ac\U0001017b</money>')
18501855

18511856
def _check_sample_element(self, e):
18521857
self.assertEqual(e.tag, 'file')
@@ -1882,12 +1887,21 @@ def doctype(self, name, pubid, system):
18821887
_doctype = (name, pubid, system)
18831888

18841889
parser = MyParserWithDoctype()
1885-
parser.feed(self.sample2)
1890+
with self.assertWarns(DeprecationWarning):
1891+
parser.feed(self.sample2)
18861892
parser.close()
18871893
self.assertEqual(_doctype,
18881894
('html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
18891895
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'))
18901896

1897+
def test_parse_string(self):
1898+
parser = ET.XMLParser(target=ET.TreeBuilder())
1899+
parser.feed(self.sample3)
1900+
e = parser.close()
1901+
self.assertEqual(e.tag, 'money')
1902+
self.assertEqual(e.attrib['value'], '$\xa3\u20ac\U0001017b')
1903+
self.assertEqual(e.text, '$\xa3\u20ac\U0001017b')
1904+
18911905

18921906
class NamespaceParseTest(unittest.TestCase):
18931907
def test_find_with_namespace(self):
@@ -2297,6 +2311,7 @@ def test_main(module=None):
22972311
ElementFindTest,
22982312
ElementIterTest,
22992313
TreeBuilderTest,
2314+
XMLParserTest,
23002315
BugsTest,
23012316
]
23022317

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ Core and Builtins
2424
Library
2525
-------
2626

27+
- Issue #16986: ElementTree now correctly parses a string input not only when
28+
an internal XML encoding is UTF-8 or US-ASCII.
29+
2730
- Issue #17812: Fixed quadratic complexity of base64.b32encode().
2831

2932
- Issue #17980: Fix possible abuse of ssl.match_hostname() for denial of

Modules/_elementtree.c

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3330,7 +3330,7 @@ xmlparser_dealloc(XMLParserObject* self)
33303330
}
33313331

33323332
LOCAL(PyObject*)
3333-
expat_parse(XMLParserObject* self, char* data, int data_len, int final)
3333+
expat_parse(XMLParserObject* self, const char* data, int data_len, int final)
33343334
{
33353335
int ok;
33363336

@@ -3376,16 +3376,37 @@ xmlparser_close(XMLParserObject* self, PyObject* args)
33763376
}
33773377

33783378
static PyObject*
3379-
xmlparser_feed(XMLParserObject* self, PyObject* args)
3379+
xmlparser_feed(XMLParserObject* self, PyObject* arg)
33803380
{
33813381
/* feed data to parser */
33823382

3383-
char* data;
3384-
int data_len;
3385-
if (!PyArg_ParseTuple(args, "s#:feed", &data, &data_len))
3386-
return NULL;
3387-
3388-
return expat_parse(self, data, data_len, 0);
3383+
if (PyUnicode_Check(arg)) {
3384+
Py_ssize_t data_len;
3385+
const char *data = PyUnicode_AsUTF8AndSize(arg, &data_len);
3386+
if (data == NULL)
3387+
return NULL;
3388+
if (data_len > INT_MAX) {
3389+
PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
3390+
return NULL;
3391+
}
3392+
/* Explicitly set UTF-8 encoding. Return code ignored. */
3393+
(void)EXPAT(SetEncoding)(self->parser, "utf-8");
3394+
return expat_parse(self, data, (int)data_len, 0);
3395+
}
3396+
else {
3397+
Py_buffer view;
3398+
PyObject *res;
3399+
if (PyObject_GetBuffer(arg, &view, PyBUF_SIMPLE) < 0)
3400+
return NULL;
3401+
if (view.len > INT_MAX) {
3402+
PyBuffer_Release(&view);
3403+
PyErr_SetString(PyExc_OverflowError, "size does not fit in an int");
3404+
return NULL;
3405+
}
3406+
res = expat_parse(self, view.buf, (int)view.len, 0);
3407+
PyBuffer_Release(&view);
3408+
return res;
3409+
}
33893410
}
33903411

33913412
static PyObject*
@@ -3570,7 +3591,7 @@ xmlparser_setevents(XMLParserObject *self, PyObject* args)
35703591
}
35713592

35723593
static PyMethodDef xmlparser_methods[] = {
3573-
{"feed", (PyCFunction) xmlparser_feed, METH_VARARGS},
3594+
{"feed", (PyCFunction) xmlparser_feed, METH_O},
35743595
{"close", (PyCFunction) xmlparser_close, METH_VARARGS},
35753596
{"_parse", (PyCFunction) xmlparser_parse, METH_VARARGS},
35763597
{"_setevents", (PyCFunction) xmlparser_setevents, METH_VARARGS},

Modules/pyexpat.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1937,6 +1937,7 @@ MODULE_INITFUNC(void)
19371937
capi.SetUnknownEncodingHandler = XML_SetUnknownEncodingHandler;
19381938
capi.SetUserData = XML_SetUserData;
19391939
capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
1940+
capi.SetEncoding = XML_SetEncoding;
19401941

19411942
/* export using capsule */
19421943
capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);

0 commit comments

Comments
 (0)