Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 43536e9

Browse files
Issue #17089: Expat parser now works correctly with string input, not only when
the internal XML encoding is UTF-8 or US-ASCII. It now also accepts bytes and strings larger than 2 GiB.
1 parent 95b7110 commit 43536e9

3 files changed

Lines changed: 86 additions & 40 deletions

File tree

Lib/test/test_pyexpat.py

Lines changed: 43 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def test_specified_attributes(self):
5252
<sub2><![CDATA[contents of CDATA section]]></sub2>
5353
&external_entity;
5454
&skipped_entity;
55+
\xb5
5556
</root>
5657
'''
5758

@@ -195,13 +196,13 @@ def _verify_parse_output(self, operations):
195196
"End element: 'sub2'",
196197
"External entity ref: (None, 'entity.file', None)",
197198
('Skipped entity', ('skipped_entity', 0)),
199+
"Character data: '\xb5'",
198200
"End element: 'root'",
199201
]
200202
for operation, expected_operation in zip(operations, expected_operations):
201203
self.assertEqual(operation, expected_operation)
202204

203-
def test_unicode(self):
204-
# Try the parse again, this time producing Unicode output
205+
def test_parse_bytes(self):
205206
out = self.Outputter()
206207
parser = expat.ParserCreate(namespace_separator='!')
207208
self._hookup_callbacks(parser, out)
@@ -213,6 +214,16 @@ def test_unicode(self):
213214
# Issue #6697.
214215
self.assertRaises(AttributeError, getattr, parser, '\uD800')
215216

217+
def test_parse_str(self):
218+
out = self.Outputter()
219+
parser = expat.ParserCreate(namespace_separator='!')
220+
self._hookup_callbacks(parser, out)
221+
222+
parser.Parse(data.decode('iso-8859-1'), 1)
223+
224+
operations = out.out
225+
self._verify_parse_output(operations)
226+
216227
def test_parse_file(self):
217228
# Try parsing a file
218229
out = self.Outputter()
@@ -269,7 +280,7 @@ def collector(name, *args):
269280
L.append(name)
270281
p.StartElementHandler = collector
271282
p.EndElementHandler = collector
272-
p.Parse("<e> <e/> <e></e> </e>", 1)
283+
p.Parse(b"<e> <e/> <e></e> </e>", 1)
273284
tag = L[0]
274285
self.assertEqual(len(L), 6)
275286
for entry in L:
@@ -285,7 +296,7 @@ def __init__(self, parser):
285296

286297
def ExternalEntityRefHandler(self, context, base, sysId, pubId):
287298
external_parser = self.parser.ExternalEntityParserCreate("")
288-
self.parser_result = external_parser.Parse("", 1)
299+
self.parser_result = external_parser.Parse(b"", 1)
289300
return 1
290301

291302
parser = expat.ParserCreate(namespace_separator='!')
@@ -336,55 +347,55 @@ def test_default_to_disabled(self):
336347
def test_buffering_enabled(self):
337348
# Make sure buffering is turned on
338349
self.assertTrue(self.parser.buffer_text)
339-
self.parser.Parse("<a>1<b/>2<c/>3</a>", 1)
350+
self.parser.Parse(b"<a>1<b/>2<c/>3</a>", 1)
340351
self.assertEqual(self.stuff, ['123'],
341352
"buffered text not properly collapsed")
342353

343354
def test1(self):
344355
# XXX This test exposes more detail of Expat's text chunking than we
345356
# XXX like, but it tests what we need to concisely.
346357
self.setHandlers(["StartElementHandler"])
347-
self.parser.Parse("<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1)
358+
self.parser.Parse(b"<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1)
348359
self.assertEqual(self.stuff,
349360
["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"],
350361
"buffering control not reacting as expected")
351362

352363
def test2(self):
353-
self.parser.Parse("<a>1<b/>&lt;2&gt;<c/>&#32;\n&#x20;3</a>", 1)
364+
self.parser.Parse(b"<a>1<b/>&lt;2&gt;<c/>&#32;\n&#x20;3</a>", 1)
354365
self.assertEqual(self.stuff, ["1<2> \n 3"],
355366
"buffered text not properly collapsed")
356367

357368
def test3(self):
358369
self.setHandlers(["StartElementHandler"])
359-
self.parser.Parse("<a>1<b/>2<c/>3</a>", 1)
370+
self.parser.Parse(b"<a>1<b/>2<c/>3</a>", 1)
360371
self.assertEqual(self.stuff, ["<a>", "1", "<b>", "2", "<c>", "3"],
361372
"buffered text not properly split")
362373

363374
def test4(self):
364375
self.setHandlers(["StartElementHandler", "EndElementHandler"])
365376
self.parser.CharacterDataHandler = None
366-
self.parser.Parse("<a>1<b/>2<c/>3</a>", 1)
377+
self.parser.Parse(b"<a>1<b/>2<c/>3</a>", 1)
367378
self.assertEqual(self.stuff,
368379
["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"])
369380

370381
def test5(self):
371382
self.setHandlers(["StartElementHandler", "EndElementHandler"])
372-
self.parser.Parse("<a>1<b></b>2<c/>3</a>", 1)
383+
self.parser.Parse(b"<a>1<b></b>2<c/>3</a>", 1)
373384
self.assertEqual(self.stuff,
374385
["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"])
375386

376387
def test6(self):
377388
self.setHandlers(["CommentHandler", "EndElementHandler",
378389
"StartElementHandler"])
379-
self.parser.Parse("<a>1<b/>2<c></c>345</a> ", 1)
390+
self.parser.Parse(b"<a>1<b/>2<c></c>345</a> ", 1)
380391
self.assertEqual(self.stuff,
381392
["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"],
382393
"buffered text not properly split")
383394

384395
def test7(self):
385396
self.setHandlers(["CommentHandler", "EndElementHandler",
386397
"StartElementHandler"])
387-
self.parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1)
398+
self.parser.Parse(b"<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1)
388399
self.assertEqual(self.stuff,
389400
["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3",
390401
"<!--abc-->", "4", "<!--def-->", "5", "</a>"],
@@ -400,7 +411,7 @@ def test(self):
400411
parser = expat.ParserCreate()
401412
parser.StartElementHandler = self.StartElementHandler
402413
try:
403-
parser.Parse("<a><b><c/></b></a>", 1)
414+
parser.Parse(b"<a><b><c/></b></a>", 1)
404415
self.fail()
405416
except RuntimeError as e:
406417
self.assertEqual(e.args[0], 'a',
@@ -436,7 +447,7 @@ def test(self):
436447
self.expected_list = [('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2),
437448
('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)]
438449

439-
xml = '<a>\n <b>\n <c/>\n </b>\n</a>'
450+
xml = b'<a>\n <b>\n <c/>\n </b>\n</a>'
440451
self.parser.Parse(xml, 1)
441452

442453

@@ -457,7 +468,7 @@ def handler(text):
457468
parser = expat.ParserCreate()
458469
parser.CharacterDataHandler = handler
459470

460-
self.assertRaises(Exception, parser.Parse, xml)
471+
self.assertRaises(Exception, parser.Parse, xml.encode('iso8859'))
461472

462473
class ChardataBufferTest(unittest.TestCase):
463474
"""
@@ -480,8 +491,8 @@ def f(size):
480491
self.assertRaises(ValueError, f, 0)
481492

482493
def test_unchanged_size(self):
483-
xml1 = ("<?xml version='1.0' encoding='iso8859'?><s>%s" % ('a' * 512))
484-
xml2 = 'a'*512 + '</s>'
494+
xml1 = b"<?xml version='1.0' encoding='iso8859'?><s>" + b'a' * 512
495+
xml2 = b'a'*512 + b'</s>'
485496
parser = expat.ParserCreate()
486497
parser.CharacterDataHandler = self.counting_handler
487498
parser.buffer_size = 512
@@ -503,9 +514,9 @@ def test_unchanged_size(self):
503514

504515

505516
def test_disabling_buffer(self):
506-
xml1 = "<?xml version='1.0' encoding='iso8859'?><a>%s" % ('a' * 512)
507-
xml2 = ('b' * 1024)
508-
xml3 = "%s</a>" % ('c' * 1024)
517+
xml1 = b"<?xml version='1.0' encoding='iso8859'?><a>" + b'a' * 512
518+
xml2 = b'b' * 1024
519+
xml3 = b'c' * 1024 + b'</a>';
509520
parser = expat.ParserCreate()
510521
parser.CharacterDataHandler = self.counting_handler
511522
parser.buffer_text = 1
@@ -532,16 +543,11 @@ def test_disabling_buffer(self):
532543
parser.Parse(xml3, 1)
533544
self.assertEqual(self.n, 12)
534545

535-
536-
537-
def make_document(self, bytes):
538-
return ("<?xml version='1.0'?><tag>" + bytes * 'a' + '</tag>')
539-
540546
def counting_handler(self, text):
541547
self.n += 1
542548

543549
def small_buffer_test(self, buffer_len):
544-
xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * buffer_len)
550+
xml = b"<?xml version='1.0' encoding='iso8859'?><s>" + b'a' * buffer_len + b'</s>'
545551
parser = expat.ParserCreate()
546552
parser.CharacterDataHandler = self.counting_handler
547553
parser.buffer_size = 1024
@@ -552,8 +558,8 @@ def small_buffer_test(self, buffer_len):
552558
return self.n
553559

554560
def test_change_size_1(self):
555-
xml1 = "<?xml version='1.0' encoding='iso8859'?><a><s>%s" % ('a' * 1024)
556-
xml2 = "aaa</s><s>%s</s></a>" % ('a' * 1025)
561+
xml1 = b"<?xml version='1.0' encoding='iso8859'?><a><s>" + b'a' * 1024
562+
xml2 = b'aaa</s><s>' + b'a' * 1025 + b'</s></a>'
557563
parser = expat.ParserCreate()
558564
parser.CharacterDataHandler = self.counting_handler
559565
parser.buffer_text = 1
@@ -568,8 +574,8 @@ def test_change_size_1(self):
568574
self.assertEqual(self.n, 2)
569575

570576
def test_change_size_2(self):
571-
xml1 = "<?xml version='1.0' encoding='iso8859'?><a>a<s>%s" % ('a' * 1023)
572-
xml2 = "aaa</s><s>%s</s></a>" % ('a' * 1025)
577+
xml1 = b"<?xml version='1.0' encoding='iso8859'?><a>a<s>" + b'a' * 1023
578+
xml2 = b'aaa</s><s>' + b'a' * 1025 + b'</s></a>'
573579
parser = expat.ParserCreate()
574580
parser.CharacterDataHandler = self.counting_handler
575581
parser.buffer_text = 1
@@ -585,7 +591,7 @@ def test_change_size_2(self):
585591

586592
class MalformedInputTest(unittest.TestCase):
587593
def test1(self):
588-
xml = "\0\r\n"
594+
xml = b"\0\r\n"
589595
parser = expat.ParserCreate()
590596
try:
591597
parser.Parse(xml, True)
@@ -594,7 +600,8 @@ def test1(self):
594600
self.assertEqual(str(e), 'unclosed token: line 2, column 0')
595601

596602
def test2(self):
597-
xml = "<?xml version\xc2\x85='1.0'?>\r\n"
603+
# \xc2\x85 is UTF-8 encoded U+0085 (NEXT LINE)
604+
xml = b"<?xml version\xc2\x85='1.0'?>\r\n"
598605
parser = expat.ParserCreate()
599606
try:
600607
parser.Parse(xml, True)
@@ -609,7 +616,7 @@ def test_codes(self):
609616
errors.messages[errors.codes[errors.XML_ERROR_SYNTAX]])
610617

611618
def test_expaterror(self):
612-
xml = '<'
619+
xml = b'<'
613620
parser = expat.ParserCreate()
614621
try:
615622
parser.Parse(xml, True)
@@ -638,7 +645,7 @@ def resolve_entity(context, base, system_id, public_id):
638645
parser.UseForeignDTD(True)
639646
parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
640647
parser.ExternalEntityRefHandler = resolve_entity
641-
parser.Parse("<?xml version='1.0'?><element/>")
648+
parser.Parse(b"<?xml version='1.0'?><element/>")
642649
self.assertEqual(handler_call_args, [(None, None)])
643650

644651
# test UseForeignDTD() is equal to UseForeignDTD(True)
@@ -648,7 +655,7 @@ def resolve_entity(context, base, system_id, public_id):
648655
parser.UseForeignDTD()
649656
parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
650657
parser.ExternalEntityRefHandler = resolve_entity
651-
parser.Parse("<?xml version='1.0'?><element/>")
658+
parser.Parse(b"<?xml version='1.0'?><element/>")
652659
self.assertEqual(handler_call_args, [(None, None)])
653660

654661
def test_ignore_use_foreign_dtd(self):
@@ -667,7 +674,7 @@ def resolve_entity(context, base, system_id, public_id):
667674
parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
668675
parser.ExternalEntityRefHandler = resolve_entity
669676
parser.Parse(
670-
"<?xml version='1.0'?><!DOCTYPE foo PUBLIC 'bar' 'baz'><element/>")
677+
b"<?xml version='1.0'?><!DOCTYPE foo PUBLIC 'bar' 'baz'><element/>")
671678
self.assertEqual(handler_call_args, [("bar", "baz")])
672679

673680

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,10 @@ Core and Builtins
212212
Library
213213
-------
214214

215+
- Issue #17089: Expat parser now works correctly with string input, not only when
216+
the internal XML encoding is UTF-8 or US-ASCII. It now also accepts bytes and
217+
strings larger than 2 GiB.
218+
215219
- Issue #16903: Popen.communicate() on Unix now accepts strings when
216220
universal_newlines is true as on Windows.
217221

Modules/pyexpat.c

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -777,17 +777,52 @@ PyDoc_STRVAR(xmlparse_Parse__doc__,
777777
"Parse(data[, isfinal])\n\
778778
Parse XML data. `isfinal' should be true at end of input.");
779779

780+
#define MAX_CHUNK_SIZE (1 << 20)
781+
780782
static PyObject *
781783
xmlparse_Parse(xmlparseobject *self, PyObject *args)
782784
{
783-
char *s;
784-
int slen;
785+
PyObject *data;
785786
int isFinal = 0;
787+
const char *s;
788+
Py_ssize_t slen;
789+
Py_buffer view;
790+
int rc;
786791

787-
if (!PyArg_ParseTuple(args, "s#|i:Parse", &s, &slen, &isFinal))
792+
if (!PyArg_ParseTuple(args, "O|i:Parse", &data, &isFinal))
788793
return NULL;
789794

790-
return get_parse_result(self, XML_Parse(self->itself, s, slen, isFinal));
795+
if (PyUnicode_Check(data)) {
796+
PyObject *bytes;
797+
bytes = PyUnicode_AsUTF8String(data);
798+
if (bytes == NULL)
799+
return NULL;
800+
view.buf = NULL;
801+
s = PyBytes_AS_STRING(bytes);
802+
slen = PyBytes_GET_SIZE(bytes);
803+
/* Explicitly set UTF-8 encoding. Return code ignored. */
804+
(void)XML_SetEncoding(self->itself, "utf-8");
805+
}
806+
else {
807+
if (PyObject_GetBuffer(data, &view, PyBUF_SIMPLE) < 0)
808+
return NULL;
809+
s = view.buf;
810+
slen = view.len;
811+
}
812+
813+
while (slen > MAX_CHUNK_SIZE) {
814+
rc = XML_Parse(self->itself, s, MAX_CHUNK_SIZE, 0);
815+
if (!rc)
816+
goto done;
817+
s += MAX_CHUNK_SIZE;
818+
slen -= MAX_CHUNK_SIZE;
819+
}
820+
rc = XML_Parse(self->itself, s, slen, isFinal);
821+
822+
done:
823+
if (view.buf != NULL)
824+
PyBuffer_Release(&view);
825+
return get_parse_result(self, rc);
791826
}
792827

793828
/* File reading copied from cPickle */

0 commit comments

Comments
 (0)