Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 2a3d7db

Browse files
committed
Added character data buffering to pyexpat parser objects.
Setting the buffer_text attribute to true causes the parser to collect character data, waiting as long as possible to report it to the Python callback. This can save an enormous number of callbacks from C to Python, which can be a substantial performance improvement. buffer_text defaults to false.
1 parent 3e76d7f commit 2a3d7db

2 files changed

Lines changed: 251 additions & 21 deletions

File tree

Lib/test/test_pyexpat.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pyexpat
77
from xml.parsers import expat
88

9-
from test_support import sortdict
9+
from test_support import sortdict, TestFailed
1010

1111
class Outputter:
1212
def StartElementHandler(self, name, attrs):
@@ -218,3 +218,96 @@ def collector(name, *args):
218218
print "(it didn't)"
219219
print "L =", `L`
220220
break
221+
222+
# Tests of the buffer_text attribute.
223+
import sys
224+
225+
class TextCollector:
    """Handler object that records parser events as strings in ``self.stuff``.

    Each handler method appends one entry to ``self.stuff``: character data
    is stored verbatim, while element and comment events are stored as
    markup-like strings ("<name>", "</name>", "<!--data-->").
    """

    def __init__(self, parser):
        # Keep a reference to the parser so StartElementHandler can toggle
        # its buffer_text attribute without relying on a module-level
        # global that happens to be called `parser`.
        self.parser = parser
        self.stuff = []

    def check(self, expected, label):
        """Fail through require() unless the recorded events equal *expected*.

        *label* is the first line of the failure message; the recorded and
        expected lists follow it.  The expected items are passed through
        unicode() before being formatted into the message.
        """
        require(self.stuff == expected,
                "%s\nstuff = %s\nexpected = %s"
                % (label, repr(self.stuff), repr(map(unicode, expected))))

    def CharacterDataHandler(self, text):
        """Record a chunk of character data exactly as delivered."""
        self.stuff.append(text)

    def StartElementHandler(self, name, attrs):
        """Record "<name>" and honour an optional buffer-text attribute.

        buffer-text="yes" turns buffering on for the parser this collector
        was created with, buffer-text="no" turns it off; any other value
        (or no attribute at all) leaves the setting alone.
        """
        self.stuff.append("<%s>" % name)
        bt = attrs.get("buffer-text")
        if bt == "yes":
            self.parser.buffer_text = 1
        elif bt == "no":
            self.parser.buffer_text = 0

    def EndElementHandler(self, name):
        """Record "</name>" for a closing tag."""
        self.stuff.append("</%s>" % name)

    def CommentHandler(self, data):
        """Record a comment as "<!--data-->"."""
        self.stuff.append("<!--%s-->" % data)
250+
251+
def require(cond, label):
    """Raise TestFailed(label) when *cond* is false; otherwise do nothing.

    Similar to confirm(), but without any extraneous output.
    """
    if cond:
        return
    raise TestFailed(label)
255+
256+
def setup(handlers=()):
    """Create a buffering parser together with a TextCollector wired to it.

    handlers -- iterable of handler attribute names (for example
                "StartElementHandler") to install on the parser in
                addition to CharacterDataHandler, which is always
                installed.  Each name must exist on TextCollector.

    Returns a (parser, handler) pair.  Fails through require() if a
    freshly created parser reports buffer_text as already enabled.
    """
    parser = expat.ParserCreate()
    # Verify the documented default before switching buffering on.
    require(not parser.buffer_text,
            "buffer_text not disabled by default")
    parser.buffer_text = 1
    handler = TextCollector(parser)
    parser.CharacterDataHandler = handler.CharacterDataHandler
    for name in handlers:
        # Bind the collector's method of the same name as the parser callback.
        setattr(parser, name, getattr(handler, name))
    return parser, handler
266+
267+
# Only the character data handler is installed: the text on either side of
# the empty child elements is expected to arrive as a single chunk.
parser, handler = setup()
require(parser.buffer_text,
        "text buffering either not acknowledged or not enabled")
parser.Parse("<a>1<b/>2<c/>3</a>", 1)
handler.check(["123"], "buffered text not properly collapsed")

# The start-element handler switches buffering off at <b> and back on at
# <c> by way of the buffer-text attribute.
# XXX This test exposes more detail of Expat's text chunking than we
# XXX would like, but it concisely tests what we need.
parser, handler = setup(["StartElementHandler"])
parser.Parse(
    "<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1)
handler.check(
    ["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"],
    "buffering control not reacting as expected")

# Entity and character references are expected to be merged into the same
# chunk as the surrounding text.
parser, handler = setup()
parser.Parse("<a>1<b/>&lt;2&gt;<c/>&#32;\n&#x20;3</a>", 1)
handler.check(["1<2> \n 3"], "buffered text not properly collapsed")

# With a start-element handler installed, buffered text is expected to be
# reported before each start tag.
parser, handler = setup(["StartElementHandler"])
parser.Parse("<a>1<b/>2<c/>3</a>", 1)
handler.check(
    ["<a>", "1", "<b>", "2", "<c>", "3"],
    "buffered text not properly split")

# Removing the character data handler: only element events are expected.
parser, handler = setup(["StartElementHandler", "EndElementHandler"])
parser.CharacterDataHandler = None
parser.Parse("<a>1<b/>2<c/>3</a>", 1)
handler.check(
    ["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"],
    "huh?")

# Start and end handlers together: text is expected between the tag events.
parser, handler = setup(["StartElementHandler", "EndElementHandler"])
parser.Parse("<a>1<b></b>2<c/>3</a>", 1)
handler.check(
    ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"],
    "huh?")

# A comment handler is installed but the document contains no comments;
# note the trailing space after the root element.
parser, handler = setup(
    ["CommentHandler", "EndElementHandler", "StartElementHandler"])
parser.Parse("<a>1<b/>2<c></c>345</a> ", 1)
handler.check(
    ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"],
    "buffered text not properly split")

# Comments in the middle of the text are expected to split the buffered
# data at each comment.
parser, handler = setup(
    ["CommentHandler", "EndElementHandler", "StartElementHandler"])
parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1)
handler.check(
    ["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3",
     "<!--abc-->", "4", "<!--def-->", "5", "</a>"],
    "buffered text not properly split")

0 commit comments

Comments
 (0)