From 324d5af129ed852bd6867f6becd8c1658843ed86 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Wed, 24 May 2023 23:09:54 +0000 Subject: [PATCH 01/15] SL-19707 - use iterators instead of recursion to generate XML This will allow us to handle much deeper hierarchies. Also, reduce the number of function calls required to render values for performance. --- llsd/serde_xml.py | 167 ++++++++++++++++++++++++++-------------------- tests/bench.py | 34 ++++++++++ 2 files changed, 128 insertions(+), 73 deletions(-) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 7dfeaa2..61ebe08 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -37,20 +37,6 @@ class LLSDXMLFormatter(LLSDBaseFormatter): this class since the module level format_xml() is the most convenient interface to this functionality. """ - def _elt(self, name, contents=None): - """ - Serialize a single element. - - If 'contents' is omitted, write . - If 'contents' is bytes, write contents. - If 'contents' is str, write contents.encode('utf8'). 
- """ - if not contents: - self.stream.writelines([b"<", name, b" />"]) - else: - self.stream.writelines([b"<", name, b">", - _str_to_bytes(contents), - b""]) def xml_esc(self, v): "Escape string or unicode object v for xml output" @@ -70,53 +56,35 @@ def xml_esc(self, v): return v.replace(b'&',b'&').replace(b'<',b'<').replace(b'>',b'>') def _LLSD(self, v): - return self._generate(v.thing) + raise LLSDSerializationError("We should never end up here") def _UNDEF(self, _v): - return self._elt(b'undef') + return b'' def _BOOLEAN(self, v): if v: - return self._elt(b'boolean', b'true') + return b'true' else: - return self._elt(b'boolean', b'false') + return b'false' def _INTEGER(self, v): - return self._elt(b'integer', str(v)) + return b'' + _str_to_bytes(str(v)) + b'' def _REAL(self, v): - return self._elt(b'real', repr(v)) + return b'' + _str_to_bytes(str(v)) + b'' def _UUID(self, v): if v.int == 0: - return self._elt(b'uuid') + return b'' else: - return self._elt(b'uuid', str(v)) + return b'' + _str_to_bytes(str(v)) + b'' def _BINARY(self, v): - return self._elt(b'binary', base64.b64encode(v).strip()) + return b'' + base64.b64encode(v).strip() + b'' def _STRING(self, v): - return self._elt(b'string', self.xml_esc(v)) + return b'' + self.xml_esc(v) + b'' def _URI(self, v): - return self._elt(b'uri', self.xml_esc(str(v))) + return b'' + self.xml_esc(v) + b'' def _DATE(self, v): - return self._elt(b'date', _format_datestr(v)) + return b'' + _format_datestr(v) + b'' def _ARRAY(self, v): - self.stream.write(b'') - for item in v: - self._generate(item) - self.stream.write(b'') + raise LLSDSerializationError("We should never end up here") def _MAP(self, v): - self.stream.write(b'') - for key, value in v.items(): - self._elt(b'key', self.xml_esc(UnicodeType(key))) - self._generate(value) - self.stream.write(b'') - - def _generate(self, something): - "Generate xml from a single python object." 
- t = type(something) - if t in self.type_map: - return self.type_map[t](something) - elif isinstance(something, _LLSD): - return self.type_map[_LLSD](something) - else: - raise LLSDSerializationError( - "Cannot serialize unknown type: %s (%s)" % (t, something)) + raise LLSDSerializationError("We should never end up here") def _write(self, something): """ @@ -126,7 +94,36 @@ def _write(self, something): """ self.stream.write(b'' b'') - self._generate(something) + + iter_stack = [(iter([something]), b"")] + while True: + cur_iter, iter_type = iter_stack[-1] + try: + item = next(cur_iter) + if iter_type == b"map": + self.stream.write(b'' + _str_to_bytes(self.xml_esc(UnicodeType(item[0]))) + b'') + item = item[1] + if isinstance(item, _LLSD): + item = item.thing + t = type(item) + if not t in self.type_map: + raise LLSDSerializationError( + "Cannot serialize unknown type: %s (%s)" % (t, item)) + tf = self.type_map[t] + + if tf == self._MAP: + self.stream.write(b'') + iter_stack.append((iter(item.items()), b"map")) + elif tf == self._ARRAY: + self.stream.write(b'') + iter_stack.append((iter(item), b"array")) + else: + self.stream.write(tf(item)) + except StopIteration: + self.stream.write(b'') + iter_stack.pop() + if len(iter_stack) == 1: + break self.stream.write(b'') @@ -161,40 +158,64 @@ def _indent(self): def _ARRAY(self, v): "Recursively format an array with pretty turned on." - self.stream.write(b'\n') - self._indent_level += 1 - for item in v: - self._indent() - self._generate(item) - self.stream.write(b'\n') - self._indent_level -= 1 - self._indent() - self.stream.write(b'') + raise LLSDSerializationError("We should never end up here") def _MAP(self, v): "Recursively format a map with pretty turned on." 
- self.stream.write(b'\n') - self._indent_level += 1 - # sorted list of keys - for key in sorted(v): - self._indent() - self._elt(b'key', UnicodeType(key)) - self.stream.write(b'\n') - self._indent() - self._generate(v[key]) - self.stream.write(b'\n') - self._indent_level -= 1 - self._indent() - self.stream.write(b'') + raise LLSDSerializationError("We should never end up here") def _write(self, something): """ - Serialize a python object to self.stream as 'pretty' application/llsd+xml. + Serialize a python object to self.stream as application/llsd+xml. + + :param something: A python object (typically a dict) to be serialized. - :param something: a python object (typically a dict) to be serialized. + NOTE: This is nearly identical to the above _write with the exception + that this one includes newlines and indentation. Doing something clever + for the above may decrease performance for the common case, so it's been + split out. We can probably revisit this, though. """ - self.stream.write(b'\n') - self._generate(something) + self.stream.write(b'\n' + b'\n') + + iter_stack = [(iter([something]), b"")] + while True: + cur_iter, iter_type = iter_stack[-1] + try: + item = next(cur_iter) + if iter_type == b"map": + self._indent() + self.stream.write(b'' + _str_to_bytes(self.xml_esc(UnicodeType(item[0]))) + b'\n') + item = item[1] + if isinstance(item, _LLSD): + item = item.thing + t = type(item) + if not t in self.type_map: + raise LLSDSerializationError( + "Cannot serialize unknown type: %s (%s)" % (t, item)) + tf = self.type_map[t] + + if tf == self._MAP: + self._indent() + self.stream.write(b'\n') + self._indent_level += 1 + iter_stack.append((iter(item.items()), b"map")) + elif tf == self._ARRAY: + self._indent() + self.stream.write(b'\n') + self._indent_level += 1 + iter_stack.append((iter(item), b"array")) + else: + self._indent() + self.stream.write(tf(item)) + self.stream.write(b'\n') + except StopIteration: + self._indent_level -= 1 + self._indent() + 
self.stream.write(b'\n') + iter_stack.pop() + if len(iter_stack) == 1: + break self.stream.write(b'\n') diff --git a/tests/bench.py b/tests/bench.py index f907997..cad1b0f 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -45,6 +45,9 @@ """ _bench_data = llsd.parse_xml(BENCH_DATA_XML) + + + BENCH_DATA_BINARY = llsd.format_binary(_bench_data) BENCH_DATA_NOTATION = llsd.format_notation(_bench_data) @@ -78,6 +81,31 @@ def binary_stream(): f.seek(0) yield f +def build_deep_xml(): + + deep_data = {} + curr_data = deep_data + for i in range(250): + curr_data["curr_data"] = {} + curr_data["integer"] = 7 + curr_data["string"] = "string" + curr_data = curr_data["curr_data"] + + return deep_data +_deep_bench_data = build_deep_xml() + +def build_wide_xml(): + wide_xml = b""" +wide_array" +""" + + for i in range(100000): + wide_xml += b""" + 5000""" + wide_xml += b"" + + return llsd.parse_xml(wide_xml) +_wide_bench_data = build_wide_xml() def bench_stream(parse, stream): ret = parse(stream) @@ -125,3 +153,9 @@ def test_format_notation(benchmark): def test_format_binary(benchmark): benchmark(llsd.format_binary, _bench_data) + +def test_format_xml_deep(benchmark): + benchmark(llsd.format_xml, _deep_bench_data) + +def test_format_xml_wide(benchmark): + benchmark(llsd.format_xml, _wide_bench_data) From 3117e20c8c07b5924d2b1aafc1507b25d8e6f5fd Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Wed, 24 May 2023 23:18:00 +0000 Subject: [PATCH 02/15] Increase depth test --- tests/bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/bench.py b/tests/bench.py index cad1b0f..10cde99 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -85,7 +85,7 @@ def build_deep_xml(): deep_data = {} curr_data = deep_data - for i in range(250): + for i in range(1000): curr_data["curr_data"] = {} curr_data["integer"] = 7 curr_data["string"] = "string" From bcb42027394fcfd27aa6fbf527795deefced6ff9 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Thu, 25 May 2023 03:12:20 
+0000 Subject: [PATCH 03/15] Speed optimizations --- llsd/serde_xml.py | 18 +++++++++--------- tests/bench.py | 3 ++- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 61ebe08..84bab9a 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -65,14 +65,14 @@ def _BOOLEAN(self, v): else: return b'false' def _INTEGER(self, v): - return b'' + _str_to_bytes(str(v)) + b'' + return b'' + str(v).encode() + b'' def _REAL(self, v): - return b'' + _str_to_bytes(str(v)) + b'' + return b'' + str(v).encode() + b'' def _UUID(self, v): if v.int == 0: return b'' else: - return b'' + _str_to_bytes(str(v)) + b'' + return b'' + str(v).encode() + b'' def _BINARY(self, v): return b'' + base64.b64encode(v).strip() + b'' def _STRING(self, v): @@ -95,14 +95,14 @@ def _write(self, something): self.stream.write(b'' b'') - iter_stack = [(iter([something]), b"")] + iter_stack = [(iter([something]), b"", None)] while True: - cur_iter, iter_type = iter_stack[-1] + cur_iter, iter_type, iterable = iter_stack[-1] try: item = next(cur_iter) if iter_type == b"map": - self.stream.write(b'' + _str_to_bytes(self.xml_esc(UnicodeType(item[0]))) + b'') - item = item[1] + self.stream.write(b'' + self.xml_esc(UnicodeType(item)) + b'') + item = iterable[item] if isinstance(item, _LLSD): item = item.thing t = type(item) @@ -113,10 +113,10 @@ def _write(self, something): if tf == self._MAP: self.stream.write(b'') - iter_stack.append((iter(item.items()), b"map")) + iter_stack.append((iter(list(item)), b"map", item)) elif tf == self._ARRAY: self.stream.write(b'') - iter_stack.append((iter(item), b"array")) + iter_stack.append((iter(item), b"array", None)) else: self.stream.write(tf(item)) except StopIteration: diff --git a/tests/bench.py b/tests/bench.py index 10cde99..882e03d 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -85,10 +85,11 @@ def build_deep_xml(): deep_data = {} curr_data = deep_data - for i in range(1000): + for i in range(250): 
curr_data["curr_data"] = {} curr_data["integer"] = 7 curr_data["string"] = "string" + curr_data["map"] = { "item1": 2.345, "item2": [1,2,3], "item3": {"item4": llsd.uri("http://foo.bar.com")}} curr_data = curr_data["curr_data"] return deep_data From 3a2106e9ec17c37f4b43ad0f9bc5edb6b5ea717d Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Thu, 25 May 2023 18:42:55 +0000 Subject: [PATCH 04/15] SL-19707 - use python3 translate functionality for escaping strings --- llsd/serde_xml.py | 146 ++++++++++++++++++++++++++++------------------ tests/bench.py | 11 ++-- 2 files changed, 94 insertions(+), 63 deletions(-) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 84bab9a..636384b 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -5,12 +5,24 @@ from llsd.base import (_LLSD, ALL_CHARS, LLSDBaseParser, LLSDBaseFormatter, XML_HEADER, LLSDParseError, LLSDSerializationError, UnicodeType, - _format_datestr, _str_to_bytes, _to_python, is_unicode) + _format_datestr, _str_to_bytes, _to_python, is_unicode, PY2) from llsd.fastest_elementtree import ElementTreeError, fromstring, parse as _parse INVALID_XML_BYTES = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c'\ b'\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18'\ b'\x19\x1a\x1b\x1c\x1d\x1e\x1f' + +XML_ESC_TRANS = {} +if not PY2: + XML_ESC_TRANS = str.maketrans({'&': '&', + '<':'<', + '>':'>', + u'\uffff':None, # cannot be parsed + u'\ufffe':None}) # cannot be parsed + + for x in INVALID_XML_BYTES: + XML_ESC_TRANS[x] = None + INVALID_XML_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]') @@ -25,6 +37,24 @@ def remove_invalid_xml_bytes(b): # unit tests) return INVALID_XML_RE.sub('', b) +def xml_esc(v): + "Escape string or unicode object v for xml output" + + # Use is_unicode() instead of is_string() because in python 2, str is + # bytes, not unicode, and should not be "encode()"d. 
Attempts to + # encode("utf-8") a bytes type will result in an implicit + # decode("ascii") that will throw a UnicodeDecodeError if the string + # contains non-ascii characters. + if is_unicode(v): + # we need to drop these invalid characters because they + # cannot be parsed (and encode() doesn't drop them for us) + v = v.replace(u'\uffff', u'') + v = v.replace(u'\ufffe', u'') + v = v.encode('utf-8') + v = remove_invalid_xml_bytes(v) + return v.replace(b'&',b'&').replace(b'<',b'<').replace(b'>',b'>') + + class LLSDXMLFormatter(LLSDBaseFormatter): """ @@ -38,23 +68,12 @@ class LLSDXMLFormatter(LLSDBaseFormatter): interface to this functionality. """ - def xml_esc(self, v): - "Escape string or unicode object v for xml output" - - # Use is_unicode() instead of is_string() because in python 2, str is - # bytes, not unicode, and should not be "encode()"d. Attempts to - # encode("utf-8") a bytes type will result in an implicit - # decode("ascii") that will throw a UnicodeDecodeError if the string - # contains non-ascii characters. - if is_unicode(v): - # we need to drop these invalid characters because they - # cannot be parsed (and encode() doesn't drop them for us) - v = v.replace(u'\uffff', u'') - v = v.replace(u'\ufffe', u'') - v = v.encode('utf-8') - v = remove_invalid_xml_bytes(v) - return v.replace(b'&',b'&').replace(b'<',b'<').replace(b'>',b'>') - + def __init__(self, indent_atom = None): + "Construct a pretty serializer." 
+ # Call the super class constructor so that we have the type map + super(LLSDXMLFormatter, self).__init__() + self.py2 = PY2 + def _LLSD(self, v): raise LLSDSerializationError("We should never end up here") def _UNDEF(self, _v): @@ -62,23 +81,28 @@ def _UNDEF(self, _v): def _BOOLEAN(self, v): if v: return b'true' - else: - return b'false' + return b'false' def _INTEGER(self, v): - return b'' + str(v).encode() + b'' + return b'' + str(v).encode('utf-8') + b'' def _REAL(self, v): - return b'' + str(v).encode() + b'' + return b'' + str(v).encode('utf-8') + b'' def _UUID(self, v): if v.int == 0: return b'' else: - return b'' + str(v).encode() + b'' + return b'' + str(v).encode('utf-8') + b'' def _BINARY(self, v): return b'' + base64.b64encode(v).strip() + b'' def _STRING(self, v): - return b'' + self.xml_esc(v) + b'' + if self.py2: + return b'' + _str_to_bytes(xml_esc(v)) + b'' + else: + return b'' + v.translate(XML_ESC_TRANS).encode('utf-8') + b'' def _URI(self, v): - return b'' + self.xml_esc(v) + b'' + if self.py2: + return b'' + _str_to_bytes(xml_esc(v)) + b'' + else: + return b'' + UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8') + b'' def _DATE(self, v): return b'' + _format_datestr(v) + b'' def _ARRAY(self, v): @@ -97,28 +121,38 @@ def _write(self, something): iter_stack = [(iter([something]), b"", None)] while True: - cur_iter, iter_type, iterable = iter_stack[-1] + cur_iter, iter_type, iterable_obj = iter_stack[-1] try: item = next(cur_iter) if iter_type == b"map": - self.stream.write(b'' + self.xml_esc(UnicodeType(item)) + b'') - item = iterable[item] + + if self.py2: + self.stream.write(b'' + + _str_to_bytes(xml_esc(UnicodeType(item))) + + b'') + else: + # fair performance improvement by explicitly doing the + # translate for py3 instead of calling xml_esc + self.stream.write(b'' + + UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8') + + b'') + item = iterable_obj[item] if isinstance(item, _LLSD): item = item.thing - t = type(item) - if not t 
in self.type_map: + item_type = type(item) + if not item_type in self.type_map: raise LLSDSerializationError( - "Cannot serialize unknown type: %s (%s)" % (t, item)) - tf = self.type_map[t] + "Cannot serialize unknown type: %s (%s)" % (item_type, item)) + tfunction = self.type_map[item_type] - if tf == self._MAP: + if tfunction == self._MAP: self.stream.write(b'') iter_stack.append((iter(list(item)), b"map", item)) - elif tf == self._ARRAY: + elif tfunction == self._ARRAY: self.stream.write(b'') iter_stack.append((iter(item), b"array", None)) else: - self.stream.write(tf(item)) + self.stream.write(tfunction(item)) except StopIteration: self.stream.write(b'') iter_stack.pop() @@ -156,14 +190,6 @@ def _indent(self): "Write an indentation based on the atom and indentation level." self.stream.writelines([self._indent_atom] * self._indent_level) - def _ARRAY(self, v): - "Recursively format an array with pretty turned on." - raise LLSDSerializationError("We should never end up here") - - def _MAP(self, v): - "Recursively format a map with pretty turned on." - raise LLSDSerializationError("We should never end up here") - def _write(self, something): """ Serialize a python object to self.stream as application/llsd+xml. 
@@ -178,36 +204,44 @@ def _write(self, something): self.stream.write(b'\n' b'\n') - iter_stack = [(iter([something]), b"")] + iter_stack = [(iter([something]), b"", None)] while True: - cur_iter, iter_type = iter_stack[-1] + cur_iter, iter_type, iterable_obj = iter_stack[-1] try: item = next(cur_iter) if iter_type == b"map": self._indent() - self.stream.write(b'' + _str_to_bytes(self.xml_esc(UnicodeType(item[0]))) + b'\n') - item = item[1] + if self.py2: + self.stream.write(b'' + + _str_to_bytes(xml_esc(UnicodeType(item))) + + b'') + else: + # calling translate directly is a bit faster + self.stream.write(b'' + + UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8') + + b'\n') + item = iterable_obj[item] if isinstance(item, _LLSD): item = item.thing - t = type(item) - if not t in self.type_map: + item_type = type(item) + if not item_type in self.type_map: raise LLSDSerializationError( - "Cannot serialize unknown type: %s (%s)" % (t, item)) - tf = self.type_map[t] + "Cannot serialize unknown type: %s (%s)" % (item_type, item)) + tfunction = self.type_map[item_type] - if tf == self._MAP: + if tfunction == self._MAP: self._indent() self.stream.write(b'\n') self._indent_level += 1 - iter_stack.append((iter(item.items()), b"map")) - elif tf == self._ARRAY: + iter_stack.append((iter(list(item)), b"map", item)) + elif tfunction == self._ARRAY: self._indent() self.stream.write(b'\n') self._indent_level += 1 - iter_stack.append((iter(item), b"array")) + iter_stack.append((iter(item), b"array", None)) else: self._indent() - self.stream.write(tf(item)) + self.stream.write(tfunction(item)) self.stream.write(b'\n') except StopIteration: self._indent_level -= 1 diff --git a/tests/bench.py b/tests/bench.py index 882e03d..272de2c 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -82,7 +82,6 @@ def binary_stream(): yield f def build_deep_xml(): - deep_data = {} curr_data = deep_data for i in range(250): @@ -96,16 +95,14 @@ def build_deep_xml(): _deep_bench_data = 
build_deep_xml() def build_wide_xml(): + wide_xml = b""" wide_array" """ - + wide_data = {} for i in range(100000): - wide_xml += b""" - 5000""" - wide_xml += b"" - - return llsd.parse_xml(wide_xml) + wide_data["item"+str(i)] = {"item1":2.345, "item2": [1,2,3], "item3": "string", "item4":{"subitem": llsd.uri("http://foo.bar.com")}} + return wide_data _wide_bench_data = build_wide_xml() def bench_stream(parse, stream): From 62bc5305d862cf9d1047814a066c777f7388972c Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Fri, 26 May 2023 19:23:47 +0000 Subject: [PATCH 05/15] Don't do coverage for python2 code as github actions doesn't cover that --- llsd/serde_xml.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 636384b..42b784c 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -27,6 +27,9 @@ def remove_invalid_xml_bytes(b): + """ + Remove characters that aren't allowed in xml. + """ try: # Dropping chars that cannot be parsed later on. 
The # translate() function was benchmarked to be the fastest way @@ -37,7 +40,8 @@ def remove_invalid_xml_bytes(b): # unit tests) return INVALID_XML_RE.sub('', b) -def xml_esc(v): +# only python2, which is not covered by coverage tests +def xml_esc(v): # pragma: no cover "Escape string or unicode object v for xml output" # Use is_unicode() instead of is_string() because in python 2, str is @@ -73,9 +77,9 @@ def __init__(self, indent_atom = None): # Call the super class constructor so that we have the type map super(LLSDXMLFormatter, self).__init__() self.py2 = PY2 - + def _LLSD(self, v): - raise LLSDSerializationError("We should never end up here") + raise LLSDSerializationError("We should never end up here") # pragma: no cover def _UNDEF(self, _v): return b'' def _BOOLEAN(self, v): @@ -89,26 +93,23 @@ def _REAL(self, v): def _UUID(self, v): if v.int == 0: return b'' - else: - return b'' + str(v).encode('utf-8') + b'' + return b'' + str(v).encode('utf-8') + b'' def _BINARY(self, v): return b'' + base64.b64encode(v).strip() + b'' def _STRING(self, v): - if self.py2: + if self.py2: # pragma: no cover return b'' + _str_to_bytes(xml_esc(v)) + b'' - else: - return b'' + v.translate(XML_ESC_TRANS).encode('utf-8') + b'' + return b'' + v.translate(XML_ESC_TRANS).encode('utf-8') + b'' def _URI(self, v): - if self.py2: + if self.py2: # pragma: no cover return b'' + _str_to_bytes(xml_esc(v)) + b'' - else: - return b'' + UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8') + b'' + return b'' + UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8') + b'' def _DATE(self, v): return b'' + _format_datestr(v) + b'' def _ARRAY(self, v): - raise LLSDSerializationError("We should never end up here") + raise LLSDSerializationError("We should never end up here") # pragma: no cover def _MAP(self, v): - raise LLSDSerializationError("We should never end up here") + raise LLSDSerializationError("We should never end up here") # pragma: no cover def _write(self, something): """ @@ -126,7 
+127,7 @@ def _write(self, something): item = next(cur_iter) if iter_type == b"map": - if self.py2: + if self.py2: # pragma: no cover self.stream.write(b'' + _str_to_bytes(xml_esc(UnicodeType(item))) + b'') @@ -211,7 +212,7 @@ def _write(self, something): item = next(cur_iter) if iter_type == b"map": self._indent() - if self.py2: + if self.py2: # pragma: no cover self.stream.write(b'' + _str_to_bytes(xml_esc(UnicodeType(item))) + b'') From ca8b9b514fb6b1452c7f70e938864541246a1584 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Mon, 5 Jun 2023 21:44:24 +0000 Subject: [PATCH 06/15] SL-19707 - notation format serialization perf and depth improvements. notation format now uses iteration to serialize, instead of recursion. --- llsd/base.py | 3 +- llsd/serde_notation.py | 112 +++++++++++++++++++++++++---------------- llsd/serde_xml.py | 3 +- tests/bench.py | 6 +++ 4 files changed, 77 insertions(+), 47 deletions(-) diff --git a/llsd/base.py b/llsd/base.py index cbeab54..16066c4 100644 --- a/llsd/base.py +++ b/llsd/base.py @@ -324,11 +324,12 @@ class LLSDBaseFormatter(object): role of this base class is to provide self.type_map based on the methods defined in its subclass. """ - __slots__ = ['stream', 'type_map'] + __slots__ = ['stream', 'type_map', 'py2'] def __init__(self): "Construct a new formatter dispatch table." 
self.stream = None + self.py2 = PY2 self.type_map = { type(None): self._UNDEF, undef: self._UNDEF, diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index ca6518b..22b4640 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -1,11 +1,16 @@ import base64 import binascii -import re import uuid from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, NOTATION_HEADER, LLSDParseError, LLSDSerializationError, UnicodeType, - _format_datestr, _parse_datestr, _str_to_bytes, binary, uri) + _format_datestr, _parse_datestr, _str_to_bytes, binary, uri, PY2) + +if not PY2: + STR_ESC_TRANS_SINGLE = str.maketrans({'\\': '\\\\', + '\'':'\\\''}) + STR_ESC_TRANS_DOUBLE = str.maketrans({'\\': '\\\\', + '\"':'\\\"'}) class LLSDNotationParser(LLSDBaseParser): @@ -412,15 +417,15 @@ class LLSDNotationFormatter(LLSDBaseFormatter): See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization """ def _LLSD(self, v): - return self._generate(v.thing) + raise LLSDSerializationError("We should never end up here") # pragma: no cover def _UNDEF(self, v): - self.stream.write(b'!') + return b'!' def _BOOLEAN(self, v): - self.stream.write(b'true' if v else b'false') + return b'true' if v else b'false' def _INTEGER(self, v): - self.stream.write(B("i%d") % v) + return B("i%d") % v def _REAL(self, v): - self.stream.write(B("r%r") % v) + return B("r%r") % v def _UUID(self, v): # latin-1 is the byte-to-byte encoding, mapping \x00-\xFF -> # \u0000-\u00FF. It's also the fastest encoding, I believe, from @@ -430,57 +435,76 @@ def _UUID(self, v): # error behavior in case someone passes an invalid hex string, with # things other than 0-9a-fA-F, so that they will fail in the UUID # decode, rather than with a UnicodeError. 
- self.stream.writelines([b"u", str(v).encode('latin-1')]) + return b"u" + str(v).encode('latin-1') def _BINARY(self, v): - self.stream.writelines([b'b64"', base64.b64encode(v).strip(), b'"']) + return b'b64"' + base64.b64encode(v).strip() + b'"' def _STRING(self, v): - self.stream.writelines([b"'", self._esc(v), b"'"]) + if self.py2: # pragma: no cover + return b"'" + self._esc(v) + b"'" + return b"'" + v.translate(STR_ESC_TRANS_SINGLE).encode('utf-8') + b"'" def _URI(self, v): - self.stream.writelines([b'l"', self._esc(v, b'"'), b'"']) + if self.py2: # pragma: no cover + return b'l"' + self._esc(v, b'"') + b'"' + return b'l"' + v.translate(STR_ESC_TRANS_DOUBLE).encode('utf-8') + b'"' def _DATE(self, v): - self.stream.writelines([b'd"', _format_datestr(v), b'"']) + return b'd"' + _format_datestr(v) + b'"' def _ARRAY(self, v): - self.stream.write(b'[') - delim = b'' - for item in v: - self.stream.write(delim) - self._generate(item) - delim = b',' - self.stream.write(b']') + raise LLSDSerializationError("We should never end up here") # pragma: no cover def _MAP(self, v): - self.stream.write(b'{') - delim = b'' - for key, value in v.items(): - self.stream.writelines([delim, b"'", self._esc(UnicodeType(key)), b"':"]) - self._generate(value) - delim = b',' - self.stream.write(b'}') - + raise LLSDSerializationError("We should never end up here") # pragma: no cover def _esc(self, data, quote=b"'"): return _str_to_bytes(data).replace(b"\\", b"\\\\").replace(quote, b'\\'+quote) - def _generate(self, something): + def _write(self, something): """ - Serialize a python object to self.stream as application/llsd+notation + Serialize a python object to self.stream as application/llsd+notation. - :param something: a python object (typically a dict) to be serialized. + :param something: A python object (typically a dict) to be serialized. + + NOTE: This is nearly identical to the above _write with the exception + that this one includes newlines and indentation. 
Doing something clever + for the above may decrease performance for the common case, so it's been + split out. We can probably revisit this, though. """ - t = type(something) - handler = self.type_map.get(t) - if handler: - return handler(something) - elif isinstance(something, _LLSD): - return self.type_map[_LLSD](something) - else: - try: - return self._ARRAY(iter(something)) - except TypeError: - raise LLSDSerializationError( - "Cannot serialize unknown type: %s (%s)" % (t, something)) - # _write() method is an alias for _generate() - _write = _generate + iter_stack = [[iter([something]), b"", None, b""]] + while True: + cur_iter, iter_type, iterable_obj, delim = iter_stack[-1] + try: + item = next(cur_iter) + self.stream.write(delim) + iter_stack[-1][3] = b"," + if iter_type == b"}": + if self.py2: # pragma: no cover + self.stream.writelines([b"'", self._esc(UnicodeType(item)), b"':"]) + else: + # calling translate directly is a bit faster + self.stream.writelines([b"'", + UnicodeType(item).translate(STR_ESC_TRANS_SINGLE).encode('utf-8'), + b"':"]) + item = iterable_obj[item] # pylint: disable=unsubscriptable-object + if isinstance(item, _LLSD): + item = item.thing + item_type = type(item) + if not item_type in self.type_map: + raise LLSDSerializationError( + "Cannot serialize unknown type: %s (%s)" % (item_type, item)) + tfunction = self.type_map[item_type] + + if tfunction == self._MAP: + self.stream.write(b'{') + iter_stack.append([iter(list(item)), b"}", item, b""]) + elif tfunction == self._ARRAY: + self.stream.write(b'[') + iter_stack.append([iter(item), b"]", None, b""]) + else: + self.stream.write(tfunction(item)) + except StopIteration: + self.stream.write(iter_type) + iter_stack.pop() + if len(iter_stack) == 1: + break def format_notation(something): diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 42b784c..ac3e179 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -76,7 +76,6 @@ def __init__(self, indent_atom = None): "Construct a 
pretty serializer." # Call the super class constructor so that we have the type map super(LLSDXMLFormatter, self).__init__() - self.py2 = PY2 def _LLSD(self, v): raise LLSDSerializationError("We should never end up here") # pragma: no cover @@ -221,7 +220,7 @@ def _write(self, something): self.stream.write(b'' + UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8') + b'\n') - item = iterable_obj[item] + item = iterable_obj[item] # pylint: disable=unsubscriptable-object if isinstance(item, _LLSD): item = item.thing item_type = type(item) diff --git a/tests/bench.py b/tests/bench.py index 272de2c..e9fa7b7 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -157,3 +157,9 @@ def test_format_xml_deep(benchmark): def test_format_xml_wide(benchmark): benchmark(llsd.format_xml, _wide_bench_data) + +def test_format_notation_deep(benchmark): + benchmark(llsd.format_notation, _deep_bench_data) + +def test_format_notation_wide(benchmark): + benchmark(llsd.format_notation, _wide_bench_data) From bba106f99ea12060bd8980055e5d0331a3136fd2 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Mon, 5 Jun 2023 22:24:29 +0000 Subject: [PATCH 07/15] SL-19707 Slight perf improvements --- llsd/serde_notation.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index 22b4640..cb1a029 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -1,5 +1,6 @@ import base64 import binascii +from collections import deque import uuid from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, NOTATION_HEADER, @@ -468,21 +469,23 @@ def _write(self, something): split out. We can probably revisit this, though. 
""" - iter_stack = [[iter([something]), b"", None, b""]] + iter_stack = deque() + iter_stack.append((iter([something]), b"", None, b"")) while True: - cur_iter, iter_type, iterable_obj, delim = iter_stack[-1] + cur_iter, iter_type, iterable_obj, delim = iter_stack.pop() try: item = next(cur_iter) self.stream.write(delim) - iter_stack[-1][3] = b"," + delim = b"," + iter_stack.append((cur_iter, iter_type, iterable_obj, delim)) if iter_type == b"}": if self.py2: # pragma: no cover - self.stream.writelines([b"'", self._esc(UnicodeType(item)), b"':"]) + self.stream.writelines((b"'", self._esc(UnicodeType(item)), b"':")) else: # calling translate directly is a bit faster - self.stream.writelines([b"'", + self.stream.writelines((b"'", UnicodeType(item).translate(STR_ESC_TRANS_SINGLE).encode('utf-8'), - b"':"]) + b"':")) item = iterable_obj[item] # pylint: disable=unsubscriptable-object if isinstance(item, _LLSD): item = item.thing @@ -494,15 +497,14 @@ def _write(self, something): if tfunction == self._MAP: self.stream.write(b'{') - iter_stack.append([iter(list(item)), b"}", item, b""]) + iter_stack.append((iter(list(item)), b"}", item, b"")) elif tfunction == self._ARRAY: self.stream.write(b'[') - iter_stack.append([iter(item), b"]", None, b""]) + iter_stack.append((iter(item), b"]", None, b"")) else: self.stream.write(tfunction(item)) except StopIteration: self.stream.write(iter_type) - iter_stack.pop() if len(iter_stack) == 1: break From 27da089bacd7ac4153867b06b2f58be6fc20ed97 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Tue, 6 Jun 2023 20:21:16 +0000 Subject: [PATCH 08/15] SL-19707 Some small perf optimizations --- llsd/serde_notation.py | 58 +++++++++-------------- llsd/serde_xml.py | 101 +++++++++++++++++++---------------------- tests/bench.py | 14 ++++++ 3 files changed, 82 insertions(+), 91 deletions(-) diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index cb1a029..dc38bf8 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -1,6 
+1,5 @@ import base64 import binascii -from collections import deque import uuid from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, NOTATION_HEADER, @@ -420,13 +419,13 @@ class LLSDNotationFormatter(LLSDBaseFormatter): def _LLSD(self, v): raise LLSDSerializationError("We should never end up here") # pragma: no cover def _UNDEF(self, v): - return b'!' + self.stream.write(b'!') def _BOOLEAN(self, v): - return b'true' if v else b'false' + self.stream.write(b'true' if v else b'false') def _INTEGER(self, v): - return B("i%d") % v + self.stream.write(B("i%d") % v) def _REAL(self, v): - return B("r%r") % v + self.stream.write(B("r%r") % v) def _UUID(self, v): # latin-1 is the byte-to-byte encoding, mapping \x00-\xFF -> # \u0000-\u00FF. It's also the fastest encoding, I believe, from @@ -436,24 +435,25 @@ def _UUID(self, v): # error behavior in case someone passes an invalid hex string, with # things other than 0-9a-fA-F, so that they will fail in the UUID # decode, rather than with a UnicodeError. 
- return b"u" + str(v).encode('latin-1') + self.stream.writelines([b"u", str(v).encode('latin-1')]) def _BINARY(self, v): - return b'b64"' + base64.b64encode(v).strip() + b'"' - + self.stream.writelines([b'b64"', base64.b64encode(v).strip(), b'"']) def _STRING(self, v): if self.py2: # pragma: no cover - return b"'" + self._esc(v) + b"'" - return b"'" + v.translate(STR_ESC_TRANS_SINGLE).encode('utf-8') + b"'" + return self.stream.writelines([b"'", self._esc(v), b"'"]) + self.stream.writelines([b"'", v.translate(STR_ESC_TRANS_SINGLE).encode('utf-8'), b"'"]) def _URI(self, v): if self.py2: # pragma: no cover - return b'l"' + self._esc(v, b'"') + b'"' - return b'l"' + v.translate(STR_ESC_TRANS_DOUBLE).encode('utf-8') + b'"' + return self.stream.writelines([b'l"', self._esc(v, b'"'), b'"']) + self.stream.writelines([b'l"', v.translate(STR_ESC_TRANS_DOUBLE).encode('utf-8'), b'"']) def _DATE(self, v): - return b'd"' + _format_datestr(v) + b'"' + self.stream.writelines([b'd"', _format_datestr(v), b'"']) def _ARRAY(self, v): - raise LLSDSerializationError("We should never end up here") # pragma: no cover + self.stream.write(b'[') + self.iter_stack.append([iter(v), b"]", None, b""]) def _MAP(self, v): - raise LLSDSerializationError("We should never end up here") # pragma: no cover + self.stream.write(b'{') + self.iter_stack.append([iter(v), b"}", v, b""]) def _esc(self, data, quote=b"'"): return _str_to_bytes(data).replace(b"\\", b"\\\\").replace(quote, b'\\'+quote) @@ -463,21 +463,15 @@ def _write(self, something): :param something: A python object (typically a dict) to be serialized. - NOTE: This is nearly identical to the above _write with the exception - that this one includes newlines and indentation. Doing something clever - for the above may decrease performance for the common case, so it's been - split out. We can probably revisit this, though. 
""" - iter_stack = deque() - iter_stack.append((iter([something]), b"", None, b"")) + self.iter_stack = [[iter([something]), b"", None, b""]] while True: - cur_iter, iter_type, iterable_obj, delim = iter_stack.pop() + cur_iter, iter_type, iterable_obj, delim = self.iter_stack[-1] try: item = next(cur_iter) self.stream.write(delim) - delim = b"," - iter_stack.append((cur_iter, iter_type, iterable_obj, delim)) + self.iter_stack[-1][3] = b"," if iter_type == b"}": if self.py2: # pragma: no cover self.stream.writelines((b"'", self._esc(UnicodeType(item)), b"':")) @@ -490,22 +484,14 @@ def _write(self, something): if isinstance(item, _LLSD): item = item.thing item_type = type(item) - if not item_type in self.type_map: + if item_type not in self.type_map: raise LLSDSerializationError( "Cannot serialize unknown type: %s (%s)" % (item_type, item)) - tfunction = self.type_map[item_type] - - if tfunction == self._MAP: - self.stream.write(b'{') - iter_stack.append((iter(list(item)), b"}", item, b"")) - elif tfunction == self._ARRAY: - self.stream.write(b'[') - iter_stack.append((iter(item), b"]", None, b"")) - else: - self.stream.write(tfunction(item)) + self.type_map[item_type](item) except StopIteration: self.stream.write(iter_type) - if len(iter_stack) == 1: + self.iter_stack.pop() + if len(self.iter_stack) == 1: break diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index ac3e179..9404fca 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -80,35 +80,37 @@ def __init__(self, indent_atom = None): def _LLSD(self, v): raise LLSDSerializationError("We should never end up here") # pragma: no cover def _UNDEF(self, _v): - return b'' + self.stream.write(b'') def _BOOLEAN(self, v): if v: - return b'true' - return b'false' + return self.stream.write(b'true') + self.stream.write(b'false') def _INTEGER(self, v): - return b'' + str(v).encode('utf-8') + b'' + self.stream.writelines([b'', str(v).encode('utf-8'), b'']) def _REAL(self, v): - return b'' + str(v).encode('utf-8') + 
b'' + self.stream.writelines([b'', str(v).encode('utf-8'), b'']) def _UUID(self, v): if v.int == 0: - return b'' - return b'' + str(v).encode('utf-8') + b'' + return self.stream.write(b'') + self.stream.writelines([b'', str(v).encode('utf-8'), b'']) def _BINARY(self, v): - return b'' + base64.b64encode(v).strip() + b'' + self.stream.writelines([b'', base64.b64encode(v).strip(), b'']) def _STRING(self, v): if self.py2: # pragma: no cover - return b'' + _str_to_bytes(xml_esc(v)) + b'' - return b'' + v.translate(XML_ESC_TRANS).encode('utf-8') + b'' + return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'']) + self.stream.writelines([b'', v.translate(XML_ESC_TRANS).encode('utf-8'), b'']) def _URI(self, v): if self.py2: # pragma: no cover - return b'' + _str_to_bytes(xml_esc(v)) + b'' - return b'' + UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8') + b'' + return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'']) + self.stream.writelines([b'', UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8'), b'']) def _DATE(self, v): - return b'' + _format_datestr(v) + b'' + self.stream.writelines([b'', _format_datestr(v), b'']) def _ARRAY(self, v): - raise LLSDSerializationError("We should never end up here") # pragma: no cover + self.stream.write(b'') + self.iter_stack.append((iter(v), b"array", None)) def _MAP(self, v): - raise LLSDSerializationError("We should never end up here") # pragma: no cover + self.stream.write(b'') + self.iter_stack.append((iter(v), b"map", v)) def _write(self, something): """ @@ -119,23 +121,23 @@ def _write(self, something): self.stream.write(b'' b'') - iter_stack = [(iter([something]), b"", None)] + self.iter_stack = [(iter([something]), b"", None)] while True: - cur_iter, iter_type, iterable_obj = iter_stack[-1] + cur_iter, iter_type, iterable_obj = self.iter_stack[-1] try: item = next(cur_iter) if iter_type == b"map": if self.py2: # pragma: no cover - self.stream.write(b'' + - _str_to_bytes(xml_esc(UnicodeType(item))) 
+ - b'') + self.stream.writelines([b'', + _str_to_bytes(xml_esc(UnicodeType(item))), + b'']) else: # fair performance improvement by explicitly doing the # translate for py3 instead of calling xml_esc - self.stream.write(b'' + - UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8') + - b'') + self.stream.writelines([b'', + UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8'), + b'']) item = iterable_obj[item] if isinstance(item, _LLSD): item = item.thing @@ -143,20 +145,11 @@ def _write(self, something): if not item_type in self.type_map: raise LLSDSerializationError( "Cannot serialize unknown type: %s (%s)" % (item_type, item)) - tfunction = self.type_map[item_type] - - if tfunction == self._MAP: - self.stream.write(b'') - iter_stack.append((iter(list(item)), b"map", item)) - elif tfunction == self._ARRAY: - self.stream.write(b'') - iter_stack.append((iter(item), b"array", None)) - else: - self.stream.write(tfunction(item)) + self.type_map[item_type](item) except StopIteration: - self.stream.write(b'') - iter_stack.pop() - if len(iter_stack) == 1: + self.stream.writelines([b'']) + self.iter_stack.pop() + if len(self.iter_stack) == 1: break self.stream.write(b'') @@ -186,6 +179,16 @@ def __init__(self, indent_atom = None): else: self._indent_atom = indent_atom + def _ARRAY(self, v): + self.stream.write(b'') + self._indent_level += 1 + self.iter_stack.append((iter(v), b"array", None)) + + def _MAP(self, v): + self.stream.write(b'') + self._indent_level += 1 + self.iter_stack.append((iter(v), b"map", v)) + def _indent(self): "Write an indentation based on the atom and indentation level." 
self.stream.writelines([self._indent_atom] * self._indent_level) @@ -204,9 +207,9 @@ def _write(self, something): self.stream.write(b'\n' b'\n') - iter_stack = [(iter([something]), b"", None)] + self.iter_stack = [(iter([something]), b"", None)] while True: - cur_iter, iter_type, iterable_obj = iter_stack[-1] + cur_iter, iter_type, iterable_obj = self.iter_stack[-1] try: item = next(cur_iter) if iter_type == b"map": @@ -227,28 +230,16 @@ def _write(self, something): if not item_type in self.type_map: raise LLSDSerializationError( "Cannot serialize unknown type: %s (%s)" % (item_type, item)) - tfunction = self.type_map[item_type] - if tfunction == self._MAP: - self._indent() - self.stream.write(b'\n') - self._indent_level += 1 - iter_stack.append((iter(list(item)), b"map", item)) - elif tfunction == self._ARRAY: - self._indent() - self.stream.write(b'\n') - self._indent_level += 1 - iter_stack.append((iter(item), b"array", None)) - else: - self._indent() - self.stream.write(tfunction(item)) - self.stream.write(b'\n') + self._indent() + self.type_map[item_type](item) + self.stream.write(b'\n') except StopIteration: self._indent_level -= 1 self._indent() self.stream.write(b'\n') - iter_stack.pop() - if len(iter_stack) == 1: + self.iter_stack.pop() + if len(self.iter_stack) == 1: break self.stream.write(b'\n') diff --git a/tests/bench.py b/tests/bench.py index e9fa7b7..86a53f4 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -105,6 +105,17 @@ def build_wide_xml(): return wide_data _wide_bench_data = build_wide_xml() +def build_wide_array_xml(): + + wide_xml = b""" +wide_array" +""" + wide_data = [] + for i in range(100000): + wide_data.append({"item1":2.345, "item2": [1,2,3], "item3": "string", "item4":{"subitem": llsd.uri("http://foo.bar.com")}}) + return wide_data +_wide_array_bench_data = build_wide_array_xml() + def bench_stream(parse, stream): ret = parse(stream) stream.seek(0) @@ -163,3 +174,6 @@ def test_format_notation_deep(benchmark): def 
test_format_notation_wide(benchmark): benchmark(llsd.format_notation, _wide_bench_data) + +def test_format_notation_wide_array(benchmark): + benchmark(llsd.format_notation, _wide_array_bench_data) From 1573a08b9f09b70e3fe7ea2bf5b97452029d80e2 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Tue, 6 Jun 2023 22:01:57 +0000 Subject: [PATCH 09/15] SL-19707 - Allow 'unlimited' depth binary formatting of llsd And a few small perf improvements --- llsd/serde_binary.py | 89 ++++++++++++++++++++++++++++++++++++++---- llsd/serde_notation.py | 4 +- llsd/serde_xml.py | 2 +- tests/bench.py | 11 +++++- 4 files changed, 95 insertions(+), 11 deletions(-) diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index 6f0d93e..d901f1b 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -4,8 +4,8 @@ import struct import uuid -from llsd.base import (_LLSD, LLSDBaseParser, LLSDSerializationError, BINARY_HEADER, - _str_to_bytes, binary, is_integer, is_string, uri) +from llsd.base import (_LLSD, LLSDBaseFormatter, LLSDBaseParser, LLSDSerializationError, BINARY_HEADER, + LLSDSerializationError, UnicodeType, _str_to_bytes, binary, is_integer, is_string, uri) try: @@ -148,6 +148,84 @@ def _parse_date(self): self._error(exc, -8) + +class LLSDBinaryFormatter(LLSDBaseFormatter): + """ + Serialize a python object as application/llsd+binary + + See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization + """ + def _LLSD(self, v): + raise LLSDSerializationError("We should never end up here") # pragma: no cover + def _UNDEF(self, v): + self.stream.write(b'!') + def _BOOLEAN(self, v): + self.stream.write(b'1' if v else b'0') + def _INTEGER(self, v): + try: + self.stream.writelines([b'i', struct.pack('!i', v)]) + except (OverflowError, struct.error) as exc: + raise LLSDSerializationError(str(exc), v) + def _REAL(self, v): + try: + self.stream.writelines([b'r', struct.pack('!d', v)]) + except SystemError as exc: + raise LLSDSerializationError(str(exc), something) + def _UUID(self, 
v): + self.stream.writelines([b'u', v.bytes]) + def _BINARY(self, v): + self.stream.writelines([b'b', struct.pack('!i', len(v)), v]) + def _STRING(self, v): + v = _str_to_bytes(v) + self.stream.writelines([b's', struct.pack('!i', len(v)), v]) + def _URI(self, v): + uri_bytes = _str_to_bytes(v) + self.stream.writelines([b'l', struct.pack('!i', len(uri_bytes)), uri_bytes]) + def _DATE(self, v): + if isinstance(v, datetime.datetime): + seconds_since_epoch = calendar.timegm(v.utctimetuple()) \ + + v.microsecond // 1e6 + if isinstance(v, datetime.date): + seconds_since_epoch = calendar.timegm(v.timetuple()) + self.stream.writelines([b'd', struct.pack('\n') + self.iter_stack = [[iter([something]), b"", None]] + while True: + cur_iter, iter_type, iterable_obj = self.iter_stack[-1] + try: + item = next(cur_iter) + if iterable_obj: + key = _str_to_bytes(item) + self.stream.writelines([b'k', struct.pack('!i', len(key)), key]) + item = iterable_obj[item] # pylint: disable=unsubscriptable-object + while isinstance(item, _LLSD): + item = item.thing + item_type = type(item) + if item_type not in self.type_map: + raise LLSDSerializationError( + "Cannot serialize unknown type: %s (%s)" % (item_type, item)) + self.type_map[item_type](item) + except StopIteration: + self.stream.write(iter_type) + self.iter_stack.pop() + if len(self.iter_stack) == 1: + break + def format_binary(something): """ Format application/llsd+binary to a python object. @@ -157,14 +235,11 @@ def format_binary(something): :param something: a python object (typically a dict) to be serialized. :returns: Returns a LLSD binary formatted string. 
""" - stream = io.BytesIO() - write_binary(stream, something) - return stream.getvalue() + return LLSDBinaryFormatter().format(something) def write_binary(stream, something): - stream.write(b'\n') - _write_binary_recurse(stream, something) + return LLSDBinaryFormatter().write(stream, something) def _write_binary_recurse(stream, something): diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index dc38bf8..0db3985 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -472,7 +472,7 @@ def _write(self, something): item = next(cur_iter) self.stream.write(delim) self.iter_stack[-1][3] = b"," - if iter_type == b"}": + if iterable_obj: if self.py2: # pragma: no cover self.stream.writelines((b"'", self._esc(UnicodeType(item)), b"':")) else: @@ -481,7 +481,7 @@ def _write(self, something): UnicodeType(item).translate(STR_ESC_TRANS_SINGLE).encode('utf-8'), b"':")) item = iterable_obj[item] # pylint: disable=unsubscriptable-object - if isinstance(item, _LLSD): + while isinstance(item, _LLSD): item = item.thing item_type = type(item) if item_type not in self.type_map: diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 9404fca..bd7bb9d 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -139,7 +139,7 @@ def _write(self, something): UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8'), b'']) item = iterable_obj[item] - if isinstance(item, _LLSD): + while isinstance(item, _LLSD): item = item.thing item_type = type(item) if not item_type in self.type_map: diff --git a/tests/bench.py b/tests/bench.py index 86a53f4..1021adc 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -112,7 +112,7 @@ def build_wide_array_xml(): """ wide_data = [] for i in range(100000): - wide_data.append({"item1":2.345, "item2": [1,2,3], "item3": "string", "item4":{"subitem": llsd.uri("http://foo.bar.com")}}) + wide_data.append([2.345,[1,2,3], "string", [llsd.uri("http://foo.bar.com")]]) return wide_data _wide_array_bench_data = build_wide_array_xml() @@ -177,3 
+177,12 @@ def test_format_notation_wide(benchmark): def test_format_notation_wide_array(benchmark): benchmark(llsd.format_notation, _wide_array_bench_data) + +def test_format_binary_deep(benchmark): + benchmark(llsd.format_binary, _deep_bench_data) + +def test_format_binary_wide(benchmark): + benchmark(llsd.format_binary, _wide_bench_data) + +def test_format_binary_wide_array(benchmark): + benchmark(llsd.format_binary, _wide_array_bench_data) From a73b2aca4f40e1dae1f21ff807ec0d8c9fa4fca1 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Thu, 8 Jun 2023 21:56:27 +0000 Subject: [PATCH 10/15] SL-19707 - allow "infinite" depth when parsing XML LLSD Uses iteration instead of recursion. --- llsd/base.py | 108 ------------------------------ llsd/serde_notation.py | 5 +- llsd/serde_xml.py | 146 +++++++++++++++++++++++++++++++++++++++-- tests/bench.py | 4 ++ 4 files changed, 149 insertions(+), 114 deletions(-) diff --git a/llsd/base.py b/llsd/base.py index 16066c4..318373f 100644 --- a/llsd/base.py +++ b/llsd/base.py @@ -209,114 +209,6 @@ def _parse_datestr(datestr): return datetime.datetime(year, month, day, hour, minute, second, usec) -def _bool_to_python(node): - "Convert boolean node to a python object." - val = node.text or '' - try: - # string value, accept 'true' or 'True' or whatever - return (val.lower() in ('true', '1', '1.0')) - except AttributeError: - # not a string (no lower() method), use normal Python rules - return bool(val) - - -def _int_to_python(node): - "Convert integer node to a python object." - val = node.text or '' - if not val.strip(): - return 0 - return int(val) - - -def _real_to_python(node): - "Convert floating point node to a python object." - val = node.text or '' - if not val.strip(): - return 0.0 - return float(val) - - -def _uuid_to_python(node): - "Convert uuid node to a python object." - if node.text: - return uuid.UUID(hex=node.text) - return uuid.UUID(int=0) - - -def _str_to_python(node): - "Convert string node to a python object." 
- return node.text or '' - - -def _bin_to_python(node): - base = node.get('encoding') or 'base64' - try: - if base == 'base16': - # parse base16 encoded data - return binary(base64.b16decode(node.text or '')) - elif base == 'base64': - # parse base64 encoded data - return binary(base64.b64decode(node.text or '')) - elif base == 'base85': - return LLSDParseError("Parser doesn't support base85 encoding") - except binascii.Error as exc: - # convert exception class so it's more catchable - return LLSDParseError("Encoded binary data: " + str(exc)) - except TypeError as exc: - # convert exception class so it's more catchable - return LLSDParseError("Bad binary data: " + str(exc)) - - -def _date_to_python(node): - "Convert date node to a python object." - val = node.text or '' - if not val: - val = "1970-01-01T00:00:00Z" - return _parse_datestr(val) - - -def _uri_to_python(node): - "Convert uri node to a python object." - val = node.text or '' - return uri(val) - - -def _map_to_python(node): - "Convert map node to a python object." - result = {} - for index in range(len(node))[::2]: - if node[index].text is None: - result[''] = _to_python(node[index+1]) - else: - result[node[index].text] = _to_python(node[index+1]) - return result - - -def _array_to_python(node): - "Convert array node to a python object." - return [_to_python(child) for child in node] - - -NODE_HANDLERS = dict( - undef=lambda x: None, - boolean=_bool_to_python, - integer=_int_to_python, - real=_real_to_python, - uuid=_uuid_to_python, - string=_str_to_python, - binary=_bin_to_python, - date=_date_to_python, - uri=_uri_to_python, - map=_map_to_python, - array=_array_to_python, -) - - -def _to_python(node): - "Convert node to a python object." 
- return NODE_HANDLERS[node.tag](node) - - class LLSDBaseFormatter(object): """ This base class cannot be instantiated on its own: it assumes a subclass diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index 0db3985..bb3ef79 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -467,11 +467,12 @@ def _write(self, something): self.iter_stack = [[iter([something]), b"", None, b""]] while True: - cur_iter, iter_type, iterable_obj, delim = self.iter_stack[-1] + stack_top = self.iter_stack[-1] + cur_iter, iter_type, iterable_obj, delim = stack_top try: item = next(cur_iter) self.stream.write(delim) - self.iter_stack[-1][3] = b"," + stack_top[3] = b"," if iterable_obj: if self.py2: # pragma: no cover self.stream.writelines((b"'", self._esc(UnicodeType(item)), b"':")) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index bd7bb9d..61107bf 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -1,11 +1,12 @@ import base64 +import binascii import io import re -import types +import uuid from llsd.base import (_LLSD, ALL_CHARS, LLSDBaseParser, LLSDBaseFormatter, XML_HEADER, LLSDParseError, LLSDSerializationError, UnicodeType, - _format_datestr, _str_to_bytes, _to_python, is_unicode, PY2) + _format_datestr, _str_to_bytes, is_unicode, PY2, uri, binary, _parse_datestr) from llsd.fastest_elementtree import ElementTreeError, fromstring, parse as _parse INVALID_XML_BYTES = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c'\ @@ -138,7 +139,7 @@ def _write(self, something): self.stream.writelines([b'', UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8'), b'']) - item = iterable_obj[item] + item = iterable_obj[item] # pylint: disable=unsubscriptable-object while isinstance(item, _LLSD): item = item.thing item_type = type(item) @@ -178,6 +179,7 @@ def __init__(self, indent_atom = None): self._indent_atom = b' ' else: self._indent_atom = indent_atom + self.iter_stack = None def _ARRAY(self, v): self.stream.write(b'') @@ -283,6 +285,140 @@ def 
write_pretty_xml(stream, something): return LLSDXMLPrettyFormatter().write(stream, something) +class LLSDXMLParser: + def __init__(self): + "Construct an xml node parser." + + self.NODE_HANDLERS = { + "undef": lambda x: None, + "boolean": self._bool_to_python, + "integer": self._int_to_python, + "real": self._real_to_python, + "uuid": self._uuid_to_python, + "string": self._str_to_python, + "binary": self._bin_to_python, + "date": self._date_to_python, + "uri": self._uri_to_python, + "map": self._map_to_python, + "array": self._array_to_python, + } + + self.parse_stack = [] + + def _bool_to_python(self, node): + "Convert boolean node to a python object." + val = node.text or '' + try: + # string value, accept 'true' or 'True' or whatever + return (val.lower() in ('true', '1', '1.0')) + except AttributeError: + # not a string (no lower() method), use normal Python rules + return bool(val) + + def _int_to_python(self, node): + "Convert integer node to a python object." + val = node.text or '' + if not val.strip(): + return 0 + return int(val) + + def _real_to_python(self, node): + "Convert floating point node to a python object." + val = node.text or '' + if not val.strip(): + return 0.0 + return float(val) + + def _uuid_to_python(self, node): + "Convert uuid node to a python object." + if node.text: + return uuid.UUID(hex=node.text) + return uuid.UUID(int=0) + + def _str_to_python(self, node): + "Convert string node to a python object." 
+ return node.text or '' + + def _bin_to_python(self, node): + base = node.get('encoding') or 'base64' + try: + if base == 'base16': + # parse base16 encoded data + return binary(base64.b16decode(node.text or '')) + elif base == 'base64': + # parse base64 encoded data + return binary(base64.b64decode(node.text or '')) + raise LLSDParseError("Parser doesn't support %s encoding" % base) + + except binascii.Error as exc: + # convert exception class so it's more catchable + raise LLSDParseError("Encoded binary data: " + str(exc)) + except TypeError as exc: + # convert exception class so it's more catchable + raise LLSDParseError("Bad binary data: " + str(exc)) + + def _date_to_python(self, node): + "Convert date node to a python object." + val = node.text or '' + if not val: + val = "1970-01-01T00:00:00Z" + return _parse_datestr(val) + + def _uri_to_python(self, node): + "Convert uri node to a python object." + val = node.text or '' + return uri(val) + + def _map_to_python(self, node): + "Convert map node to a python object." + self.parse_stack.append([iter(node), node, {}]) + return self.parse_stack[-1][2] + + def _array_to_python(self, node): + "Convert array node to a python object." + self.parse_stack.append([iter(node), node, []]) + return self.parse_stack[-1][2] + + def parse_node(self, something): + """ + Parse an xml node tree via iteration. + :param something: The xml node to parse. + :returns: Returns a python object. 
+ """ + if something.tag == "map": + cur_result = {} + elif something.tag == "array": + cur_result = [] + else: + if something.tag not in self.NODE_HANDLERS: + raise LLSDParseError("Unknown value type %s" % something.tag) + return self.NODE_HANDLERS[something.tag](something) + + self.parse_stack = [[iter(something), something, cur_result]] + while True: + node_iter, iterable, cur_result = self.parse_stack[-1] + try: + value = next(node_iter) + if iterable.tag == "map": + if value.tag != "key": + raise LLSDParseError("Expected 'key', got %s" % value.tag) + key = value.text + if key is None: + key = '' + value = next(node_iter) + if value.tag not in self.NODE_HANDLERS: + raise LLSDParseError("Unknown value type %s" % something.tag) + cur_result[key] = self.NODE_HANDLERS[value.tag](value) + elif iterable.tag == "array": + if value.tag not in self.NODE_HANDLERS: + raise LLSDParseError("Unknown value type %s" % something.tag) + cur_result.append(self.NODE_HANDLERS[value.tag](value)) + except StopIteration: + node_iter, iterable, cur_result = self.parse_stack.pop() + if len(self.parse_stack) == 0: + break + return cur_result + def parse_xml(something): """ This is the basic public interface for parsing llsd+xml. @@ -298,6 +434,8 @@ def parse_xml(something): return parse_xml_nohdr(parser) + + def parse_xml_nohdr(baseparser): """ Parse llsd+xml known to be without an header. May still @@ -326,7 +464,7 @@ def parse_xml_nohdr(baseparser): if element.tag != 'llsd': raise LLSDParseError("Invalid XML Declaration") # Extract its contents. 
- return _to_python(element[0]) + return LLSDXMLParser().parse_node(element[0]) def format_xml(something): diff --git a/tests/bench.py b/tests/bench.py index 1021adc..e434f4b 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -186,3 +186,7 @@ def test_format_binary_wide(benchmark): def test_format_binary_wide_array(benchmark): benchmark(llsd.format_binary, _wide_array_bench_data) + +def test_parse_xml_deep(benchmark): + deep_data = llsd.format_xml(_deep_bench_data) + benchmark(llsd.parse_xml, deep_data) From 82d13327ad512bd67080993c7a31f84ddbf1d999 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Thu, 8 Jun 2023 22:17:31 +0000 Subject: [PATCH 11/15] SL-19707 faster xml parsing parsing error handling. --- llsd/serde_xml.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index 61107bf..d09ade0 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -406,13 +406,11 @@ def parse_node(self, something): if key is None: key = '' value = next(node_iter) - if value.tag not in self.NODE_HANDLERS: - raise LLSDParseError("Unknown value type %s" % something.tag) cur_result[key] = self.NODE_HANDLERS[value.tag](value) elif iterable.tag == "array": - if value.tag not in self.NODE_HANDLERS: - raise LLSDParseError("Unknown value type %s" % something.tag) cur_result.append(self.NODE_HANDLERS[value.tag](value)) + except KeyError as err: + raise LLSDParseError("Unknown value type: " + str(err)) except StopIteration: node_iter, iterable, cur_result = self.parse_stack.pop() if len(self.parse_stack) == 0: From 44ddccc32b646f8bc056a554ce843c0911b02f7a Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Mon, 12 Jun 2023 20:27:34 +0000 Subject: [PATCH 12/15] SL-19707 - Binary parsing using iteration not recursion Also, changed xml to use deque for slight perf increase --- llsd/serde_binary.py | 82 ++++++++++++++++++++++++++------------------ llsd/serde_xml.py | 31 ++++++++++------- tests/bench.py | 4 +++ tests/llsd_test.py | 10 
+++--- 4 files changed, 76 insertions(+), 51 deletions(-) diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index d901f1b..9ec9226 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -1,4 +1,5 @@ import calendar +from collections import deque import datetime import io import struct @@ -22,7 +23,7 @@ class LLSDBinaryParser(LLSDBaseParser): See http://wiki.secondlife.com/wiki/LLSD#Binary_Serialization """ - __slots__ = ['_dispatch', '_keep_binary'] + __slots__ = ['_dispatch', '_keep_binary', 'parse_stack'] def __init__(self): super(LLSDBinaryParser, self).__init__() @@ -63,6 +64,7 @@ def __init__(self): # entries in _dispatch. for c, func in _dispatch_dict.items(): self._dispatch[ord(c)] = func + self.parse_stack = deque([]) def parse(self, something, ignore_binary = False): """ @@ -81,46 +83,60 @@ def parse(self, something, ignore_binary = False): self._error(exc) def _parse(self): - "The actual parser which is called recursively when necessary." + "The actual iterative parser." 
cc = self._getc() - try: - func = self._dispatch[ord(cc)] - except IndexError: - self._error("invalid binary token", -1) + if cc == b'{': + cur_result = {} + max_size = struct.unpack("!i", self._getc(4))[0] + elif cc == b'[': + cur_result = [] + max_size = struct.unpack("!i", self._getc(4))[0] else: - return func() + return self._dispatch[ord(cc)]() + self.parse_stack.appendleft([0, max_size, cc, cur_result]) + while True: + item_count, max_size, iter_type, cur_result = self.parse_stack[0] + cc = self._getc() + if iter_type == b'{': + if cc == b'}': + item_count, max_size, iter_type, cur_result = self.parse_stack.popleft() + if item_count != max_size: + self._error("Invalid map close token") + else: + if cc == b'k': + key = self._parse_string() + elif cc in (b"'", b'"'): + key = self._parse_string_delim(cc) + else: + self._error("invalid map key %d" % ord(cc), -1) + cc = self._getc() + self.parse_stack[0][0] = item_count + 1 + cur_result[key] = self._dispatch[ord(cc)]() + elif iter_type == b'[': + if cc == b']': + item_count, max_size, iter_type, cur_result = self.parse_stack.popleft() + if item_count != max_size: + self._error("Invalid array close token") + else: + self.parse_stack[0][0] = item_count + 1 + cur_result.append(self._dispatch[ord(cc)]()) + if (len(self.parse_stack) == 0): + return cur_result def _parse_map(self): "Parse a single llsd map" - rv = {} - size = struct.unpack("!i", self._getc(4))[0] - count = 0 - cc = self._getc() - key = b'' - while (cc != b'}') and (count < size): - if cc == b'k': - key = self._parse_string() - elif cc in (b"'", b'"'): - key = self._parse_string_delim(cc) - else: - self._error("invalid map key", -1) - value = self._parse() - rv[key] = value - count += 1 - cc = self._getc() - if cc != b'}': - self._error("invalid map close token") - return rv + result = {} + max_size = struct.unpack("!i", self._getc(4))[0] + self.parse_stack.appendleft([0, max_size, b'{', result]) + return result def _parse_array(self): "Parse a single 
llsd array" - rv = [] - size = struct.unpack("!i", self._getc(4))[0] - for count in range(size): - rv.append(self._parse()) - if self._getc() != b']': - self._error("invalid array close token") - return rv + result = [] + max_size = self._getc(4) + max_size = struct.unpack("!i", max_size)[0] + self.parse_stack.appendleft([0, max_size, b'[', result]) + return result def _parse_string(self): try: diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index d09ade0..7d3bb48 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -1,5 +1,6 @@ import base64 import binascii +from collections import deque import io import re import uuid @@ -108,10 +109,10 @@ def _DATE(self, v): self.stream.writelines([b'', _format_datestr(v), b'']) def _ARRAY(self, v): self.stream.write(b'') - self.iter_stack.append((iter(v), b"array", None)) + self.iter_stack.appendleft((iter(v), b"array", None)) def _MAP(self, v): self.stream.write(b'') - self.iter_stack.append((iter(v), b"map", v)) + self.iter_stack.appendleft((iter(v), b"map", v)) def _write(self, something): """ @@ -122,9 +123,9 @@ def _write(self, something): self.stream.write(b'' b'') - self.iter_stack = [(iter([something]), b"", None)] + self.iter_stack = deque([(iter([something]), b"", None)]) while True: - cur_iter, iter_type, iterable_obj = self.iter_stack[-1] + cur_iter, iter_type, iterable_obj = self.iter_stack[0] try: item = next(cur_iter) if iter_type == b"map": @@ -149,7 +150,7 @@ def _write(self, something): self.type_map[item_type](item) except StopIteration: self.stream.writelines([b'']) - self.iter_stack.pop() + self.iter_stack.popleft() if len(self.iter_stack) == 1: break self.stream.write(b'') @@ -303,7 +304,7 @@ def __init__(self): "array": self._array_to_python, } - self.parse_stack = [] + self.parse_stack = deque([]) def _bool_to_python(self, node): "Convert boolean node to a python object." @@ -371,13 +372,17 @@ def _uri_to_python(self, node): def _map_to_python(self, node): "Convert map node to a python object." 
- self.parse_stack.append([iter(node), node, {}]) - return self.parse_stack[-1][2] + new_result = {} + new_stack_entry = [iter(node), node, new_result] + self.parse_stack.appendleft(new_stack_entry) + return new_result def _array_to_python(self, node): "Convert array node to a python object." - self.parse_stack.append([iter(node), node, []]) - return self.parse_stack[-1][2] + new_result = [] + new_stack_entry = [iter(node), node, new_result] + self.parse_stack.appendleft(new_stack_entry) + return new_result def parse_node(self, something): """ @@ -394,9 +399,9 @@ def parse_node(self, something): raise LLSDParseError("Unknown value type %s" % something.tag) return self.NODE_HANDLERS[something.tag](something) - self.parse_stack = [[iter(something), something, cur_result]] + self.parse_stack.appendleft([iter(something), something, cur_result]) while True: - node_iter, iterable, cur_result = self.parse_stack[-1] + node_iter, iterable, cur_result = self.parse_stack[0] try: value = next(node_iter) if iterable.tag == "map": @@ -412,7 +417,7 @@ def parse_node(self, something): except KeyError as err: raise LLSDParseError("Unknown value type: " + str(err)) except StopIteration: - node_iter, iterable, cur_result = self.parse_stack.pop() + node_iter, iterable, cur_result = self.parse_stack.popleft() if len(self.parse_stack) == 0: break return cur_result diff --git a/tests/bench.py b/tests/bench.py index e434f4b..707107e 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -190,3 +190,7 @@ def test_format_binary_wide_array(benchmark): def test_parse_xml_deep(benchmark): deep_data = llsd.format_xml(_deep_bench_data) benchmark(llsd.parse_xml, deep_data) + +def test_parse_binary_deep(benchmark): + deep_data = llsd.format_binary(_deep_bench_data) + benchmark(llsd.parse_binary, deep_data) diff --git a/tests/llsd_test.py b/tests/llsd_test.py index 46f64fe..1bfa020 100644 --- a/tests/llsd_test.py +++ b/tests/llsd_test.py @@ -277,12 +277,12 @@ def testArray(self): # simple array 
array_notation = b"['foo', 'bar']" # composite array - array_within_array_notation = b"['foo', 'bar',['foo', 'bar']]" + array_within_array_notation = b"['foo', 'bar',['foo1', 'bar1']]" # blank array blank_array_notation = b"[]" python_array = [str("foo"), "bar"] - python_array_within_array = ["foo", "bar", ["foo", "bar"]] + python_array_within_array = ["foo", "bar", ["foo1", "bar1"]] python_blank_array = [] self.assertNotationRoundtrip(python_array, array_notation) @@ -616,8 +616,8 @@ def testArray(self): foo\ bar\ \ -foo\ -bar\ +foo1\ +bar1\ \ \ " @@ -627,7 +627,7 @@ def testArray(self): " python_array = ["foo", "bar"] - python_array_within_array = ["foo", "bar", ["foo", "bar"]] + python_array_within_array = ["foo", "bar", ["foo1", "bar1"]] self.assertEqual( python_array, From 61923f2f601261fe29bc2a54afd92c0f8fa6bacc Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Tue, 13 Jun 2023 22:26:14 +0000 Subject: [PATCH 13/15] SL-19707 Make notation parsing iterative Also, a few perf tweaks --- llsd/serde_binary.py | 26 ++++++---- llsd/serde_notation.py | 112 +++++++++++++++++++++-------------------- 2 files changed, 74 insertions(+), 64 deletions(-) diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index 9ec9226..8414285 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -1,12 +1,11 @@ import calendar from collections import deque import datetime -import io import struct import uuid -from llsd.base import (_LLSD, LLSDBaseFormatter, LLSDBaseParser, LLSDSerializationError, BINARY_HEADER, - LLSDSerializationError, UnicodeType, _str_to_bytes, binary, is_integer, is_string, uri) +from llsd.base import (_LLSD, LLSDBaseFormatter, LLSDBaseParser, BINARY_HEADER, + LLSDSerializationError, _str_to_bytes, binary, is_integer, is_string, uri) try: @@ -64,6 +63,7 @@ def __init__(self): # entries in _dispatch. 
for c, func in _dispatch_dict.items(): self._dispatch[ord(c)] = func + self._keep_binary = False self.parse_stack = deque([]) def parse(self, something, ignore_binary = False): @@ -120,7 +120,7 @@ def _parse(self): else: self.parse_stack[0][0] = item_count + 1 cur_result.append(self._dispatch[ord(cc)]()) - if (len(self.parse_stack) == 0): + if len(self.parse_stack) == 0: return cur_result def _parse_map(self): @@ -171,6 +171,11 @@ class LLSDBinaryFormatter(LLSDBaseFormatter): See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization """ + + def __init__(self): + super(LLSDBinaryFormatter, self).__init__() + self.iter_stack = deque([]) + def _LLSD(self, v): raise LLSDSerializationError("We should never end up here") # pragma: no cover def _UNDEF(self, v): @@ -186,7 +191,7 @@ def _REAL(self, v): try: self.stream.writelines([b'r', struct.pack('!d', v)]) except SystemError as exc: - raise LLSDSerializationError(str(exc), something) + raise LLSDSerializationError(str(exc), v) def _UUID(self, v): self.stream.writelines([b'u', v.bytes]) def _BINARY(self, v): @@ -206,10 +211,10 @@ def _DATE(self, v): self.stream.writelines([b'd', struct.pack('\n') - self.iter_stack = [[iter([something]), b"", None]] + self.iter_stack.appendleft([iter([something]), b"", None]) while True: - cur_iter, iter_type, iterable_obj = self.iter_stack[-1] + cur_iter, iter_type, iterable_obj = self.iter_stack[0] try: item = next(cur_iter) if iterable_obj: @@ -238,7 +243,7 @@ def _write(self, something): self.type_map[item_type](item) except StopIteration: self.stream.write(iter_type) - self.iter_stack.pop() + self.iter_stack.popleft() if len(self.iter_stack) == 1: break @@ -255,6 +260,7 @@ def format_binary(something): def write_binary(stream, something): + """ Primary binary writing entrypoint.""" return LLSDBinaryFormatter().write(stream, something) diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index bb3ef79..4cdcce4 100644 --- a/llsd/serde_notation.py +++ 
b/llsd/serde_notation.py @@ -1,5 +1,6 @@ import base64 import binascii +from collections import deque import uuid from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, NOTATION_HEADER, @@ -75,6 +76,7 @@ def __init__(self): # Then fill in specific entries based on the dict above. for c, func in _dispatch_dict.items(): self._dispatch[ord(c)] = func + self.parse_stack = deque([]) def parse(self, something, ignore_binary = False): """ @@ -111,15 +113,47 @@ def _get_until(self, delim): return b''.join(content) def _parse(self, cc): - "The notation parser workhorse." - try: - func = self._dispatch[ord(cc)] - except IndexError: - # output error if the token was out of range - self._error("Invalid notation token") + "The actual iterative parser." + if cc == b'{': + cur_result = {} + elif cc == b'[': + cur_result = [] else: - # pass the lookahead character that selected this func - return func(cc) + return self._dispatch[ord(cc)](cc) + self.parse_stack.appendleft([cc, cur_result]) + while True: + iter_type, cur_result = self.parse_stack[0] + cc = self._getc() + while cc.isspace() or cc == b',': + cc = self._getc() + if cc is None: + self._error("Unclosed collection") + if iter_type == b'{': + if cc == b'}': + iter_type, cur_result = self.parse_stack.popleft() + else: + if cc in (b"'", b'"', b's'): + key = self._parse_string(cc) + else: + self._error("Invalid map key") + cc = self._getc() + while cc != b':': + if not cc.isspace(): + self._error("Invalid map key %s" % cc.decode('utf-8')) + cc = self._getc() + + cc = self._getc() + cur_result[key] = self._dispatch[ord(cc)](cc) + elif iter_type == b'[': + while cc.isspace() or cc == b',': + cc = self._getc() + if cc == b']': + iter_type, cur_result = self.parse_stack.popleft() + else: + cur_result.append(self._dispatch[ord(cc)](cc)) + if (len(self.parse_stack) == 0): + return cur_result + def _parse_binary(self, cc): "parse a single binary object." 
@@ -184,36 +218,9 @@ def _parse_map(self, cc): map: { string:object, string:object } """ - rv = {} - key = b'' - found_key = False - # skip the beginning '{' - cc = self._getc() - while (cc != b'}'): - if cc is None: - self._error("Unclosed map") - if not found_key: - if cc in (b"'", b'"', b's'): - key = self._parse_string(cc) - found_key = True - elif cc.isspace() or cc == b',': - # ignore space or comma - pass - else: - self._error("Invalid map key") - elif cc.isspace(): - # ignore space - pass - elif cc == b':': - # skip the ':' - value = self._parse(self._getc()) - rv[key] = value - found_key = False - else: - self._error("missing separator") - cc = self._getc() - - return rv + result = {} + self.parse_stack.appendleft([b'{', result]) + return result def _parse_array(self, cc): """ @@ -223,17 +230,9 @@ def _parse_array(self, cc): """ rv = [] # skip the beginning '[' - cc = self._getc() - while (cc != b']'): - if cc is None: - self._error('Unclosed array') - if cc.isspace() or cc == b',': - cc = self._getc() - continue - rv.append(self._parse(cc)) - cc = self._getc() - - return rv + result = [] + self.parse_stack.appendleft([b'[', result]) + return result def _parse_uuid(self, cc): "Parse a uuid." 
@@ -416,6 +415,11 @@ class LLSDNotationFormatter(LLSDBaseFormatter): See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization """ + + def __init__(self): + super(LLSDNotationFormatter, self).__init__() + self.iter_stack = deque([]) + def _LLSD(self, v): raise LLSDSerializationError("We should never end up here") # pragma: no cover def _UNDEF(self, v): @@ -450,10 +454,10 @@ def _DATE(self, v): self.stream.writelines([b'd"', _format_datestr(v), b'"']) def _ARRAY(self, v): self.stream.write(b'[') - self.iter_stack.append([iter(v), b"]", None, b""]) + self.iter_stack.appendleft([iter(v), b"]", None, b""]) def _MAP(self, v): self.stream.write(b'{') - self.iter_stack.append([iter(v), b"}", v, b""]) + self.iter_stack.appendleft([iter(v), b"}", v, b""]) def _esc(self, data, quote=b"'"): return _str_to_bytes(data).replace(b"\\", b"\\\\").replace(quote, b'\\'+quote) @@ -465,9 +469,9 @@ def _write(self, something): """ - self.iter_stack = [[iter([something]), b"", None, b""]] + self.iter_stack.appendleft([iter([something]), b"", None, b""]) while True: - stack_top = self.iter_stack[-1] + stack_top = self.iter_stack[0] cur_iter, iter_type, iterable_obj, delim = stack_top try: item = next(cur_iter) @@ -491,7 +495,7 @@ def _write(self, something): self.type_map[item_type](item) except StopIteration: self.stream.write(iter_type) - self.iter_stack.pop() + self.iter_stack.popleft() if len(self.iter_stack) == 1: break From 1ebc616fe208ead4d84b789e923cdf64693ff87e Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Tue, 13 Jun 2023 22:51:05 +0000 Subject: [PATCH 14/15] SL-19707 - CR fixes remove some unused functions. 
--- llsd/base.py | 1 - llsd/serde_binary.py | 59 ------------------------------------------ llsd/serde_notation.py | 6 ++--- llsd/serde_xml.py | 31 ++++++++++++++-------- 4 files changed, 23 insertions(+), 74 deletions(-) diff --git a/llsd/base.py b/llsd/base.py index 318373f..d2602da 100644 --- a/llsd/base.py +++ b/llsd/base.py @@ -221,7 +221,6 @@ class LLSDBaseFormatter(object): def __init__(self): "Construct a new formatter dispatch table." self.stream = None - self.py2 = PY2 self.type_map = { type(None): self._UNDEF, undef: self._UNDEF, diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index 8414285..f766319 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -264,65 +264,6 @@ def write_binary(stream, something): return LLSDBinaryFormatter().write(stream, something) -def _write_binary_recurse(stream, something): - "Binary formatter workhorse." - if something is None: - stream.write(b'!') - elif isinstance(something, _LLSD): - _write_binary_recurse(stream, something.thing) - elif isinstance(something, bool): - stream.write(b'1' if something else b'0') - elif is_integer(something): - try: - stream.writelines([b'i', struct.pack('!i', something)]) - except (OverflowError, struct.error) as exc: - raise LLSDSerializationError(str(exc), something) - elif isinstance(something, float): - try: - stream.writelines([b'r', struct.pack('!d', something)]) - except SystemError as exc: - raise LLSDSerializationError(str(exc), something) - elif isinstance(something, uuid.UUID): - stream.writelines([b'u', something.bytes]) - elif isinstance(something, binary): - stream.writelines([b'b', struct.pack('!i', len(something)), something]) - elif is_string(something): - something = _str_to_bytes(something) - stream.writelines([b's', struct.pack('!i', len(something)), something]) - elif isinstance(something, uri): - stream.writelines([b'l', struct.pack('!i', len(something)), something]) - elif isinstance(something, datetime.datetime): - seconds_since_epoch = 
calendar.timegm(something.utctimetuple()) \ - + something.microsecond // 1e6 - stream.writelines([b'd', struct.pack('', base64.b64encode(v).strip(), b'']) def _STRING(self, v): - if self.py2: # pragma: no cover + # We don't simply have a function that encapsulates the PY2 vs PY3 calls, + # as that results in another function call and is slightly less performant + if PY2: # pragma: no cover return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'']) self.stream.writelines([b'', v.translate(XML_ESC_TRANS).encode('utf-8'), b'']) def _URI(self, v): - if self.py2: # pragma: no cover + # We don't simply have a function that encapsulates the PY2 vs PY3 calls, + # as that results in another function call and is slightly less performant + if PY2: # pragma: no cover return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'']) - self.stream.writelines([b'', UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8'), b'']) + self.stream.writelines([b'', str(v).translate(XML_ESC_TRANS).encode('utf-8'), b'']) def _DATE(self, v): self.stream.writelines([b'', _format_datestr(v), b'']) def _ARRAY(self, v): @@ -123,14 +128,16 @@ def _write(self, something): self.stream.write(b'' b'') - self.iter_stack = deque([(iter([something]), b"", None)]) + self.iter_stack.appendleft((iter([something]), b"", None)) while True: cur_iter, iter_type, iterable_obj = self.iter_stack[0] try: item = next(cur_iter) if iter_type == b"map": - if self.py2: # pragma: no cover + # We don't simply have a function that encapsulates the PY2 vs PY3 calls, + # as that results in another function call and is slightly less performant + if PY2: # pragma: no cover self.stream.writelines([b'', _str_to_bytes(xml_esc(UnicodeType(item))), b'']) @@ -138,7 +145,7 @@ def _write(self, something): # fair performance improvement by explicitly doing the # translate for py3 instead of calling xml_esc self.stream.writelines([b'', - UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8'), + 
str(item).translate(XML_ESC_TRANS).encode('utf-8'), b'']) item = iterable_obj[item] # pylint: disable=unsubscriptable-object while isinstance(item, _LLSD): @@ -217,14 +224,16 @@ def _write(self, something): item = next(cur_iter) if iter_type == b"map": self._indent() - if self.py2: # pragma: no cover + # We don't simply have a function that encapsulates the PY2 vs PY3 calls, + # as that results in another function call and is slightly less performant + if PY2: # pragma: no cover self.stream.write(b'' + _str_to_bytes(xml_esc(UnicodeType(item))) + b'') else: # calling translate directly is a bit faster self.stream.write(b'' + - UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8') + + str(item).translate(XML_ESC_TRANS).encode('utf-8') + b'\n') item = iterable_obj[item] # pylint: disable=unsubscriptable-object if isinstance(item, _LLSD): From 7c18a429a7b0a040845e02c05fe2c83e7d911105 Mon Sep 17 00:00:00 2001 From: Roxie Linden Date: Wed, 28 Jun 2023 16:54:17 +0000 Subject: [PATCH 15/15] SL-19707 - perf improvements --- llsd/serde_xml.py | 195 ++++++++++++++++++++++------------------------ 1 file changed, 95 insertions(+), 100 deletions(-) diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index fe71488..3bd66c6 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -61,7 +61,6 @@ def xml_esc(v): # pragma: no cover return v.replace(b'&',b'&').replace(b'<',b'<').replace(b'>',b'>') - class LLSDXMLFormatter(LLSDBaseFormatter): """ Class which implements LLSD XML serialization. 
@@ -79,85 +78,117 @@ def __init__(self, indent_atom = None): # Call the super class constructor so that we have the type map super(LLSDXMLFormatter, self).__init__() self.iter_stack = deque([]) + self._indent_atom = b'' + self._eol = b'' + self._indent_level = 0 + + def _indent(self): + pass def _LLSD(self, v): raise LLSDSerializationError("We should never end up here") # pragma: no cover def _UNDEF(self, _v): - self.stream.write(b'') + self.stream.writelines([b'', self._eol]) def _BOOLEAN(self, v): if v: - return self.stream.write(b'true') - self.stream.write(b'false') + return self.stream.writelines([b'true', self._eol]) + self.stream.writelines([b'false', self._eol]) def _INTEGER(self, v): - self.stream.writelines([b'', str(v).encode('utf-8'), b'']) + self.stream.writelines([b'', str(v).encode('utf-8'), b'', self._eol]) def _REAL(self, v): - self.stream.writelines([b'', str(v).encode('utf-8'), b'']) + self.stream.writelines([b'', str(v).encode('utf-8'), b'', self._eol]) def _UUID(self, v): if v.int == 0: - return self.stream.write(b'') - self.stream.writelines([b'', str(v).encode('utf-8'), b'']) + return self.stream.writelines([b'', self._eol]) + self.stream.writelines([b'', str(v).encode('utf-8'), b'', self._eol]) def _BINARY(self, v): - self.stream.writelines([b'', base64.b64encode(v).strip(), b'']) + self.stream.writelines([b'', base64.b64encode(v).strip(), b'', self._eol]) def _STRING(self, v): # We don't simply have a function that encapsulates the PY2 vs PY3 calls, # as that results in another function call and is slightly less performant if PY2: # pragma: no cover - return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'']) - self.stream.writelines([b'', v.translate(XML_ESC_TRANS).encode('utf-8'), b'']) + return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'', self._eol]) + self.stream.writelines([b'', v.translate(XML_ESC_TRANS).encode('utf-8'), b'', self._eol]) def _URI(self, v): # We don't simply have a function that encapsulates 
the PY2 vs PY3 calls, # as that results in another function call and is slightly less performant if PY2: # pragma: no cover - return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'']) - self.stream.writelines([b'', str(v).translate(XML_ESC_TRANS).encode('utf-8'), b'']) + return self.stream.writelines([b'', _str_to_bytes(xml_esc(v)), b'', self._eol]) + self.stream.writelines([b'', str(v).translate(XML_ESC_TRANS).encode('utf-8'), b'', self._eol]) def _DATE(self, v): - self.stream.writelines([b'', _format_datestr(v), b'']) + self.stream.writelines([b'', _format_datestr(v), b'', self._eol]) def _ARRAY(self, v): - self.stream.write(b'') + self.stream.writelines([b'', self._eol]) self.iter_stack.appendleft((iter(v), b"array", None)) + self._indent_level = self._indent_level + 1 def _MAP(self, v): - self.stream.write(b'') + self.stream.writelines([b'', self._eol]) self.iter_stack.appendleft((iter(v), b"map", v)) + self._indent_level = self._indent_level + 1 def _write(self, something): """ Serialize a python object to self.stream as application/llsd+xml. + This serializer is iterative instead of recursive. Each element in + iter_stack is an iterator into either the list or the dict in the tree. + This limits depth by size of free memory instead of size of the function + call stack, allowing us to render deeper trees than a recursive model. :param something: A python object (typically a dict) to be serialized. """ - self.stream.write(b'' - b'') - + self.stream.writelines([b'', self._eol, + b'', self._eol]) + + # start by pushing the passed-in element onto the stack + # as an element of a tuple. The array acts as the + # root node. + # each element of the iter_stack is: + # 0 - iterator indicating the current position in the given level of the tree + # this can be either a list iterator position, or an iterator of + # keys for the dict. + # 1 - the type string for the element. Used to close out the xml element. + # 2 - the actual element object. 
self.iter_stack.appendleft((iter([something]), b"", None)) while True: cur_iter, iter_type, iterable_obj = self.iter_stack[0] try: item = next(cur_iter) + except StopIteration: + self._indent_level -= 1 + self._indent() + self.stream.writelines([b'', self._eol]) + self.iter_stack.popleft() + else: if iter_type == b"map": - + self._indent() # We don't simply have a function that encapsulates the PY2 vs PY3 calls, # as that results in another function call and is slightly less performant if PY2: # pragma: no cover self.stream.writelines([b'', _str_to_bytes(xml_esc(UnicodeType(item))), - b'']) + b'', self._eol]) else: # fair performance improvement by explicitly doing the # translate for py3 instead of calling xml_esc self.stream.writelines([b'', str(item).translate(XML_ESC_TRANS).encode('utf-8'), - b'']) + b'', self._eol]) + # grab the item from the dict item = iterable_obj[item] # pylint: disable=unsubscriptable-object + + # LLSD nodes point to an item, and we want to + # render the LLSD XML for the item itself. 
while isinstance(item, _LLSD): item = item.thing + item_type = type(item) - if not item_type in self.type_map: + try: + tfunction = self.type_map[item_type] + except KeyError: raise LLSDSerializationError( "Cannot serialize unknown type: %s (%s)" % (item_type, item)) - self.type_map[item_type](item) - except StopIteration: - self.stream.writelines([b'']) - self.iter_stack.popleft() + self._indent() + tfunction(item) if len(self.iter_stack) == 1: break self.stream.write(b'') @@ -187,74 +218,12 @@ def __init__(self, indent_atom = None): self._indent_atom = b' ' else: self._indent_atom = indent_atom - self.iter_stack = None - - def _ARRAY(self, v): - self.stream.write(b'') - self._indent_level += 1 - self.iter_stack.append((iter(v), b"array", None)) - - def _MAP(self, v): - self.stream.write(b'') - self._indent_level += 1 - self.iter_stack.append((iter(v), b"map", v)) + self._eol = b'\n' def _indent(self): "Write an indentation based on the atom and indentation level." self.stream.writelines([self._indent_atom] * self._indent_level) - def _write(self, something): - """ - Serialize a python object to self.stream as application/llsd+xml. - - :param something: A python object (typically a dict) to be serialized. - - NOTE: This is nearly identical to the above _write with the exception - that this one includes newlines and indentation. Doing something clever - for the above may decrease performance for the common case, so it's been - split out. We can probably revisit this, though. 
- """ - self.stream.write(b'\n' - b'\n') - - self.iter_stack = [(iter([something]), b"", None)] - while True: - cur_iter, iter_type, iterable_obj = self.iter_stack[-1] - try: - item = next(cur_iter) - if iter_type == b"map": - self._indent() - # We don't simply have a function that encapsulates the PY2 vs PY3 calls, - # as that results in another function call and is slightly less performant - if PY2: # pragma: no cover - self.stream.write(b'' + - _str_to_bytes(xml_esc(UnicodeType(item))) + - b'') - else: - # calling translate directly is a bit faster - self.stream.write(b'' + - str(item).translate(XML_ESC_TRANS).encode('utf-8') + - b'\n') - item = iterable_obj[item] # pylint: disable=unsubscriptable-object - if isinstance(item, _LLSD): - item = item.thing - item_type = type(item) - if not item_type in self.type_map: - raise LLSDSerializationError( - "Cannot serialize unknown type: %s (%s)" % (item_type, item)) - - self._indent() - self.type_map[item_type](item) - self.stream.write(b'\n') - except StopIteration: - self._indent_level -= 1 - self._indent() - self.stream.write(b'\n') - self.iter_stack.pop() - if len(self.iter_stack) == 1: - break - self.stream.write(b'\n') - def format_pretty_xml(something): """ @@ -395,10 +364,19 @@ def _array_to_python(self, node): def parse_node(self, something): """ - Parse an xml node tree via iteration. + Parse an ElementTree tree. + This parser is iterative instead of recursive. It uses an explicit stack: + each element in parse_stack is an iterator into either the list + or the dict in the tree. This limits depth by size of free memory + instead of size of the function call stack, allowing us to parse + deeper trees than a recursive model. :param something: The xml node to parse. :returns: Returns a python object. """ + + # if the passed in element is not a map or array, simply return + # its value. Otherwise, create a dict or array to receive + # child/leaf elements.
if something.tag == "map": cur_result = {} elif something.tag == "array": @@ -408,27 +386,44 @@ def parse_node(self, something): raise LLSDParseError("Unknown value type %s" % something.tag) return self.NODE_HANDLERS[something.tag](something) + # start by pushing the current element iterator data onto + # the stack + # 0 - iterator indicating the current position in the given level of the tree + # this can be either a list iterator position, or an iterator of + # keys for the dict. + # 1 - the actual element object. + # 2 - the result for this level in the tree, onto which + # children or leafs will be appended/set self.parse_stack.appendleft([iter(something), something, cur_result]) while True: node_iter, iterable, cur_result = self.parse_stack[0] try: value = next(node_iter) + + except StopIteration: + node_iter, iterable, cur_result = self.parse_stack.popleft() + if len(self.parse_stack) == 0: + break + else: if iterable.tag == "map": if value.tag != "key": raise LLSDParseError("Expected 'key', got %s" % value.tag) key = value.text if key is None: key = '' - value = next(node_iter) - cur_result[key] = self.NODE_HANDLERS[value.tag](value) + try: + value = next(node_iter) + except StopIteration: + raise LLSDParseError("No value for map item %s" % key) + try: + cur_result[key] = self.NODE_HANDLERS[value.tag](value) + except KeyError as err: + raise LLSDParseError("Unknown value type: " + str(err)) elif iterable.tag == "array": - cur_result.append(self.NODE_HANDLERS[value.tag](value)) - except KeyError as err: - raise LLSDParseError("Unknown value type: " + str(err)) - except StopIteration: - node_iter, iterable, cur_result = self.parse_stack.popleft() - if len(self.parse_stack) == 0: - break + try: + cur_result.append(self.NODE_HANDLERS[value.tag](value)) + except KeyError as err: + raise LLSDParseError("Unknown value type: " + str(err)) return cur_result def parse_xml(something):