diff --git a/.github/workflows/cla.yaml b/.github/workflows/cla.yaml new file mode 100644 index 0000000..fa180c6 --- /dev/null +++ b/.github/workflows/cla.yaml @@ -0,0 +1,25 @@ +name: Check CLA + +on: + issue_comment: + types: [created] + pull_request_target: + types: [opened, closed, synchronize] + +jobs: + cla: + name: Check CLA + runs-on: ubuntu-latest + steps: + - name: CLA Assistant + if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' + uses: secondlife-3p/contributor-assistant@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PERSONAL_ACCESS_TOKEN: ${{ secrets.SHARED_CLA_TOKEN }} + with: + branch: main + path-to-document: https://github.com/secondlife/cla/blob/master/CLA.md + path-to-signatures: signatures.json + remote-organization-name: secondlife + remote-repository-name: cla-signatures diff --git a/CREDITS.md b/CREDITS.md index 8ff6129..df3f841 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -1,4 +1,4 @@ # Credits Thanks to [Tao Takashi](https://github.com/mrtopf) for -llsd PyPI package name. +the llsd PyPI package name. 
diff --git a/llsd/__init__.py b/llsd/__init__.py index 9727479..85c3471 100644 --- a/llsd/__init__.py +++ b/llsd/__init__.py @@ -9,31 +9,9 @@ """ from llsd.base import (_LLSD, BINARY_MIME_TYPE, NOTATION_MIME_TYPE, XML_MIME_TYPE, LLSDParseError, LLSDSerializationError, LongType, UnicodeType, binary, starts_with, undef, uri) -from llsd.serde_binary import format_binary, parse_binary -from llsd.serde_notation import format_notation, parse_notation -from llsd.serde_xml import format_pretty_xml, format_xml, parse_xml - -__all__ = [ - "BINARY_MIME_TYPE", - "LLSD", - "LLSDParseError", - "LLSDSerializationError", - "LongType", - "NOTATION_MIME_TYPE", - "UnicodeType", - "XML_MIME_TYPE", - "binary", - "format_binary", - "format_notation", - "format_pretty_xml", - "format_xml", - "parse", - "parse_binary", - "parse_notation", - "parse_xml", - "undef", - "uri", -] +from llsd.serde_binary import LLSDBinaryParser, format_binary, parse_binary +from llsd.serde_notation import LLSDNotationFormatter, LLSDNotationParser, format_notation, parse_notation +from llsd.serde_xml import LLSDXMLFormatter, LLSDXMLPrettyFormatter, format_pretty_xml, format_xml, parse_xml def parse(something, mime_type = None): @@ -81,4 +59,4 @@ def __str__(self): as_xml = staticmethod(format_xml) as_pretty_xml = staticmethod(format_pretty_xml) as_binary = staticmethod(format_binary) - as_notation = staticmethod(format_notation) \ No newline at end of file + as_notation = staticmethod(format_notation) diff --git a/llsd/base.py b/llsd/base.py index 6ac695f..544f480 100644 --- a/llsd/base.py +++ b/llsd/base.py @@ -141,6 +141,19 @@ def B(fmt): return fmt +class PY3SemanticBytes(BytesType): + """Wrapper to make `buffer[n]` return an integer like in Py3""" + __slots__ = [] + + def __getitem__(self, item): + ret = super(PY3SemanticBytes, self).__getitem__(item) + # `buffer[n]` should return an integer, but slice syntax like + # `buffer[n:n+1]` should still return a `Bytes` object as before. 
+ if is_integer(item): + return ord(ret) + return ret + + def is_integer(o): """ portable test if an object is like an int """ return isinstance(o, IntTypes) @@ -321,19 +334,6 @@ def _to_python(node): return NODE_HANDLERS[node.tag](node) -def _hex_as_nybble(hex): - "Accepts a single hex character and returns a nybble." - if (hex >= b'0') and (hex <= b'9'): - return ord(hex) - ord(b'0') - elif (hex >= b'a') and (hex <=b'f'): - return 10 + ord(hex) - ord(b'a') - elif (hex >= b'A') and (hex <=b'F'): - return 10 + ord(hex) - ord(b'A') - else: - raise LLSDParseError('Invalid hex character: %s' % hex) - - - class LLSDBaseFormatter(object): """ This base class cannot be instantiated on its own: it assumes a subclass @@ -366,13 +366,22 @@ def __init__(self): } +_X_ORD = ord(b'x') +_BACKSLASH_ORD = ord(b'\\') +_DECODE_BUFF_ALLOC_SIZE = 1024 + + class LLSDBaseParser(object): """ Utility methods useful for parser subclasses. """ + __slots__ = ['_buffer', '_index', '_decode_buff'] + def __init__(self): self._buffer = b'' - self._index = 0 + self._index = 0 + # Scratch space for decoding delimited strings + self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE) def _error(self, message, offset=0): try: @@ -399,53 +408,85 @@ def _getc(self, num=1): # map char following escape char to corresponding character _escaped = { - b'a': b'\a', - b'b': b'\b', - b'f': b'\f', - b'n': b'\n', - b'r': b'\r', - b't': b'\t', - b'v': b'\v', + ord(b'a'): ord(b'\a'), + ord(b'b'): ord(b'\b'), + ord(b'f'): ord(b'\f'), + ord(b'n'): ord(b'\n'), + ord(b'r'): ord(b'\r'), + ord(b't'): ord(b'\t'), + ord(b'v'): ord(b'\v'), } def _parse_string_delim(self, delim): "Parse a delimited string." - parts = bytearray() - found_escape = False - found_hex = False - found_digit = False - byte = 0 + insert_idx = 0 + delim_ord = ord(delim) + # Preallocate a working buffer for the decoded string output + # to avoid allocs in the hot loop. 
+ decode_buff = self._decode_buff + # Cache these in locals, otherwise we have to perform a lookup on + # `self` in the hot loop. + buff = self._buffer + read_idx = self._index + cc = 0 while True: - cc = self._getc() - if found_escape: - if found_hex: - if found_digit: - found_escape = False - found_hex = False - found_digit = False - byte <<= 4 - byte |= _hex_as_nybble(cc) - parts.append(byte) - byte = 0 + try: + cc = buff[read_idx] + read_idx += 1 + + if cc == _BACKSLASH_ORD: + # Backslash, figure out if this is an \xNN hex escape or + # something like \t + cc = buff[read_idx] + read_idx += 1 + if cc == _X_ORD: + # It's a hex escape. char is the value of the two + # following hex nybbles. This slice may result in + # a short read (0 or 1 bytes): an empty slice makes + # `int()` raise a `ValueError` just below, and a + # 1-byte slice that parses hits an `IndexError` on + # the next iteration of the loop. + hex_bytes = buff[read_idx:read_idx + 2] + read_idx += 2 + try: + # int() can parse a `bytes` containing hex, + # no explicit `bytes.decode("ascii")` required. + cc = int(hex_bytes, 16) + except ValueError as e: + # One of the hex characters was likely invalid. + # Wrap the ValueError so that we can provide a + # byte offset in the error. 
+ self._index = read_idx + self._error(e, offset=-2) else: - found_digit = True - byte = _hex_as_nybble(cc) - elif cc == b'x': - found_hex = True - else: - found_escape = False - # escape char preceding anything other than the chars in - # _escaped just results in that same char without the - # escape char - parts.extend(self._escaped.get(cc, cc)) - elif cc == b'\\': - found_escape = True - elif cc == delim: - break - else: - parts.extend(cc) + # escape char preceding anything other than the chars + # in _escaped just results in that same char without + # the escape char + cc = self._escaped.get(cc, cc) + elif cc == delim_ord: + break + except IndexError: + # We can be reasonably sure that any IndexErrors inside here + # were caused by an out-of-bounds `buff[read_idx]`. + self._index = read_idx + self._error("Trying to read past end of buffer") + + try: + decode_buff[insert_idx] = cc + except IndexError: + # Oops, that overflowed the decoding buffer, make a + # new expanded buffer containing the existing contents. 
+ decode_buff = bytearray(decode_buff) + decode_buff.extend(b"\x00" * _DECODE_BUFF_ALLOC_SIZE) + decode_buff[insert_idx] = cc + + insert_idx += 1 + + # Sync our local read index with the canonical one + self._index = read_idx try: - return parts.decode('utf-8') + # Slice off only what we used of the working decode buffer + return decode_buff[:insert_idx].decode('utf-8') except UnicodeDecodeError as exc: self._error(exc) @@ -457,4 +498,4 @@ def starts_with(startstr, something): pos = something.tell() s = something.read(len(startstr)) something.seek(pos, os.SEEK_SET) - return (s == startstr) \ No newline at end of file + return (s == startstr) diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py index 41d24f7..cbf65e4 100644 --- a/llsd/serde_binary.py +++ b/llsd/serde_binary.py @@ -4,7 +4,7 @@ import uuid from llsd.base import (_LLSD, LLSDBaseParser, LLSDSerializationError, _str_to_bytes, binary, is_integer, is_string, - starts_with, uri) + starts_with, uri, PY2, is_bytes, PY3SemanticBytes) class LLSDBinaryParser(LLSDBaseParser): @@ -13,6 +13,8 @@ class LLSDBinaryParser(LLSDBaseParser): See http://wiki.secondlife.com/wiki/LLSD#Binary_Serialization """ + __slots__ = ['_dispatch', '_keep_binary'] + def __init__(self): super(LLSDBinaryParser, self).__init__() # One way of dispatching based on the next character we see would be a @@ -61,6 +63,10 @@ def parse(self, buffer, ignore_binary = False): :param ignore_binary: parser throws away data in llsd binary nodes. :returns: returns a python object. 
""" + if PY2 and is_bytes(buffer): + # We need to wrap this in a helper class so that individual element + # access works the same as in PY3 + buffer = PY3SemanticBytes(buffer) self._buffer = buffer self._index = 0 self._keep_binary = not ignore_binary diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py index 73cae03..e2e9340 100644 --- a/llsd/serde_notation.py +++ b/llsd/serde_notation.py @@ -4,7 +4,7 @@ import uuid from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, LLSDParseError, LLSDSerializationError, UnicodeType, - _format_datestr, _parse_datestr, _str_to_bytes, binary, uri) + _format_datestr, _parse_datestr, _str_to_bytes, binary, uri, PY2, is_bytes, PY3SemanticBytes) _int_regex = re.compile(br"[-+]?\d+") _real_regex = re.compile(br"[-+]?(?:(\d+(\.\d*)?|\d*\.\d+)([eE][-+]?\d+)?)|[-+]?inf|[-+]?nan") @@ -86,6 +86,11 @@ def parse(self, buffer, ignore_binary = False): if buffer == b"": return False + if PY2 and is_bytes(buffer): + # We need to wrap this in a helper class so that individual element + # access works the same as in PY3 + buffer = PY3SemanticBytes(buffer) + self._buffer = buffer self._index = 0 return self._parse() @@ -328,6 +333,8 @@ class LLSDNotationFormatter(LLSDBaseFormatter): See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization """ + __slots__ = [] + def LLSD(self, v): return self._generate(v.thing) def UNDEF(self, v): diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py index fcec338..c8404a5 100644 --- a/llsd/serde_xml.py +++ b/llsd/serde_xml.py @@ -36,6 +36,7 @@ class LLSDXMLFormatter(LLSDBaseFormatter): module level format_xml is the most convenient interface to this functionality. """ + __slots__ = [] def _elt(self, name, contents=None): "Serialize a single element." 
diff --git a/tests/llsd_test.py b/tests/llsd_test.py index e8d5fe4..b86ab96 100644 --- a/tests/llsd_test.py +++ b/tests/llsd_test.py @@ -507,6 +507,21 @@ def testParseNotationIncorrectMIME(self): except llsd.LLSDParseError: pass + def testParseNotationUnterminatedString(self): + """ + Test with an unterminated delimited string + """ + self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'foo") + + def testParseNotationHexEscapeNoChars(self): + self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\x") + + def testParseNotationHalfTruncatedHex(self): + self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\xf") + + def testParseNotationInvalidHex(self): + self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\xzz'") + class LLSDBinaryUnitTest(unittest.TestCase): """