diff --git a/adafruit_json_stream.py b/adafruit_json_stream.py
index 0c08ae5..b5172d7 100644
--- a/adafruit_json_stream.py
+++ b/adafruit_json_stream.py
@@ -40,7 +40,9 @@ def read(self):
         self.i += 1
         return char
 
-    def fast_forward(self, closer, *, return_object=False):
+    def fast_forward(
+        self, closer, *, return_object=False
+    ):  # pylint: disable=too-many-branches
         """
         Read through the stream until the character is ``closer``, ``]``
         (ending a list) or ``}`` (ending an object.) Intermediate lists and
@@ -62,6 +64,7 @@ def fast_forward(self, closer, *, return_object=False):
             # } = 125, { = 123
             buffer[0] = closer - 2
 
+        ignore_next = False
         while close_stack:
             char = self.read()
             count += 1
@@ -71,8 +74,14 @@ def fast_forward(self, closer, *, return_object=False):
                     new_buffer[: len(buffer)] = buffer
                     buffer = new_buffer
                 buffer[count] = char
-            if char == close_stack[-1]:
+            if ignore_next:
+                # that character was escaped, skip it
+                ignore_next = False
+            elif char == close_stack[-1]:
                 close_stack.pop()
+            elif char == ord("\\") and close_stack[-1] == ord('"'):
+                # if backslash, ignore the next character
+                ignore_next = True
             elif char == ord('"'):
                 close_stack.append(ord('"'))
             elif close_stack[-1] == ord('"'):
@@ -96,26 +105,41 @@ def next_value(self, endswith=None):
         if isinstance(endswith, str):
             endswith = ord(endswith)
         in_string = False
+        ignore_next = False
         while True:
             try:
                 char = self.read()
             except EOFError:
                 char = endswith
-            if not in_string and (char == endswith or char in (ord("]"), ord("}"))):
-                self.last_char = char
-                if len(buf) == 0:
-                    return None
-                value_string = bytes(buf).decode("utf-8")
-                return json.loads(value_string)
-            if char == ord("{"):
-                return TransientObject(self)
-            if char == ord("["):
-                return TransientList(self)
+                in_string = False
+                ignore_next = False
 
             if not in_string:
-                in_string = char == ord('"')
+                # end character or object/list end
+                if char == endswith or char in (ord("]"), ord("}")):
+                    self.last_char = char
+                    if len(buf) == 0:
+                        return None
+                    value_string = bytes(buf).decode("utf-8")
+                    return json.loads(value_string)
+                # string or sub object
+                if char == ord("{"):
+                    return TransientObject(self)
+                if char == ord("["):
+                    return TransientList(self)
+                # start a string
+                if char == ord('"'):
+                    in_string = True
             else:
-                in_string = char != ord('"')
+                # skipping any closing or opening character if in a string
+                # also skipping escaped characters (like quotes in string)
+                if ignore_next:
+                    ignore_next = False
+                elif char == ord("\\"):
+                    ignore_next = True
+                elif char == ord('"'):
+                    in_string = False
+
             buf.append(char)
 
 
@@ -130,7 +154,7 @@ def __init__(self, stream):
         self.finish_char = ""
 
     def finish(self):
-        """Consume all of the characters for this list from the stream."""
+        """Consume all of the characters for this container from the stream."""
         if not self.done:
             if self.active_child:
                 self.active_child.finish()
@@ -139,7 +163,8 @@ def finish(self):
             self.done = True
 
     def as_object(self):
-        """Consume all of the characters for this list from the stream and return as an object."""
+        """Consume all of the characters for this container from the stream
+        and return as an object."""
         if self.has_read:
             raise BufferError("Object has already been partly read.")
 
@@ -183,10 +208,17 @@ class TransientObject(Transient):
     def __init__(self, stream):
         super().__init__(stream)
         self.finish_char = "}"
-        self.active_child_key = None
+        self.active_key = None
+
+    def finish(self):
+        """Consume all of the characters for this container from the stream."""
+        if self.active_key is not None and not self.active_child:
+            self.done = self.data.fast_forward(",")
+        self.active_key = None
+        super().finish()
 
     def __getitem__(self, key):
-        if self.active_child and self.active_child_key == key:
+        if self.active_child and self.active_key == key:
             return self.active_child
 
         self.has_read = True
@@ -195,12 +227,16 @@ def __getitem__(self, key):
             self.active_child.finish()
             self.done = self.data.fast_forward(",")
             self.active_child = None
-            self.active_child_key = None
+            self.active_key = None
         if self.done:
             raise KeyError(key)
 
         while not self.done:
-            current_key = self.data.next_value(":")
+            if self.active_key is not None:  # "" is a valid JSON key
+                current_key = self.active_key
+                self.active_key = None
+            else:
+                current_key = self.data.next_value(":")
             if current_key is None:
                 self.done = True
                 break
@@ -210,11 +246,47 @@ def __getitem__(self, key):
                     self.done = True
                 if isinstance(next_value, Transient):
                     self.active_child = next_value
-                    self.active_child_key = key
+                    self.active_key = key
                 return next_value
             self.done = self.data.fast_forward(",")
         raise KeyError(key)
 
+    def __iter__(self):
+        return self
+
+    def _next_key(self):
+        """Return the next item's key, without consuming the value."""
+        if self.active_key is not None:  # "" is a valid JSON key
+            if self.active_child:
+                self.active_child.finish()
+                self.active_child = None
+            self.done = self.data.fast_forward(",")
+            self.active_key = None
+        if self.done:  # never read past the end of this object
+            raise StopIteration()
+
+        self.has_read = True
+
+        current_key = self.data.next_value(":")
+        if current_key is None:
+            self.done = True
+            raise StopIteration()
+
+        self.active_key = current_key
+        return current_key
+
+    def __next__(self):
+        return self._next_key()
+
+    def items(self):
+        """Return an iterator over the object's (key, value) pairs."""
+        try:
+            while not self.done:
+                key = self._next_key()
+                yield (key, self[key])
+        except StopIteration:
+            return
+
 
 def load(data_iter):
     """Returns an object to represent the top level of the given JSON stream."""
diff --git a/examples/json_stream_local_file_advanced.py b/examples/json_stream_local_file_advanced.py
new file mode 100644
index 0000000..2920619
--- /dev/null
+++ b/examples/json_stream_local_file_advanced.py
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 Scott Shawcroft for Adafruit Industries
+#
+# SPDX-License-Identifier: Unlicense
+
+import sys
+import time
+
+import adafruit_json_stream as json_stream
+
+# import json_stream
+
+
+class FakeResponse:
+    def __init__(self, file):
+        self.file = file
+
+    def iter_content(self, chunk_size):
+        while True:
+            yield self.file.read(chunk_size)
+
+
+f = open(sys.argv[1], "rb")  # pylint: disable=consider-using-with
+obj = json_stream.load(FakeResponse(f).iter_content(32))
+
+
+def find_keys(haystack, keys):
+    """If we don't know the order in which the keys are,
+    go through all of them and pick the ones we want"""
+    out = {}
+    # iterate on the items of an object
+    for key in haystack:
+        if key in keys:
+            # retrieve the value only if needed
+            value = haystack[key]
+            # if it's a sub object, get it all
+            if hasattr(value, "as_object"):
+                value = value.as_object()
+            out[key] = value
+    return out
+
+
+months = [
+    "January",
+    "February",
+    "March",
+    "April",
+    "May",
+    "June",
+    "July",
+    "August",
+    "September",
+    "October",
+    "November",
+    "December",
+]
+
+
+def time_to_date(stamp):
+    tt = time.localtime(stamp)
+    month = months[tt.tm_mon - 1]  # tm_mon is 1-based (1..12)
+    return f"{tt.tm_mday:2d}th of {month}"
+
+
+def ftoc(temp):
+    return (temp - 32) * 5 / 9
+
+
+currently = obj["currently"]
+print("Currently:")
+print(" ", time_to_date(currently["time"]))
+print(" ", currently["icon"])
+
+# iterate on the content of a list
+for i, day in enumerate(obj["daily"]["data"]):
+    day_items = find_keys(day, ("time", "summary", "temperatureHigh"))
+    date = time_to_date(day_items["time"])
+    print(
+        f'On {date}: {day_items["summary"]},',
+        f'Max: {int(day_items["temperatureHigh"])}F',
+        f'({int(ftoc(day_items["temperatureHigh"]))}C)',
+    )
+
+    if i > 4:
+        break
diff --git a/tests/test_json_stream.py b/tests/test_json_stream.py
index b8197fe..7ed05c9 100644
--- a/tests/test_json_stream.py
+++ b/tests/test_json_stream.py
@@ -66,6 +66,38 @@ def dict_with_all_types():
     """
 
 
+@pytest.fixture
+def list_with_bad_strings():
+    return r"""
+    [
+        "\"}\"",
+        "{\"a\": 1, \"b\": [2,3]}",
+        "\"",
+        "\\\"",
+        "\\\\\"",
+        "\\x40\"",
+        "[[[{{{",
+        "]]]}}}"
+    ]
+    """
+
+
+@pytest.fixture
+def dict_with_bad_strings():
+    return r"""
+    {
+        "1": "\"}\"",
+        "2": "{\"a\": 1, \"b\": [2,3]}",
+        "3": "\"",
+        "4": "\\\"",
+        "5": "\\\\\"",
+        "6": "\\x40\"",
+        "7": "[[[{{{",
+        "8": "]]]}}}"
+    }
+    """
+
+
 @pytest.fixture
 def list_with_values():
     return """
@@ -308,6 +340,116 @@ def test_complex_dict(complex_dict):
     assert sub_counter == 12
 
 
+def test_bad_strings_in_list(list_with_bad_strings):
+    """Test loading different strings that can confuse the parser."""
+
+    bad_strings = [
+        '"}"',
+        '{"a": 1, "b": [2,3]}',
+        '"',
+        '\\"',
+        '\\\\"',
+        '\\x40"',
+        "[[[{{{",
+        "]]]}}}",
+    ]
+
+    assert json.loads(list_with_bad_strings)
+
+    # get each separately
+    stream = adafruit_json_stream.load(BytesChunkIO(list_with_bad_strings.encode()))
+    for i, item in enumerate(stream):
+        assert item == bad_strings[i]
+
+
+def test_bad_strings_in_list_iter(list_with_bad_strings):
+    """Test loading different strings that can confuse the parser."""
+
+    bad_strings = [
+        '"}"',
+        '{"a": 1, "b": [2,3]}',
+        '"',
+        '\\"',
+        '\\\\"',
+        '\\x40"',
+        "[[[{{{",
+        "]]]}}}",
+    ]
+
+    assert json.loads(list_with_bad_strings)
+
+    # get each separately
+    stream = adafruit_json_stream.load(BytesChunkIO(list_with_bad_strings.encode()))
+    for i, item in enumerate(stream):
+        assert item == bad_strings[i]
+
+
+def test_bad_strings_in_dict_as_object(dict_with_bad_strings):
+    """Test loading different strings that can confuse the parser."""
+
+    bad_strings = {
+        "1": '"}"',
+        "2": '{"a": 1, "b": [2,3]}',
+        "3": '"',
+        "4": '\\"',
+        "5": '\\\\"',
+        "6": '\\x40"',
+        "7": "[[[{{{",
+        "8": "]]]}}}",
+    }
+
+    # read all at once
+    stream = adafruit_json_stream.load(BytesChunkIO(dict_with_bad_strings.encode()))
+    assert stream.as_object() == bad_strings
+
+
+def test_bad_strings_in_dict_all_keys(dict_with_bad_strings):
+    """Test loading different strings that can confuse the parser."""
+
+    bad_strings = {
+        "1": '"}"',
+        "2": '{"a": 1, "b": [2,3]}',
+        "3": '"',
+        "4": '\\"',
+        "5": '\\\\"',
+        "6": '\\x40"',
+        "7": "[[[{{{",
+        "8": "]]]}}}",
+    }
+
+    # read one after the other with keys
+    stream = adafruit_json_stream.load(BytesChunkIO(dict_with_bad_strings.encode()))
+    assert stream["1"] == bad_strings["1"]
+    assert stream["2"] == bad_strings["2"]
+    assert stream["3"] == bad_strings["3"]
+    assert stream["4"] == bad_strings["4"]
+    assert stream["5"] == bad_strings["5"]
+    assert stream["6"] == bad_strings["6"]
+    assert stream["7"] == bad_strings["7"]
+    assert stream["8"] == bad_strings["8"]
+
+
+def test_bad_strings_in_dict_skip_some(dict_with_bad_strings):
+    """Test loading different strings that can confuse the parser."""
+
+    bad_strings = {
+        "1": '"}"',
+        "2": '{"a": 1, "b": [2,3]}',
+        "3": '"',
+        "4": '\\"',
+        "5": '\\\\"',
+        "6": '\\x40"',
+        "7": "[[[{{{",
+        "8": "]]]}}}",
+    }
+
+    # read some, skip some
+    stream = adafruit_json_stream.load(BytesChunkIO(dict_with_bad_strings.encode()))
+    assert stream["2"] == bad_strings["2"]
+    assert stream["5"] == bad_strings["5"]
+    assert stream["8"] == bad_strings["8"]
+
+
 def test_complex_dict_grabbing(complex_dict):
     """Test loading a complex dict and grabbing specific keys."""
 
@@ -543,3 +685,78 @@ def test_as_object_grabbing_multiple_subscriptable_levels_again_after_passed_rai
     assert next(dict_1["sub_list"]) == "a"
     with pytest.raises(KeyError, match="sub_dict"):
         dict_1["sub_dict"]["sub_dict_name"]
+
+
+def test_iterating_keys(dict_with_keys):
+    """Iterate through keys of a simple object."""
+
+    bytes_io_chunk = BytesChunkIO(dict_with_keys.encode())
+    stream = adafruit_json_stream.load(bytes_io_chunk)
+    output = list(stream)
+    assert output == ["field_1", "field_2", "field_3"]
+
+
+def test_iterating_keys_get(dict_with_keys):
+    """Iterate through keys of a simple object and get values."""
+
+    the_dict = json.loads(dict_with_keys)
+
+    bytes_io_chunk = BytesChunkIO(dict_with_keys.encode())
+    stream = adafruit_json_stream.load(bytes_io_chunk)
+    for key in stream:
+        value = stream[key]
+        assert value == the_dict[key]
+
+
+def test_iterating_items(dict_with_keys):
+    """Iterate through items of a simple object."""
+
+    bytes_io_chunk = BytesChunkIO(dict_with_keys.encode())
+    stream = adafruit_json_stream.load(bytes_io_chunk)
+    output = list(stream.items())
+    assert output == [("field_1", 1), ("field_2", 2), ("field_3", 3)]
+
+
+def test_iterating_keys_after_get(dict_with_keys):
+    """Iterate through keys of a simple object after an item has already been read."""
+
+    bytes_io_chunk = BytesChunkIO(dict_with_keys.encode())
+    stream = adafruit_json_stream.load(bytes_io_chunk)
+    assert stream["field_1"] == 1
+    output = list(stream)
+    assert output == ["field_2", "field_3"]
+
+
+def test_iterating_items_after_get(dict_with_keys):
+    """Iterate through items of a simple object after an item has already been read."""
+
+    bytes_io_chunk = BytesChunkIO(dict_with_keys.encode())
+    stream = adafruit_json_stream.load(bytes_io_chunk)
+    assert stream["field_1"] == 1
+    output = list(stream.items())
+    assert output == [("field_2", 2), ("field_3", 3)]
+
+
+def test_iterating_complex_dict(complex_dict):
+    """Mix iterating over items of objects in objects in arrays."""
+
+    names = ["one", "two", "three", "four"]
+    sub_values = [None, "two point one", "three point one", None]
+
+    stream = adafruit_json_stream.load(BytesChunkIO(complex_dict.encode()))
+
+    thing_num = 0
+    for (index, item) in enumerate(stream.items()):
+        key, a_list = item
+        assert key == f"list_{index+1}"
+        for thing in a_list:
+            assert thing["dict_name"] == names[thing_num]
+            for sub_key in thing["sub_dict"]:
+                # break after getting a key with or without the value
+                # (testing finish() called from the parent list)
+                if sub_key == "sub_dict_name":
+                    if thing_num in {1, 2}:
+                        value = thing["sub_dict"][sub_key]
+                        assert value == sub_values[thing_num]
+                    break
+            thing_num += 1