Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7d6e076

Browse files
committed
Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same strings as keys.
1 parent d9107aa commit 7d6e076

5 files changed

Lines changed: 131 additions & 48 deletions

File tree

Lib/json/decoder.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,10 +147,14 @@ def py_scanstring(s, end, strict=True,
147147

148148

149149
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
150-
_w=WHITESPACE.match, _ws=WHITESPACE_STR):
150+
memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
151151
s, end = s_and_end
152152
pairs = []
153153
pairs_append = pairs.append
154+
# Backwards compatibility
155+
if memo is None:
156+
memo = {}
157+
memo_get = memo.setdefault
154158
# Use a slice to prevent IndexError from being raised, the following
155159
# check will raise a more specific ValueError if the string is empty
156160
nextchar = s[end:end + 1]
@@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
167171
end += 1
168172
while True:
169173
key, end = scanstring(s, end, strict)
174+
key = memo_get(key, key)
170175
# To skip some function call overhead we optimize the fast paths where
171176
# the JSON key separator is ": " or just ":".
172177
if s[end:end + 1] != ':':
@@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
214219
pairs = object_hook(pairs)
215220
return pairs, end
216221

217-
def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
222+
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
218223
s, end = s_and_end
219224
values = []
220225
nextchar = s[end:end + 1]
@@ -314,6 +319,7 @@ def __init__(self, object_hook=None, parse_float=None,
314319
self.parse_object = JSONObject
315320
self.parse_array = JSONArray
316321
self.parse_string = scanstring
322+
self.memo = {}
317323
self.scan_once = make_scanner(self)
318324

319325

Lib/json/scanner.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ def py_make_scanner(context):
2222
parse_int = context.parse_int
2323
parse_constant = context.parse_constant
2424
object_hook = context.object_hook
25+
object_pairs_hook = context.object_pairs_hook
26+
memo = context.memo
2527

2628
def _scan_once(string, idx):
2729
try:
@@ -33,7 +35,7 @@ def _scan_once(string, idx):
3335
return parse_string(string, idx + 1, strict)
3436
elif nextchar == '{':
3537
return parse_object((string, idx + 1), strict,
36-
_scan_once, object_hook, object_pairs_hook)
38+
_scan_once, object_hook, object_pairs_hook, memo)
3739
elif nextchar == '[':
3840
return parse_array((string, idx + 1), _scan_once)
3941
elif nextchar == 'n' and string[idx:idx + 4] == 'null':
@@ -60,6 +62,12 @@ def _scan_once(string, idx):
6062
else:
6163
raise StopIteration
6264

65+
def scan_once(string, idx):
66+
try:
67+
return _scan_once(string, idx)
68+
finally:
69+
memo.clear()
70+
6371
return _scan_once
6472

6573
make_scanner = c_make_scanner or py_make_scanner

Lib/json/tests/test_decode.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,25 @@
11
import decimal
22
from unittest import TestCase
33
from io import StringIO
4+
from contextlib import contextmanager
45

56
import json
7+
import json.decoder
8+
import json.scanner
69
from collections import OrderedDict
710

11+
12+
@contextmanager
13+
def use_python_scanner():
14+
py_scanner = json.scanner.py_make_scanner
15+
old_scanner = json.decoder.make_scanner
16+
json.decoder.make_scanner = py_scanner
17+
try:
18+
yield
19+
finally:
20+
json.decoder.make_scanner = old_scanner
21+
22+
823
class TestDecode(TestCase):
924
def test_decimal(self):
1025
rval = json.loads('1.1', parse_float=decimal.Decimal)
@@ -39,3 +54,16 @@ def test_decoder_optimizations(self):
3954
# exercise the uncommon cases. The array cases are already covered.
4055
rval = json.loads('{ "key" : "value" , "k":"v" }')
4156
self.assertEquals(rval, {"key":"value", "k":"v"})
57+
58+
def check_keys_reuse(self, source, loads):
59+
rval = loads(source)
60+
(a, b), (c, d) = sorted(rval[0]), sorted(rval[1])
61+
self.assertIs(a, c)
62+
self.assertIs(b, d)
63+
64+
def test_keys_reuse(self):
65+
s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
66+
self.check_keys_reuse(s, json.loads)
67+
# NOTE: previously marked "Disabled: the pure Python version of json simply doesn't work"; the pure-Python check below is nevertheless executed
68+
with use_python_scanner():
69+
self.check_keys_reuse(s, json.decoder.JSONDecoder().decode)

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ Extensions
165165
Library
166166
-------
167167

168+
- Issue #7451: Improve decoding performance of JSON objects, and reduce
169+
the memory consumption of said decoded objects when they use the same
170+
strings as keys.
171+
168172
- Issue #1100562: Fix deep-copying of objects derived from the list and
169173
dict types. Patch by Michele Orrù and Björn Lindqvist.
170174

Modules/_json.c

Lines changed: 82 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ typedef struct _PyScannerObject {
3636
PyObject *parse_float;
3737
PyObject *parse_int;
3838
PyObject *parse_constant;
39+
PyObject *memo;
3940
} PyScannerObject;
4041

4142
static PyMemberDef scanner_members[] = {
@@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
305306
return tpl;
306307
}
307308

309+
#define APPEND_OLD_CHUNK \
310+
if (chunk != NULL) { \
311+
if (chunks == NULL) { \
312+
chunks = PyList_New(0); \
313+
if (chunks == NULL) { \
314+
goto bail; \
315+
} \
316+
} \
317+
if (PyList_Append(chunks, chunk)) { \
318+
Py_DECREF(chunk); \
319+
goto bail; \
320+
} \
321+
Py_CLEAR(chunk); \
322+
}
323+
308324
static PyObject *
309325
scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
310326
{
@@ -316,23 +332,21 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
316332
317333
Return value is a new PyUnicode
318334
*/
319-
PyObject *rval;
335+
PyObject *rval = NULL;
320336
Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
321337
Py_ssize_t begin = end - 1;
322338
Py_ssize_t next = begin;
323339
const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
324-
PyObject *chunks = PyList_New(0);
325-
if (chunks == NULL) {
326-
goto bail;
327-
}
340+
PyObject *chunks = NULL;
341+
PyObject *chunk = NULL;
342+
328343
if (end < 0 || len <= end) {
329344
PyErr_SetString(PyExc_ValueError, "end is out of bounds");
330345
goto bail;
331346
}
332347
while (1) {
333348
/* Find the end of the string or the next escape */
334349
Py_UNICODE c = 0;
335-
PyObject *chunk = NULL;
336350
for (next = end; next < len; next++) {
337351
c = buf[next];
338352
if (c == '"' || c == '\\') {
@@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
349363
}
350364
/* Pick up this chunk if it's not zero length */
351365
if (next != end) {
366+
APPEND_OLD_CHUNK
352367
chunk = PyUnicode_FromUnicode(&buf[end], next - end);
353368
if (chunk == NULL) {
354369
goto bail;
355370
}
356-
if (PyList_Append(chunks, chunk)) {
357-
Py_DECREF(chunk);
358-
goto bail;
359-
}
360-
Py_DECREF(chunk);
361371
}
362372
next++;
363373
if (c == '"') {
@@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
459469
}
460470
#endif
461471
}
472+
APPEND_OLD_CHUNK
462473
chunk = PyUnicode_FromUnicode(&c, 1);
463474
if (chunk == NULL) {
464475
goto bail;
465476
}
466-
if (PyList_Append(chunks, chunk)) {
467-
Py_DECREF(chunk);
477+
}
478+
479+
if (chunks == NULL) {
480+
if (chunk != NULL)
481+
rval = chunk;
482+
else
483+
rval = PyUnicode_FromStringAndSize("", 0);
484+
}
485+
else {
486+
APPEND_OLD_CHUNK
487+
rval = join_list_unicode(chunks);
488+
if (rval == NULL) {
468489
goto bail;
469490
}
470-
Py_DECREF(chunk);
491+
Py_CLEAR(chunks);
471492
}
472493

473-
rval = join_list_unicode(chunks);
474-
if (rval == NULL) {
475-
goto bail;
476-
}
477-
Py_DECREF(chunks);
478494
*next_end_ptr = end;
479495
return rval;
480496
bail:
481497
*next_end_ptr = -1;
482498
Py_XDECREF(chunks);
499+
Py_XDECREF(chunk);
483500
return NULL;
484501
}
485502

@@ -578,6 +595,7 @@ scanner_clear(PyObject *self)
578595
Py_CLEAR(s->parse_float);
579596
Py_CLEAR(s->parse_int);
580597
Py_CLEAR(s->parse_constant);
598+
Py_CLEAR(s->memo);
581599
return 0;
582600
}
583601

@@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
593611
Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
594612
Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
595613
PyObject *val = NULL;
596-
PyObject *rval = PyList_New(0);
614+
PyObject *rval = NULL;
597615
PyObject *key = NULL;
598616
int strict = PyObject_IsTrue(s->strict);
617+
int has_pairs_hook = (s->object_pairs_hook != Py_None);
599618
Py_ssize_t next_idx;
619+
620+
if (has_pairs_hook)
621+
rval = PyList_New(0);
622+
else
623+
rval = PyDict_New();
600624
if (rval == NULL)
601625
return NULL;
602626

@@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
606630
/* only loop if the object is non-empty */
607631
if (idx <= end_idx && str[idx] != '}') {
608632
while (idx <= end_idx) {
633+
PyObject *memokey;
634+
609635
/* read key */
610636
if (str[idx] != '"') {
611637
raise_errmsg("Expecting property name", pystr, idx);
@@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
614640
key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
615641
if (key == NULL)
616642
goto bail;
643+
memokey = PyDict_GetItem(s->memo, key);
644+
if (memokey != NULL) {
645+
Py_INCREF(memokey);
646+
Py_DECREF(key);
647+
key = memokey;
648+
}
649+
else {
650+
if (PyDict_SetItem(s->memo, key, key) < 0)
651+
goto bail;
652+
}
617653
idx = next_idx;
618654

619655
/* skip whitespace between key and : delimiter, read :, skip whitespace */
@@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
630666
if (val == NULL)
631667
goto bail;
632668

633-
{
634-
PyObject *tuple = PyTuple_Pack(2, key, val);
635-
if (tuple == NULL)
669+
if (has_pairs_hook) {
670+
PyObject *item = PyTuple_Pack(2, key, val);
671+
if (item == NULL)
636672
goto bail;
637-
if (PyList_Append(rval, tuple) == -1) {
638-
Py_DECREF(tuple);
673+
Py_CLEAR(key);
674+
Py_CLEAR(val);
675+
if (PyList_Append(rval, item) == -1) {
676+
Py_DECREF(item);
639677
goto bail;
640678
}
641-
Py_DECREF(tuple);
679+
Py_DECREF(item);
680+
}
681+
else {
682+
if (PyDict_SetItem(rval, key, val) < 0)
683+
goto bail;
684+
Py_CLEAR(key);
685+
Py_CLEAR(val);
642686
}
643-
644-
Py_CLEAR(key);
645-
Py_CLEAR(val);
646687
idx = next_idx;
647688

648689
/* skip whitespace before } or , */
@@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
672713

673714
*next_idx_ptr = idx + 1;
674715

675-
if (s->object_pairs_hook != Py_None) {
716+
if (has_pairs_hook) {
676717
val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
677-
if (val == NULL)
678-
goto bail;
679718
Py_DECREF(rval);
680719
return val;
681720
}
682721

683-
val = PyDict_New();
684-
if (val == NULL)
685-
goto bail;
686-
if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
687-
goto bail;
688-
Py_DECREF(rval);
689-
rval = val;
690-
691722
/* if object_hook is not None: rval = object_hook(rval) */
692723
if (s->object_hook != Py_None) {
693724
val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
694-
if (val == NULL)
695-
goto bail;
696725
Py_DECREF(rval);
697-
rval = val;
698-
val = NULL;
726+
return val;
699727
}
700728
return rval;
701729
bail:
702730
Py_XDECREF(key);
703731
Py_XDECREF(val);
704-
Py_DECREF(rval);
732+
Py_XDECREF(rval);
705733
return NULL;
706734
}
707735

@@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds)
9881016
Py_TYPE(pystr)->tp_name);
9891017
return NULL;
9901018
}
1019+
PyDict_Clear(s->memo);
1020+
if (rval == NULL)
1021+
return NULL;
9911022
return _build_rval_index_tuple(rval, next_idx);
9921023
}
9931024

@@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds)
10211052
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
10221053
return -1;
10231054

1055+
if (s->memo == NULL) {
1056+
s->memo = PyDict_New();
1057+
if (s->memo == NULL)
1058+
goto bail;
1059+
}
1060+
10241061
/* All of these will fail "gracefully" so we don't need to verify them */
10251062
s->strict = PyObject_GetAttrString(ctx, "strict");
10261063
if (s->strict == NULL)

0 commit comments

Comments
 (0)