From 2e915fd82a9b4e9fd6d64e79da8f1b6b4ec12805 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 6 May 2024 11:08:43 +0300 Subject: [PATCH 1/3] gh-95382: Use cache for indentations in the JSON encoder --- Modules/_json.c | 174 +++++++++++++++++++++++++++++------------------- 1 file changed, 106 insertions(+), 68 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index e33ef1f5eea92f..664ea27ccbca86 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -85,11 +85,11 @@ encoder_dealloc(PyObject *self); static int encoder_clear(PyEncoderObject *self); static int -encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *seq, PyObject *newline_indent); +encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *seq, Py_ssize_t indent_level, PyObject *indent_cache); static int -encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *obj, PyObject *newline_indent); +encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *obj, Py_ssize_t indent_level, PyObject *indent_cache); static int -encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *dct, PyObject *newline_indent); +encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *dct, Py_ssize_t indent_level, PyObject *indent_cache); static PyObject * _encoded_const(PyObject *obj); static void @@ -1252,14 +1252,81 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) } static PyObject * -_create_newline_indent(PyObject *indent, Py_ssize_t indent_level) +_create_indent_cache(PyEncoderObject *s, Py_ssize_t indent_level) { PyObject *newline_indent = PyUnicode_FromOrdinal('\n'); if (newline_indent != NULL && indent_level) { PyUnicode_AppendAndDel(&newline_indent, - PySequence_Repeat(indent, indent_level)); + PySequence_Repeat(s->indent, indent_level)); } - return newline_indent; + if (newline_indent == NULL) { + return NULL; + } + PyObject *indent_cache = PyList_New(2); + if (indent_cache == NULL) { + Py_XDECREF(newline_indent); + return NULL; + } + PyList_SET_ITEM(indent_cache, 0, newline_indent); + PyList_SET_ITEM(indent_cache, 1, Py_NewRef(Py_None)); // not used + return indent_cache; +} + +static int +update_newline_indent(PyEncoderObject *s, + Py_ssize_t indent_level, PyObject *indent_cache) +{ + assert(indent_level * 2 == PyList_GET_SIZE(indent_cache)); + assert(indent_level > 0); + PyObject *newline_indent = PyList_GET_ITEM(indent_cache, (indent_level - 1)*2); + newline_indent = PyUnicode_Concat(newline_indent, s->indent); + if (newline_indent == NULL) { + return -1; + } + if (PyList_Append(indent_cache, newline_indent) < 0) { + Py_DECREF(newline_indent); + return -1; + } + PyObject *separator_indent = PyUnicode_Concat(s->item_separator, newline_indent); + Py_DECREF(newline_indent); + if (PyList_Append(indent_cache, separator_indent) < 0) { + Py_DECREF(separator_indent); + return -1; + } + Py_DECREF(separator_indent); + return 0; +} + +static PyObject * +do_indent(PyEncoderObject *s, _PyUnicodeWriter *writer, + Py_ssize_t indent_level, PyObject *indent_cache) +{ + assert(indent_level > 0); + assert(s->indent != Py_None); + PyObject *newline_indent; + if (indent_level * 2 == PyList_GET_SIZE(indent_cache)) { + if (update_newline_indent(s, indent_level, indent_cache) < 0) { + return NULL; + } + } + assert(indent_level * 2 <= PyList_GET_SIZE(indent_cache) - 2); + + newline_indent = PyList_GET_ITEM(indent_cache, indent_level * 2); + if (_PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) { + return NULL; + } + return PyList_GET_ITEM(indent_cache, indent_level * 2 + 1); +} + +static int +do_dedent(PyEncoderObject *s, _PyUnicodeWriter *writer, + Py_ssize_t indent_level, PyObject *indent_cache) +{ + assert(indent_level >= 0); + assert(indent_level * 2 <= PyList_GET_SIZE(indent_cache) - 4); + assert(s->indent != Py_None); + PyObject *newline_indent = PyList_GET_ITEM(indent_cache, indent_level * 2); + return _PyUnicodeWriter_WriteStr(writer, newline_indent); } static PyObject * @@ -1278,20 +1345,20 @@ encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds) _PyUnicodeWriter_Init(&writer); writer.overallocate = 1; - PyObject *newline_indent = NULL; + PyObject *indent_cache = NULL; if (self->indent != Py_None) { - newline_indent = _create_newline_indent(self->indent, indent_level); - if (newline_indent == NULL) { + indent_cache = _create_indent_cache(self, indent_level); + if (indent_cache == NULL) { _PyUnicodeWriter_Dealloc(&writer); return NULL; } } - if (encoder_listencode_obj(self, &writer, obj, newline_indent)) { + if (encoder_listencode_obj(self, &writer, obj, 0, indent_cache)) { _PyUnicodeWriter_Dealloc(&writer); - Py_XDECREF(newline_indent); + Py_XDECREF(indent_cache); return NULL; } - Py_XDECREF(newline_indent); + Py_XDECREF(indent_cache); result = PyTuple_New(1); if (result == NULL || @@ -1379,7 +1446,8 @@ _steal_accumulate(_PyUnicodeWriter *writer, PyObject *stolen) static int encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, - PyObject *obj, PyObject *newline_indent) + PyObject *obj, + Py_ssize_t indent_level, PyObject *indent_cache) { /* Encode Python object obj to a JSON term */ PyObject *newobj; @@ -1415,14 +1483,14 @@ encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, else if (PyList_Check(obj) || PyTuple_Check(obj)) { if (_Py_EnterRecursiveCall(" while encoding a JSON object")) return -1; - rv = encoder_listencode_list(s, writer, obj, newline_indent); + rv = encoder_listencode_list(s, writer, obj, indent_level, indent_cache); _Py_LeaveRecursiveCall(); return rv; } else if (PyDict_Check(obj)) { if (_Py_EnterRecursiveCall(" while encoding a JSON object")) return -1; - rv = encoder_listencode_dict(s, writer, obj, newline_indent); + rv = encoder_listencode_dict(s, writer, obj, indent_level, indent_cache); _Py_LeaveRecursiveCall(); return rv; } @@ -1456,7 +1524,7 @@ encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, Py_XDECREF(ident); return -1; } - rv = encoder_listencode_obj(s, writer, newobj, newline_indent); + rv = encoder_listencode_obj(s, writer, newobj, indent_level, indent_cache); _Py_LeaveRecursiveCall(); Py_DECREF(newobj); @@ -1478,7 +1546,7 @@ encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, static int encoder_encode_key_value(PyEncoderObject *s, _PyUnicodeWriter *writer, bool *first, PyObject *key, PyObject *value, - PyObject *newline_indent, + Py_ssize_t indent_level, PyObject *indent_cache, PyObject *item_separator) { PyObject *keystr = NULL; @@ -1534,7 +1602,7 @@ encoder_encode_key_value(PyEncoderObject *s, _PyUnicodeWriter *writer, bool *fir if (_PyUnicodeWriter_WriteStr(writer, s->key_separator) < 0) { return -1; } - if (encoder_listencode_obj(s, writer, value, newline_indent) < 0) { + if (encoder_listencode_obj(s, writer, value, indent_level, indent_cache) < 0) { return -1; } return 0; @@ -1542,15 +1610,14 @@ encoder_encode_key_value(PyEncoderObject *s, _PyUnicodeWriter *writer, bool *fir static int encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, - PyObject *dct, PyObject *newline_indent) + PyObject *dct, + Py_ssize_t indent_level, PyObject *indent_cache) { /* Encode Python dict dct a JSON term */ PyObject *ident = NULL; PyObject *items = NULL; PyObject *key, *value; bool first = true; - PyObject *new_newline_indent = NULL; - PyObject *separator_indent = NULL; if (PyDict_GET_SIZE(dct) == 0) /* Fast path */ return _PyUnicodeWriter_WriteASCIIString(writer, "{}", 2); @@ -1574,19 +1641,11 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, if (_PyUnicodeWriter_WriteChar(writer, '{')) goto bail; - PyObject *current_item_separator = s->item_separator; // borrowed reference + PyObject *separator = s->item_separator; // borrowed reference if (s->indent != Py_None) { - new_newline_indent = PyUnicode_Concat(newline_indent, s->indent); - if (new_newline_indent == NULL) { - goto bail; - } - separator_indent = PyUnicode_Concat(current_item_separator, new_newline_indent); - if (separator_indent == NULL) { - goto bail; - } - // update item separator with a borrowed reference - current_item_separator = separator_indent; - if (_PyUnicodeWriter_WriteStr(writer, new_newline_indent) < 0) { + indent_level++; + separator = do_indent(s, writer, indent_level, indent_cache); + if (separator == NULL) { goto bail; } } @@ -1607,8 +1666,8 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, key = PyTuple_GET_ITEM(item, 0); value = PyTuple_GET_ITEM(item, 1); if (encoder_encode_key_value(s, writer, &first, key, value, - new_newline_indent, - current_item_separator) < 0) + indent_level, indent_cache, + separator) < 0) goto bail; } Py_CLEAR(items); @@ -1617,8 +1676,8 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, Py_ssize_t pos = 0; while (PyDict_Next(dct, &pos, &key, &value)) { if (encoder_encode_key_value(s, writer, &first, key, value, - new_newline_indent, - current_item_separator) < 0) + indent_level, indent_cache, + separator) < 0) goto bail; } } @@ -1629,12 +1688,8 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, Py_CLEAR(ident); } if (s->indent != Py_None) { - Py_CLEAR(new_newline_indent); - Py_CLEAR(separator_indent); - - if (_PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) { - goto bail; - } + indent_level--; + do_dedent(s, writer, indent_level, indent_cache); } if (_PyUnicodeWriter_WriteChar(writer, '}')) @@ -1644,20 +1699,17 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, bail: Py_XDECREF(items); Py_XDECREF(ident); - Py_XDECREF(separator_indent); - Py_XDECREF(new_newline_indent); return -1; } static int encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, - PyObject *seq, PyObject *newline_indent) + PyObject *seq, + Py_ssize_t indent_level, PyObject *indent_cache) { PyObject *ident = NULL; PyObject *s_fast = NULL; Py_ssize_t i; - PyObject *new_newline_indent = NULL; - PyObject *separator_indent = NULL; ident = NULL; s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence"); @@ -1689,20 +1741,11 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *separator = s->item_separator; // borrowed reference if (s->indent != Py_None) { - new_newline_indent = PyUnicode_Concat(newline_indent, s->indent); - if (new_newline_indent == NULL) { - goto bail; - } - - if (_PyUnicodeWriter_WriteStr(writer, new_newline_indent) < 0) { + indent_level++; + separator = do_indent(s, writer, indent_level, indent_cache); + if (separator == NULL) { goto bail; } - - separator_indent = PyUnicode_Concat(separator, new_newline_indent); - if (separator_indent == NULL) { - goto bail; - } - separator = separator_indent; // assign separator with borrowed reference } for (i = 0; i < PySequence_Fast_GET_SIZE(s_fast); i++) { PyObject *obj = PySequence_Fast_GET_ITEM(s_fast, i); @@ -1710,7 +1753,7 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, if (_PyUnicodeWriter_WriteStr(writer, separator) < 0) goto bail; } - if (encoder_listencode_obj(s, writer, obj, new_newline_indent)) + if (encoder_listencode_obj(s, writer, obj, indent_level, indent_cache)) goto bail; } if (ident != NULL) { @@ -1720,11 +1763,8 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, } if (s->indent != Py_None) { - Py_CLEAR(new_newline_indent); - Py_CLEAR(separator_indent); - if (_PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) { - goto bail; - } + indent_level--; + do_dedent(s, writer, indent_level, indent_cache); } if (_PyUnicodeWriter_WriteChar(writer, ']')) @@ -1735,8 +1775,6 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, bail: Py_XDECREF(ident); Py_DECREF(s_fast); - Py_XDECREF(separator_indent); - Py_XDECREF(new_newline_indent); return -1; } From 2a7b274544151c7cb98208c352d3349e541fb814 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 6 May 2024 18:47:17 +0300 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Pieter Eendebak --- Modules/_json.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 664ea27ccbca86..f4e2bb8bdf6902 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -1689,7 +1689,9 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, } if (s->indent != Py_None) { indent_level--; - do_dedent(s, writer, indent_level, indent_cache); + if (do_dedent(s, writer, indent_level, indent_cache) < 0) { + goto bail; + } } if (_PyUnicodeWriter_WriteChar(writer, '}')) @@ -1764,7 +1766,9 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, if (s->indent != Py_None) { indent_level--; - do_dedent(s, writer, indent_level, indent_cache); + if (do_dedent(s, writer, indent_level, indent_cache) < 0) { + goto bail; + } } if (_PyUnicodeWriter_WriteChar(writer, ']')) From 86586a5e14e3ad6ee7aabbc529e18c0a722b5336 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 7 May 2024 10:46:33 +0300 Subject: [PATCH 3/3] Refactoring. --- Modules/_json.c | 84 ++++++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index f4e2bb8bdf6902..a029a464670335 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -1251,8 +1251,19 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return (PyObject *)s; } + +/* indent_cache is a list that contains intermixed values at even and odd + * positions: + * + * 2*k : '\n' + indent * (k + initial_indent_level) + * strings written after opening and before closing brackets + * 2*k-1 : item_separator + '\n' + indent * (k + initial_indent_level) + * strings written between items + * + * Its size is always an odd number. + */ static PyObject * -_create_indent_cache(PyEncoderObject *s, Py_ssize_t indent_level) +create_indent_cache(PyEncoderObject *s, Py_ssize_t indent_level) { PyObject *newline_indent = PyUnicode_FromOrdinal('\n'); if (newline_indent != NULL && indent_level) { @@ -1262,73 +1273,70 @@ _create_indent_cache(PyEncoderObject *s, Py_ssize_t indent_level) if (newline_indent == NULL) { return NULL; } - PyObject *indent_cache = PyList_New(2); + PyObject *indent_cache = PyList_New(1); if (indent_cache == NULL) { - Py_XDECREF(newline_indent); + Py_DECREF(newline_indent); return NULL; } PyList_SET_ITEM(indent_cache, 0, newline_indent); - PyList_SET_ITEM(indent_cache, 1, Py_NewRef(Py_None)); // not used return indent_cache; } +/* Extend indent_cache by adding values for the next level. + * It should have values for the indent_level-1 level before the call. + */ static int -update_newline_indent(PyEncoderObject *s, - Py_ssize_t indent_level, PyObject *indent_cache) +update_indent_cache(PyEncoderObject *s, + Py_ssize_t indent_level, PyObject *indent_cache) { - assert(indent_level * 2 == PyList_GET_SIZE(indent_cache)); + assert(indent_level * 2 == PyList_GET_SIZE(indent_cache) + 1); assert(indent_level > 0); PyObject *newline_indent = PyList_GET_ITEM(indent_cache, (indent_level - 1)*2); newline_indent = PyUnicode_Concat(newline_indent, s->indent); if (newline_indent == NULL) { return -1; } - if (PyList_Append(indent_cache, newline_indent) < 0) { + PyObject *separator_indent = PyUnicode_Concat(s->item_separator, newline_indent); + if (separator_indent == NULL) { Py_DECREF(newline_indent); return -1; } - PyObject *separator_indent = PyUnicode_Concat(s->item_separator, newline_indent); - Py_DECREF(newline_indent); - if (PyList_Append(indent_cache, separator_indent) < 0) { + + if (PyList_Append(indent_cache, separator_indent) < 0 || + PyList_Append(indent_cache, newline_indent) < 0) + { Py_DECREF(separator_indent); + Py_DECREF(newline_indent); return -1; } Py_DECREF(separator_indent); + Py_DECREF(newline_indent); return 0; } static PyObject * -do_indent(PyEncoderObject *s, _PyUnicodeWriter *writer, - Py_ssize_t indent_level, PyObject *indent_cache) +get_item_separator(PyEncoderObject *s, + Py_ssize_t indent_level, PyObject *indent_cache) { assert(indent_level > 0); - assert(s->indent != Py_None); - PyObject *newline_indent; - if (indent_level * 2 == PyList_GET_SIZE(indent_cache)) { - if (update_newline_indent(s, indent_level, indent_cache) < 0) { + if (indent_level * 2 > PyList_GET_SIZE(indent_cache)) { + if (update_indent_cache(s, indent_level, indent_cache) < 0) { return NULL; } } - assert(indent_level * 2 <= PyList_GET_SIZE(indent_cache) - 2); - - newline_indent = PyList_GET_ITEM(indent_cache, indent_level * 2); - if (_PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) { - return NULL; - } - return PyList_GET_ITEM(indent_cache, indent_level * 2 + 1); + assert(indent_level * 2 < PyList_GET_SIZE(indent_cache)); + return PyList_GET_ITEM(indent_cache, indent_level * 2 - 1); } static int -do_dedent(PyEncoderObject *s, _PyUnicodeWriter *writer, - Py_ssize_t indent_level, PyObject *indent_cache) +write_newline_indent(_PyUnicodeWriter *writer, + Py_ssize_t indent_level, PyObject *indent_cache) { - assert(indent_level >= 0); - assert(indent_level * 2 <= PyList_GET_SIZE(indent_cache) - 4); - assert(s->indent != Py_None); PyObject *newline_indent = PyList_GET_ITEM(indent_cache, indent_level * 2); return _PyUnicodeWriter_WriteStr(writer, newline_indent); } + static PyObject * encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds) { @@ -1347,7 +1355,7 @@ encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds) PyObject *indent_cache = NULL; if (self->indent != Py_None) { - indent_cache = _create_indent_cache(self, indent_level); + indent_cache = create_indent_cache(self, indent_level); if (indent_cache == NULL) { _PyUnicodeWriter_Dealloc(&writer); return NULL; @@ -1644,8 +1652,10 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *separator = s->item_separator; // borrowed reference if (s->indent != Py_None) { indent_level++; - separator = do_indent(s, writer, indent_level, indent_cache); - if (separator == NULL) { + separator = get_item_separator(s, indent_level, indent_cache); + if (separator == NULL || + write_newline_indent(writer, indent_level, indent_cache) < 0) + { goto bail; } } @@ -1689,7 +1699,7 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, } if (s->indent != Py_None) { indent_level--; - if (do_dedent(s, writer, indent_level, indent_cache) < 0) { + if (write_newline_indent(writer, indent_level, indent_cache) < 0) { goto bail; } } @@ -1744,8 +1754,10 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *separator = s->item_separator; // borrowed reference if (s->indent != Py_None) { indent_level++; - separator = do_indent(s, writer, indent_level, indent_cache); - if (separator == NULL) { + separator = get_item_separator(s, indent_level, indent_cache); + if (separator == NULL || + write_newline_indent(writer, indent_level, indent_cache) < 0) + { goto bail; } } @@ -1766,7 +1778,7 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, if (s->indent != Py_None) { indent_level--; - if (do_dedent(s, writer, indent_level, indent_cache) < 0) { + if (write_newline_indent(writer, indent_level, indent_cache) < 0) { goto bail; } }