Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ available in the repository.
2. [option](https://github.com/ijl/orjson?tab=readme-ov-file#option)
3. [Fragment](https://github.com/ijl/orjson?tab=readme-ov-file#fragment)
5. [Deserialize](https://github.com/ijl/orjson?tab=readme-ov-file#deserialize)
1. [loads_next](https://github.com/ijl/orjson?tab=readme-ov-file#loads_next)
2. [Types](https://github.com/ijl/orjson?tab=readme-ov-file#types)
1. [dataclass](https://github.com/ijl/orjson?tab=readme-ov-file#dataclass)
2. [datetime](https://github.com/ijl/orjson?tab=readme-ov-file#datetime)
Expand Down Expand Up @@ -627,6 +628,90 @@ to parse the document.
`JSONDecodeError` is a subclass of `json.JSONDecodeError` and `ValueError`.
This is for compatibility with the standard library.

### loads_next

```python
def loads_next(__obj: Union[bytes, bytearray, memoryview]) -> Tuple[Any, int]: ...
```

`loads_next()` deserializes the next JSON document from a buffer and returns
a tuple of `(parsed_object, bytes_consumed)`. This is useful for parsing
multiple JSON documents from a single buffer, such as concatenated JSON
objects or newline-delimited JSON (NDJSON).

Unlike `loads()`, which requires the input to contain exactly one JSON
document with only whitespace after it, `loads_next()` stops parsing after
the first complete JSON document and reports how many bytes were consumed.
This allows the caller to continue parsing additional documents from the
same buffer.

Only binary input (`bytes`, `bytearray`, `memoryview`) is accepted.
`str` input is not supported and will raise `TypeError`.

The input must be valid UTF-8.

```python
>>> import orjson
>>> data = b'{"a":1}{"b":2}{"c":3}'
>>> obj1, n1 = orjson.loads_next(data)
>>> obj1
{'a': 1}
>>> n1
7
>>> obj2, n2 = orjson.loads_next(data[n1:])
>>> obj2
{'b': 2}
>>> obj3, n3 = orjson.loads_next(data[n1 + n2:])
>>> obj3
{'c': 3}
```

This is particularly useful for processing NDJSON (newline-delimited JSON)
files or streams:

```python
>>> import orjson
>>> ndjson_data = b'{"id":1,"value":"a"}\n{"id":2,"value":"b"}\n{"id":3,"value":"c"}\n'
>>> offset = 0
>>> results = []
>>> while offset < len(ndjson_data):
...     obj, consumed = orjson.loads_next(ndjson_data[offset:])
...     results.append(obj)
...     offset += consumed
>>> results
[{'id': 1, 'value': 'a'}, {'id': 2, 'value': 'b'}, {'id': 3, 'value': 'c'}]
```

Whitespace (spaces, tabs, newlines, carriage returns) before and after the
JSON document is consumed and included in the byte count:

```python
>>> import orjson
>>> data = b' {"key": "value"} \n{"next": true}'
>>> obj, consumed = orjson.loads_next(data)
>>> obj
{'key': 'value'}
>>> consumed # Includes leading and trailing whitespace
21
```

All of the same error conditions from `loads()` apply to `loads_next()`,
with the addition of a `TypeError` if given a `str` instead of binary input.

Note that with a large input, the library may attempt to allocate far more
memory than is actually required to parse the next document, and the call may
fail as a result. User code can mitigate this by passing a smaller slice of the
input, and retrying with a larger slice if the smaller one did not contain a
complete document.

**When to use:** Use `loads()` when you have exactly one JSON document.
Use `loads_next()` when you need to parse multiple JSON documents from a
single buffer or stream, such as NDJSON, concatenated JSON objects, or
streaming JSON APIs.

## Types

### dataclass
Expand Down
3 changes: 0 additions & 3 deletions include/yyjson/yyjson.c
Original file line number Diff line number Diff line change
Expand Up @@ -5689,7 +5689,6 @@ static_noinline yyjson_doc *read_root_single(u8 *hdr,
} else {
while (char_is_space(*cur)) cur++;
}
if (unlikely(cur < end)) goto fail_garbage;
}

doc = (yyjson_doc *)val_hdr;
Expand Down Expand Up @@ -6053,7 +6052,6 @@ static_inline yyjson_doc *read_root_minify(u8 *hdr,
} else {
while (char_is_space(*cur)) cur++;
}
if (unlikely(cur < end)) goto fail_garbage;
}

doc = (yyjson_doc *)val_hdr;
Expand Down Expand Up @@ -6454,7 +6452,6 @@ static_inline yyjson_doc *read_root_pretty(u8 *hdr,
} else {
while (char_is_space(*cur)) cur++;
}
if (unlikely(cur < end)) goto fail_garbage;
}

doc = (yyjson_doc *)val_hdr;
Expand Down
1 change: 1 addition & 0 deletions pysrc/orjson/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"JSONDecodeError",
"JSONEncodeError",
"loads",
"loads_next",
"OPT_APPEND_NEWLINE",
"OPT_INDENT_2",
"OPT_NAIVE_UTC",
Expand Down
1 change: 1 addition & 0 deletions pysrc/orjson/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ def dumps(
option: int | None = ...,
) -> bytes: ...
def loads(__obj: bytes | bytearray | memoryview | str) -> Any: ...
def loads_next(__obj: bytes | bytearray | memoryview) -> tuple[Any, int]: ...

class JSONDecodeError(json.JSONDecodeError): ...
class JSONEncodeError(TypeError): ...
Expand Down
2 changes: 2 additions & 0 deletions script/vendor-yyjson
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ sed -i 's/(pre && !false)/(false)/g' include/yyjson/yyjson.c

git apply include/yyjson-recursion-limit.patch
git apply include/yyjson-reduce-unused.patch

sed -i '/ if (unlikely(cur < end)) goto fail_garbage;/d' include/yyjson/yyjson.c
28 changes: 24 additions & 4 deletions src/deserialize/backend/yyjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ use super::ffi::{
YYJSON_READ_SUCCESS, yyjson_alc, yyjson_alc_pool_init, yyjson_doc, yyjson_read_err,
yyjson_read_opts, yyjson_val,
};
use crate::deserialize::DeserializeError;
use crate::deserialize::pyobject::{
get_unicode_key, parse_f64, parse_false, parse_i64, parse_none, parse_true, parse_u64,
};
use crate::deserialize::{DeserializeError, DeserializeResult};
use crate::str::PyStr;
use crate::util::usize_to_isize;
use core::ffi::c_char;
Expand Down Expand Up @@ -57,6 +57,10 @@ fn unsafe_yyjson_is_ctn(val: *mut yyjson_val) -> bool {
unsafe { (*val).tag as u8 & 0b00000110 == 0b00000110 }
}

fn unsafe_yyjson_doc_get_read_size(doc: *mut yyjson_doc) -> usize {
unsafe { (*doc).dat_read }
}

#[allow(clippy::cast_ptr_alignment)]
fn unsafe_yyjson_get_next_container(val: *mut yyjson_val) -> *mut yyjson_val {
unsafe { (val.cast::<u8>().add((*val).uni.ofs)).cast::<yyjson_val>() }
Expand All @@ -68,8 +72,9 @@ fn unsafe_yyjson_get_next_non_container(val: *mut yyjson_val) -> *mut yyjson_val
}

pub(crate) fn deserialize(
data: &'static str,
) -> Result<NonNull<crate::ffi::PyObject>, DeserializeError<'static>> {
data: &'static [u8],
must_read_all: bool,
) -> Result<DeserializeResult, DeserializeError<'static>> {
assume!(!data.is_empty());
let buffer_capacity = buffer_capacity_to_allocate(data.len());
let buffer_ptr = ffi!(PyMem_Malloc(buffer_capacity));
Expand Down Expand Up @@ -109,6 +114,18 @@ pub(crate) fn deserialize(
let msg: Cow<str> = unsafe { core::ffi::CStr::from_ptr(err.msg).to_string_lossy() };
return Err(DeserializeError::from_yyjson(msg, err.pos as i64, data));
}

let bytes_read = unsafe { unsafe_yyjson_doc_get_read_size(doc) };

if must_read_all && bytes_read != data.len() {
ffi!(PyMem_Free(buffer_ptr));
return Err(DeserializeError::from_yyjson(
Cow::Borrowed("Did not consume all input data"),
bytes_read as i64,
data,
));
}

let val = yyjson_doc_get_root(doc);
let pyval = {
if !unsafe_yyjson_is_ctn(val) {
Expand Down Expand Up @@ -140,7 +157,10 @@ pub(crate) fn deserialize(
}
};
ffi!(PyMem_Free(buffer_ptr));
Ok(pyval)
Ok(DeserializeResult {
obj: pyval,
bytes_read,
})
}

enum ElementType {
Expand Down
30 changes: 22 additions & 8 deletions src/deserialize/deserializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,39 @@ use crate::deserialize::utf8::read_input_to_buf;
use crate::typeref::EMPTY_UNICODE;
use core::ptr::NonNull;

pub(crate) struct DeserializeResult {
pub(crate) obj: NonNull<crate::ffi::PyObject>,
pub(crate) bytes_read: usize,
}

pub(crate) fn deserialize(
ptr: *mut crate::ffi::PyObject,
) -> Result<NonNull<crate::ffi::PyObject>, DeserializeError<'static>> {
must_read_all: bool,
) -> Result<DeserializeResult, DeserializeError<'static>> {
debug_assert!(ffi!(Py_REFCNT(ptr)) >= 1);
let buffer = read_input_to_buf(ptr)?;
debug_assert!(!buffer.is_empty());

if buffer.len() == 2 {
cold_path!();
if buffer == b"[]" {
return Ok(nonnull!(ffi!(PyList_New(0))));
return Ok(DeserializeResult {
obj: nonnull!(ffi!(PyList_New(0))),
bytes_read: 2,
});
} else if buffer == b"{}" {
return Ok(nonnull!(ffi!(PyDict_New())));
return Ok(DeserializeResult {
obj: nonnull!(ffi!(PyDict_New())),
bytes_read: 2,
});
} else if buffer == b"\"\"" {
unsafe { return Ok(nonnull!(use_immortal!(EMPTY_UNICODE))) }
unsafe {
return Ok(DeserializeResult {
obj: nonnull!(use_immortal!(EMPTY_UNICODE)),
bytes_read: 2,
});
}
}
}

let buffer_str = unsafe { core::str::from_utf8_unchecked(buffer) };

crate::deserialize::backend::deserialize(buffer_str)
crate::deserialize::backend::deserialize(buffer, must_read_all)
}
3 changes: 2 additions & 1 deletion src/deserialize/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ impl<'a> DeserializeError<'a> {
}

#[cold]
pub fn from_yyjson(message: Cow<'a, str>, pos: i64, data: &'a str) -> Self {
pub fn from_yyjson(message: Cow<'a, str>, pos: i64, data: &'a [u8]) -> Self {
let data = unsafe { core::str::from_utf8_unchecked(data) };
DeserializeError {
message: message,
data: Some(data),
Expand Down
2 changes: 1 addition & 1 deletion src/deserialize/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ mod utf8;

#[cfg(not(Py_GIL_DISABLED))]
pub(crate) use cache::{KEY_MAP, KeyMap};
pub(crate) use deserializer::deserialize;
pub(crate) use deserializer::{DeserializeResult, deserialize};
pub(crate) use error::DeserializeError;
61 changes: 55 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,13 @@ mod typeref;

use crate::ffi::{
METH_KEYWORDS, METH_O, Py_DECREF, Py_SIZE, Py_ssize_t, PyCFunction_NewEx, PyErr_SetObject,
PyLong_AsLong, PyLong_FromLongLong, PyMethodDef, PyMethodDefPointer, PyModuleDef,
PyModuleDef_HEAD_INIT, PyModuleDef_Slot, PyObject, PyTuple_New, PyUnicode_FromStringAndSize,
PyUnicode_InternFromString, PyVectorcall_NARGS,
PyExc_TypeError, PyLong_AsLong, PyLong_FromLongLong, PyMethodDef, PyMethodDefPointer,
PyModuleDef, PyModuleDef_HEAD_INIT, PyModuleDef_Slot, PyObject, PyTuple_New,
PyUnicode_FromStringAndSize, PyUnicode_InternFromString, PyVectorcall_NARGS,
};
use core::ffi::{c_char, c_int, c_void};

use crate::typeref::STR_TYPE;
use crate::util::{isize_to_usize, usize_to_isize};

#[allow(unused_imports)]
Expand Down Expand Up @@ -185,6 +186,25 @@ pub(crate) unsafe extern "C" fn orjson_init_exec(mptr: *mut PyObject) -> c_int {
add!(mptr, c"loads", func);
}

{
let loads_next_doc = c"loads_next(obj, /)\n--\n\nDeserialize the next JSON document from an UTF-8 byte buffer, returning the document and number of bytes read.";

let wrapped_loads_next = PyMethodDef {
ml_name: c"loads_next".as_ptr(),
ml_meth: PyMethodDefPointer {
PyCFunction: loads_next,
},
ml_flags: METH_O,
ml_doc: loads_next_doc.as_ptr(),
};
let func = PyCFunction_NewEx(
Box::into_raw(Box::new(wrapped_loads_next)),
null_mut(),
PyUnicode_InternFromString(c"orjson".as_ptr()),
);
add!(mptr, c"loads_next", func);
}

add!(mptr, c"Fragment", typeref::FRAGMENT_TYPE.cast::<PyObject>());

opt!(mptr, c"OPT_APPEND_NEWLINE", opt::APPEND_NEWLINE);
Expand Down Expand Up @@ -268,7 +288,7 @@ pub(crate) unsafe extern "C" fn PyInit_orjson() -> *mut PyModuleDef {
#[cold]
#[inline(never)]
#[cfg_attr(feature = "optimize", optimize(size))]
fn raise_loads_exception(err: deserialize::DeserializeError) -> *mut PyObject {
pub(crate) fn raise_loads_exception(err: deserialize::DeserializeError) -> *mut PyObject {
unsafe {
let err_pos = err.pos();
let msg = err.message;
Expand Down Expand Up @@ -373,8 +393,37 @@ fn raise_dumps_exception_dynamic(err: &str) -> *mut PyObject {

#[unsafe(no_mangle)]
pub(crate) unsafe extern "C" fn loads(_self: *mut PyObject, obj: *mut PyObject) -> *mut PyObject {
match crate::deserialize::deserialize(obj) {
Ok(val) => val.as_ptr(),
match deserialize::deserialize(obj, true) {
Ok(deserialize::DeserializeResult { obj, .. }) => obj.as_ptr(),
Err(err) => raise_loads_exception(err),
}
}

#[unsafe(no_mangle)]
pub(crate) unsafe extern "C" fn loads_next(
_self: *mut PyObject,
obj: *mut PyObject,
) -> *mut PyObject {
if is_type!(ob_type!(obj), STR_TYPE) {
cold_path!();
unsafe {
let msg = "loads_next requires binary input, not str";
let err_msg =
PyUnicode_FromStringAndSize(msg.as_ptr().cast::<c_char>(), msg.len() as isize);
PyErr_SetObject(PyExc_TypeError, err_msg);
Py_DECREF(err_msg);
return null_mut();
};
}

match deserialize::deserialize(obj, false) {
Ok(deserialize::DeserializeResult { obj, bytes_read }) => unsafe {
let result_tuple = PyTuple_New(2);
ffi::PyTuple_SET_ITEM(result_tuple, 0, obj.as_ptr());
let bytes_read_obj = PyLong_FromLongLong(bytes_read as i64);
ffi::PyTuple_SET_ITEM(result_tuple, 1, bytes_read_obj);
result_tuple
},
Err(err) => raise_loads_exception(err),
}
}
Expand Down
Loading