feat!: reading JSON data as the pyarrow native JSON type when available #1521

Draft: wants to merge 2 commits into main (changes shown from all commits).
4 changes: 2 additions & 2 deletions bigframes/bigquery/_operations/json.py
@@ -53,7 +53,7 @@ def json_set(
>>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
>>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
0 {"a":100,"b":"hi"}
Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]
Name: data, dtype: extension<arrow.json>[pyarrow]

Args:
input (bigframes.series.Series):
@@ -253,7 +253,7 @@ def parse_json(
dtype: string
>>> bbq.parse_json(s)
0 {"class":{"students":[{"id":5},{"id":12}]}}
dtype: extension<dbjson<JSONArrowType>>[pyarrow]
dtype: extension<arrow.json>[pyarrow]

Args:
input (bigframes.series.Series):
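For reference, a minimal end-to-end sketch of the dtype change the updated docstrings describe. It assumes a configured BigQuery session and pyarrow 19.0 or later (which exposes the native arrow.json extension type); on older stacks the dtype falls back to db_dtypes.JSONArrowType.

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

# Read a BigQuery JSON column and modify it; with pyarrow >= 19.0 the result
# should surface as the native arrow.json extension dtype.
s = bpd.read_gbq("SELECT JSON '{\"a\": 1}' AS data")["data"]
result = bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
print(result.dtype)  # e.g. extension<arrow.json>[pyarrow] on newer pyarrow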
4 changes: 2 additions & 2 deletions bigframes/core/array_value.py
@@ -108,8 +108,8 @@ def from_table(
raise ValueError("must set at most one of 'offests', 'primary_key'")
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
msg = bfe.format_message(
"JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
"is a preview feature and subject to change."
"JSON column interpretation as a PyArrow JSON extention type is a preview "
"feature and subject to change."
)
warnings.warn(msg, bfe.PreviewWarning)
# define data source only for needed columns, this makes row-hashing cheaper
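For callers who want to silence this preview notice, a small hedged sketch; it assumes the warning class is exposed as bigframes.exceptions.PreviewWarning, as the bfe alias in the diff suggests.

import warnings

import bigframes.exceptions as bfe  # assumed home of PreviewWarning
import bigframes.pandas as bpd

with warnings.catch_warnings():
    # Suppress the JSON preview warning while this behavior is still subject to change.
    warnings.simplefilter("ignore", category=bfe.PreviewWarning)
    df = bpd.read_gbq("SELECT JSON '{\"a\": 1}' AS data")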
3 changes: 1 addition & 2 deletions bigframes/core/compile/ibis_types.py
@@ -24,7 +24,6 @@
dtype as python_type_to_ibis_type,
)
import bigframes_vendored.ibis.expr.types as ibis_types
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import pandas as pd
@@ -75,7 +74,7 @@
IBIS_GEO_TYPE,
gpd.array.GeometryDtype(),
),
(ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
(ibis_dtypes.json, bigframes.dtypes.JSON_DTYPE),
)

BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
21 changes: 15 additions & 6 deletions bigframes/core/utils.py
@@ -224,6 +224,15 @@ def timedelta_to_micros(
raise TypeError(f"Unrecognized input type: {type(timedelta)}")


def _is_timedelta64_dtype(dtype: dtypes.Dtype) -> bool:
try:
return pdtypes.is_timedelta64_dtype(dtype)
except NotImplementedError:
# Workaround the known issue in pandas:
# https://github.com/pandas-dev/pandas/issues/60958
return False


def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
"""
Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored.
@@ -234,11 +243,11 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
updated_columns = []

for col in dataframe.columns:
if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
if _is_timedelta64_dtype(dataframe[col].dtype):
dataframe[col] = dataframe[col].apply(timedelta_to_micros)
updated_columns.append(col)

if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
if _is_timedelta64_dtype(dataframe.index.dtype):
dataframe.index = dataframe.index.map(timedelta_to_micros)
updated_columns.append(dataframe.index.name)

@@ -249,15 +258,15 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
"""
Searches recursively for JSON array type within a PyArrow DataType.
"""
if arrow_type == dtypes.JSON_ARROW_TYPE:
return True
if pa.types.is_list(arrow_type):
return _search_for_nested_json_type(arrow_type.value_type)
if pa.types.is_struct(arrow_type):
for i in range(arrow_type.num_fields):
if _search_for_nested_json_type(arrow_type.field(i).type):
return True
return False
if dtypes.is_json_arrow_type(arrow_type):
return True
return False


@@ -272,7 +281,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:

for col in dataframe.columns:
column_type = dataframe[col].dtype
if column_type == dtypes.JSON_DTYPE:
if dtypes.is_json_type(column_type):
dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
updated_columns.append(col)
elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
@@ -283,7 +292,7 @@
f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
)

if dataframe.index.dtype == dtypes.JSON_DTYPE:
if dtypes.is_json_type(dataframe.index.dtype):
dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
updated_columns.append(dataframe.index.name)
elif isinstance(
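To make the recursion above concrete, a self-contained sketch that mirrors _search_for_nested_json_type with plain pyarrow calls; the is_json predicate stands in for dtypes.is_json_arrow_type, and the example type needs pyarrow 19.0+ for pa.json_.

import pyarrow as pa

def contains_json(arrow_type: pa.DataType, is_json) -> bool:
    # Recurse through list and struct types, applying the JSON predicate at the leaves.
    if pa.types.is_list(arrow_type):
        return contains_json(arrow_type.value_type, is_json)
    if pa.types.is_struct(arrow_type):
        return any(
            contains_json(arrow_type.field(i).type, is_json)
            for i in range(arrow_type.num_fields)
        )
    return is_json(arrow_type)

nested = pa.struct([("payload", pa.list_(pa.json_(pa.string())))])
print(contains_json(nested, lambda t: isinstance(t, pa.JsonType)))  # True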
19 changes: 15 additions & 4 deletions bigframes/dtypes.py
@@ -62,8 +62,9 @@
# No arrow equivalent
GEO_DTYPE = gpd.array.GeometryDtype()
# JSON
# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
JSON_ARROW_TYPE = (
pa.json_(pa.string()) if hasattr(pa, "JsonType") else db_dtypes.JSONArrowType()
)
JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
OBJ_REF_DTYPE = pd.ArrowDtype(
pa.struct(
@@ -169,7 +170,7 @@ class SimpleDtypeInfo:
),
SimpleDtypeInfo(
dtype=JSON_DTYPE,
arrow_dtype=db_dtypes.JSONArrowType(),
arrow_dtype=JSON_ARROW_TYPE,
type_kind=("JSON",),
orderable=False,
clusterable=False,
@@ -330,8 +331,18 @@ def is_struct_like(type_: ExpressionType) -> bool:
)


def is_json_arrow_type(type_: pa.DataType) -> bool:
return (hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType)) or (
not hasattr(pa, "JsonType") and isinstance(type_, db_dtypes.JSONArrowType)
)


def is_json_type(type_: ExpressionType) -> bool:
return isinstance(type_, pd.ArrowDtype) and is_json_arrow_type(type_.pyarrow_dtype)


def is_json_like(type_: ExpressionType) -> bool:
return type_ == JSON_DTYPE or type_ == STRING_DTYPE # Including JSON string
return is_json_type(type_) or type_ == STRING_DTYPE # Including JSON string


def is_json_encoding_type(type_: ExpressionType) -> bool:
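Taken together, the dtypes.py changes amount to a version gate. A standalone sketch of that selection, assuming pyarrow 19.0+ exposes pa.JsonType and pa.json_, with db-dtypes as the fallback:

import db_dtypes  # type: ignore
import pandas as pd
import pyarrow as pa

if hasattr(pa, "JsonType"):
    # Native pyarrow JSON extension type, available from pyarrow 19.0.
    json_arrow_type = pa.json_(pa.string())
else:
    # Legacy extension type provided by the db-dtypes package.
    json_arrow_type = db_dtypes.JSONArrowType()

json_dtype = pd.ArrowDtype(json_arrow_type)
print(json_dtype)  # extension<arrow.json>[pyarrow] or extension<dbjson<JSONArrowType>>[pyarrow]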
7 changes: 6 additions & 1 deletion bigframes/session/__init__.py
@@ -814,7 +814,12 @@ def _read_pandas_inline(
) -> dataframe.DataFrame:
import bigframes.dataframe as dataframe

memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
try:
memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
except NotImplementedError: # TODO: add unit test
# Workaround the known issue in pandas:
# https://github.com/pandas-dev/pandas/issues/60958
raise ValueError("Could not determine the DataFrame's memory usage.")
if memory_usage > MAX_INLINE_DF_BYTES:
raise ValueError(
f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed "
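The try/except above guards a known pandas gap: memory_usage(deep=True) can raise NotImplementedError for some Arrow extension dtypes (pandas-dev/pandas#60958). A hedged sketch of that guard in isolation; the helper name is illustrative, not part of the PR.

import pandas as pd

def inline_size_bytes(df: pd.DataFrame) -> int:
    try:
        return int(df.memory_usage(deep=True).sum())
    except NotImplementedError:
        # Known pandas issue for some Arrow extension dtypes:
        # https://github.com/pandas-dev/pandas/issues/60958
        raise ValueError("Could not determine the DataFrame's memory usage.")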
16 changes: 11 additions & 5 deletions tests/system/small/test_dataframe.py
@@ -4607,22 +4607,28 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub
],
)
def test_df_drop_duplicates_w_json(json_df, keep):
bf_df = json_df.drop_duplicates(keep=keep).to_pandas(allow_large_results=True)
bf_df = json_df.drop_duplicates(keep=keep)
assert dtypes.is_json_type(bf_df.dtypes["json_col"])

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
json_pandas_df = json_df.to_pandas(allow_large_results=True)
# drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible
# with Arrow string extension types. Temporary conversion to standard Pandas
# strings is required.
# allow_large_results=True for b/401630655
json_pandas_df = json_df.to_pandas(allow_large_results=True)
json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
pd.StringDtype(storage="pyarrow")
)

pd_df = json_pandas_df.drop_duplicates(keep=keep)
pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
assert dtypes.is_json_type(pd_df.dtypes["json_col"])

# `check_exact=False` can workaround the known issue in pandas:
# https://github.com/pandas-dev/pandas/issues/60958
pd.testing.assert_frame_equal(
pd_df,
bf_df,
# TODO(b/401630655): JSON is not compatible with allow_large_results=False
bf_df.to_pandas(allow_large_results=True),
check_exact=False,
)


3 changes: 1 addition & 2 deletions tests/system/small/test_series.py
@@ -17,7 +17,6 @@
import re
import tempfile

import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import numpy
from packaging.version import Version
@@ -384,9 +383,9 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):

def test_get_column_w_json(json_df, json_pandas_df):
series = json_df["json_col"]
assert dtypes.is_json_type(series.dtype)
# Until b/401630655 is resolved, json not compatible with allow_large_results=False
series_pandas = series.to_pandas(allow_large_results=True)
assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
assert series_pandas.shape[0] == json_pandas_df.shape[0]


27 changes: 18 additions & 9 deletions tests/system/small/test_session.py
@@ -22,7 +22,6 @@
import warnings

import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq
import db_dtypes # type:ignore
import google
import google.cloud.bigquery as bigquery
import numpy as np
@@ -633,7 +632,7 @@ def test_read_gbq_w_json(session):
# TODO(b/401630655): JSON is not compatible with allow_large_results=False
df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)

assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
assert bigframes.dtypes.is_json_type(df.dtypes["json_col"])

assert df["json_col"][0] == '{"boolean":true}'
assert df["json_col"][1] == '{"int":100}'
@@ -649,19 +648,23 @@

def test_read_gbq_w_json_and_compare_w_pandas_json(session):
df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
assert bigframes.dtypes.is_json_type(df.dtypes["json_col"])

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
result = df.to_pandas(allow_large_results=True)

# These JSON strings are compatible with BigQuery's JSON storage,
pd_df = pd.DataFrame(
{"json_col": ['{"bar":true,"foo":10}']},
dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()),
dtype=bigframes.dtypes.JSON_DTYPE,
)
pd_df.index = pd_df.index.astype("Int64")
pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes)
pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"])
# `check_exact=False` can workaround the known issue in pandas:
# https://github.com/pandas-dev/pandas/issues/60958
pd.testing.assert_series_equal(
result["json_col"], pd_df["json_col"], check_exact=False
)


def test_read_gbq_w_json_in_struct(session):
@@ -695,7 +698,7 @@ def test_read_gbq_w_json_in_struct(session):
assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType)

data = df["struct_col"].struct.field("data")
assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
assert bigframes.dtypes.is_json_type(data.dtype)

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
data = data.to_pandas(allow_large_results=True)
@@ -736,7 +739,7 @@ def test_read_gbq_w_json_in_array(session):

data = df["array_col"]
assert data.list.len()[0] == 7
assert data.list[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
assert bigframes.dtypes.is_json_type(data.list[0].dtype)

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd_data = data.to_pandas(allow_large_results=True)
@@ -933,7 +936,11 @@ def test_read_pandas_json_dataframes(session, write_engine):

if write_engine == "bigquery_streaming":
expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64")
pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
# `check_exact=False` can workaround the known issue in pandas:
# https://github.com/pandas-dev/pandas/issues/60958
pd.testing.assert_frame_equal(
actual_result, expected_df, check_index_type=False, check_exact=False
)


@pytest.mark.parametrize(
@@ -953,8 +960,10 @@ def test_read_pandas_json_series(session, write_engine):
actual_result = session.read_pandas(
expected_series, write_engine=write_engine
).to_pandas(allow_large_results=True)
# `check_exact=False` can workaround the known issue in pandas:
# https://github.com/pandas-dev/pandas/issues/60958
pd.testing.assert_series_equal(
actual_result, expected_series, check_index_type=False
actual_result, expected_series, check_index_type=False, check_exact=False
)

