From c40452c5d6a23caf60cc65bec57ed28c766e53d4 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 20 Mar 2025 22:59:57 +0000
Subject: [PATCH 1/2] feat: reading JSON data as the pyarrow JSON type when available

---
 bigframes/bigquery/_operations/json.py |  4 ++--
 bigframes/core/array_value.py          |  4 ++--
 bigframes/core/compile/ibis_types.py   |  3 +--
 bigframes/core/utils.py                | 21 +++++++++++++++------
 bigframes/dtypes.py                    | 19 +++++++++++++++----
 bigframes/session/__init__.py          |  7 ++++++-
 tests/system/small/test_series.py      |  3 +--
 tests/system/small/test_session.py     | 10 ++++++++--
 8 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py
index 07efc5fa51..bf06f63de3 100644
--- a/bigframes/bigquery/_operations/json.py
+++ b/bigframes/bigquery/_operations/json.py
@@ -53,7 +53,7 @@ def json_set(
         >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
         >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
         0    {"a":100,"b":"hi"}
-        Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+        Name: data, dtype: extension<arrow.json>[pyarrow]
 
     Args:
         input (bigframes.series.Series):
@@ -253,7 +253,7 @@ def parse_json(
         dtype: string
         >>> bbq.parse_json(s)
         0    {"class":{"students":[{"id":5},{"id":12}]}}
-        dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+        dtype: extension<arrow.json>[pyarrow]
 
     Args:
         input (bigframes.series.Series):
diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py
index 7ede7b7e65..f451f82d2d 100644
--- a/bigframes/core/array_value.py
+++ b/bigframes/core/array_value.py
@@ -108,8 +108,8 @@ def from_table(
             raise ValueError("must set at most one of 'offests', 'primary_key'")
         if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
             msg = bfe.format_message(
-                "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
-                "is a preview feature and subject to change."
+                "JSON column interpretation as a PyArrow JSON extension type is a preview "
+                "feature and subject to change."
             )
             warnings.warn(msg, bfe.PreviewWarning)
         # define data source only for needed columns, this makes row-hashing cheaper
diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py
index 54b0a1408a..767722a951 100644
--- a/bigframes/core/compile/ibis_types.py
+++ b/bigframes/core/compile/ibis_types.py
@@ -24,7 +24,6 @@
     dtype as python_type_to_ibis_type,
 )
 import bigframes_vendored.ibis.expr.types as ibis_types
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
 import pandas as pd
@@ -75,7 +74,7 @@
         IBIS_GEO_TYPE,
         gpd.array.GeometryDtype(),
     ),
-    (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
+    (ibis_dtypes.json, bigframes.dtypes.JSON_DTYPE),
 )
 
 BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
index 684290bf81..68297df11e 100644
--- a/bigframes/core/utils.py
+++ b/bigframes/core/utils.py
@@ -224,6 +224,15 @@ def timedelta_to_micros(
     raise TypeError(f"Unrecognized input type: {type(timedelta)}")
 
 
+def _is_timedelta64_dtype(dtype: dtypes.Dtype) -> bool:
+    try:
+        return pdtypes.is_timedelta64_dtype(dtype)
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
+        return False
+
+
 def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     """
     Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored.
@@ -234,11 +243,11 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     updated_columns = []
 
     for col in dataframe.columns:
-        if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
+        if _is_timedelta64_dtype(dataframe[col].dtype):
             dataframe[col] = dataframe[col].apply(timedelta_to_micros)
             updated_columns.append(col)
 
-    if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
+    if _is_timedelta64_dtype(dataframe.index.dtype):
         dataframe.index = dataframe.index.map(timedelta_to_micros)
         updated_columns.append(dataframe.index.name)
 
@@ -249,8 +258,6 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
     """
     Searches recursively for JSON array type within a PyArrow DataType.
     """
-    if arrow_type == dtypes.JSON_ARROW_TYPE:
-        return True
     if pa.types.is_list(arrow_type):
         return _search_for_nested_json_type(arrow_type.value_type)
     if pa.types.is_struct(arrow_type):
@@ -258,6 +265,8 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
             if _search_for_nested_json_type(arrow_type.field(i).type):
                 return True
         return False
+    if dtypes.is_json_arrow_type(arrow_type):
+        return True
     return False
 
 
@@ -272,7 +281,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
 
     for col in dataframe.columns:
         column_type = dataframe[col].dtype
-        if column_type == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(column_type):
             dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
             updated_columns.append(col)
         elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
@@ -283,7 +292,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
                 f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
             )
 
-    if dataframe.index.dtype == dtypes.JSON_DTYPE:
+    if dtypes.is_json_type(dataframe.index.dtype):
         dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
         updated_columns.append(dataframe.index.name)
     elif isinstance(
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 22cc521e8e..525b5d3782 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -62,8 +62,9 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
-JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+JSON_ARROW_TYPE = (
+    pa.json_(pa.string()) if hasattr(pa, "JsonType") else db_dtypes.JSONArrowType()
+)
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
     pa.struct(
@@ -169,7 +170,7 @@ class SimpleDtypeInfo:
     ),
     SimpleDtypeInfo(
         dtype=JSON_DTYPE,
-        arrow_dtype=db_dtypes.JSONArrowType(),
+        arrow_dtype=JSON_ARROW_TYPE,
         type_kind=("JSON",),
         orderable=False,
         clusterable=False,
@@ -330,8 +331,18 @@ def is_struct_like(type_: ExpressionType) -> bool:
     )
 
 
+def is_json_arrow_type(type_: pa.DataType) -> bool:
+    return (hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType)) or (
+        not hasattr(pa, "JsonType") and isinstance(type_, db_dtypes.JSONArrowType)
+    )
+
+
+def is_json_type(type_: ExpressionType) -> bool:
+    return isinstance(type_, pd.ArrowDtype) and is_json_arrow_type(type_.pyarrow_dtype)
+
+
 def is_json_like(type_: ExpressionType) -> bool:
-    return type_ == JSON_DTYPE or type_ == STRING_DTYPE  # Including JSON string
+    return is_json_type(type_) or type_ == STRING_DTYPE  # Including JSON string
 
 
 def is_json_encoding_type(type_: ExpressionType) -> bool:
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index dfee41c90b..d8ed11f035 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -814,7 +814,12 @@ def _read_pandas_inline(
     ) -> dataframe.DataFrame:
         import bigframes.dataframe as dataframe
 
-        memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        try:
+            memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        except NotImplementedError:  # TODO: add unit test
+            # Workaround the known issue in pandas:
+            # https://github.com/pandas-dev/pandas/issues/60958
+            raise ValueError("Could not determine the DataFrame's memory usage.")
         if memory_usage > MAX_INLINE_DF_BYTES:
             raise ValueError(
                 f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed "
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index acd267aaf8..1d18f0b310 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -17,7 +17,6 @@
 import re
 import tempfile
 
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import numpy
 from packaging.version import Version
@@ -384,9 +383,9 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):
 
 
 def test_get_column_w_json(json_df, json_pandas_df):
     series = json_df["json_col"]
+    assert dtypes.is_json_type(series.dtype)
     # Until b/401630655 is resolved, json not compatible with allow_large_results=False
     series_pandas = series.to_pandas(allow_large_results=True)
-    assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
     assert series_pandas.shape[0] == json_pandas_df.shape[0]
 
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index e286c40450..413f5f4157 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -933,7 +933,11 @@ def test_read_pandas_json_dataframes(session, write_engine):
     if write_engine == "bigquery_streaming":
         expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64")
 
-    pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_frame_equal(
+        actual_result, expected_df, check_index_type=False, check_exact=False
+    )
 
 
 @pytest.mark.parametrize(
@@ -953,8 +957,10 @@ def test_read_pandas_json_series(session, write_engine):
     actual_result = session.read_pandas(
         expected_series, write_engine=write_engine
     ).to_pandas(allow_large_results=True)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_series_equal(
-        actual_result, expected_series, check_index_type=False
+        actual_result, expected_series, check_index_type=False, check_exact=False
     )
 
 
From f6e85e3e63d1ad1e1cf457525dba42d07544f1f1 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Mon, 24 Mar 2025 20:52:46 +0000
Subject: [PATCH 2/2] fix tests

---
 tests/system/small/test_dataframe.py | 16 +++++++++++-----
 tests/system/small/test_session.py   | 17 ++++++++++-------
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 8cc3be1577..fd83a95b79 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -4607,22 +4607,28 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub
     ],
 )
 def test_df_drop_duplicates_w_json(json_df, keep):
-    bf_df = json_df.drop_duplicates(keep=keep).to_pandas(allow_large_results=True)
+    bf_df = json_df.drop_duplicates(keep=keep)
+    assert dtypes.is_json_type(bf_df.dtypes["json_col"])
 
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    json_pandas_df = json_df.to_pandas(allow_large_results=True)
     # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible
     # with Arrow string extension types. Temporary conversion to standard Pandas
     # strings is required.
-    # allow_large_results=True for b/401630655
-    json_pandas_df = json_df.to_pandas(allow_large_results=True)
     json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
         pd.StringDtype(storage="pyarrow")
     )
-
     pd_df = json_pandas_df.drop_duplicates(keep=keep)
     pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
+    assert dtypes.is_json_type(pd_df.dtypes["json_col"])
+
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_frame_equal(
         pd_df,
-        bf_df,
+        # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+        bf_df.to_pandas(allow_large_results=True),
+        check_exact=False,
     )
 
 
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 413f5f4157..a66a737235 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -22,7 +22,6 @@
 import warnings
 
 import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq
-import db_dtypes  # type:ignore
 import google
 import google.cloud.bigquery as bigquery
 import numpy as np
@@ -633,7 +632,7 @@ def test_read_gbq_w_json(session):
 
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(df.dtypes["json_col"])
 
     assert df["json_col"][0] == '{"boolean":true}'
     assert df["json_col"][1] == '{"int":100}'
@@ -649,7 +648,7 @@ def test_read_gbq_w_json_and_compare_w_pandas_json(session):
     df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
 
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(df.dtypes["json_col"])
 
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     result = df.to_pandas(allow_large_results=True)
 
@@ -657,11 +656,15 @@ def test_read_gbq_w_json_and_compare_w_pandas_json(session):
     # These JSON strings are compatible with BigQuery's JSON storage,
     pd_df = pd.DataFrame(
         {"json_col": ['{"bar":true,"foo":10}']},
-        dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()),
+        dtype=bigframes.dtypes.JSON_DTYPE,
     )
     pd_df.index = pd_df.index.astype("Int64")
     pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes)
-    pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"])
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_series_equal(
+        result["json_col"], pd_df["json_col"], check_exact=False
+    )
 
 
 def test_read_gbq_w_json_in_struct(session):
@@ -695,7 +698,7 @@ def test_read_gbq_w_json_in_struct(session):
     assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType)
 
     data = df["struct_col"].struct.field("data")
-    assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(data.dtype)
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     data = data.to_pandas(allow_large_results=True)
 
@@ -736,7 +739,7 @@ def test_read_gbq_w_json_in_array(session):
     data = df["array_col"]
 
     assert data.list.len()[0] == 7
-    assert data.list[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(data.list[0].dtype)
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     pd_data = data.to_pandas(allow_large_results=True)