From c40452c5d6a23caf60cc65bec57ed28c766e53d4 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 20 Mar 2025 22:59:57 +0000
Subject: [PATCH 1/2] feat: reading JSON data as the pyarrow JSON type when available

---
 bigframes/bigquery/_operations/json.py |  4 ++--
 bigframes/core/array_value.py          |  4 ++--
 bigframes/core/compile/ibis_types.py   |  3 +--
 bigframes/core/utils.py                | 21 +++++++++++++++------
 bigframes/dtypes.py                    | 19 +++++++++++++++----
 bigframes/session/__init__.py          |  7 ++++++-
 tests/system/small/test_series.py      |  3 +--
 tests/system/small/test_session.py     | 10 ++++++++--
 8 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py
index 07efc5fa51..bf06f63de3 100644
--- a/bigframes/bigquery/_operations/json.py
+++ b/bigframes/bigquery/_operations/json.py
@@ -53,7 +53,7 @@ def json_set(
         >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
         >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
         0    {"a":100,"b":"hi"}
-        Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+        Name: data, dtype: extension<arrow.json>[pyarrow]
 
     Args:
         input (bigframes.series.Series):
@@ -253,7 +253,7 @@ def parse_json(
         dtype: string
         >>> bbq.parse_json(s)
         0    {"class":{"students":[{"id":5},{"id":12}]}}
-        dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+        dtype: extension<arrow.json>[pyarrow]
 
     Args:
         input (bigframes.series.Series):
diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py
index 7ede7b7e65..f451f82d2d 100644
--- a/bigframes/core/array_value.py
+++ b/bigframes/core/array_value.py
@@ -108,8 +108,8 @@ def from_table(
             raise ValueError("must set at most one of 'offests', 'primary_key'")
         if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
             msg = bfe.format_message(
-                "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
-                "is a preview feature and subject to change."
+                "JSON column interpretation as a PyArrow JSON extension type is a preview "
+                "feature and subject to change."
             )
             warnings.warn(msg, bfe.PreviewWarning)
         # define data source only for needed columns, this makes row-hashing cheaper
diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py
index 54b0a1408a..767722a951 100644
--- a/bigframes/core/compile/ibis_types.py
+++ b/bigframes/core/compile/ibis_types.py
@@ -24,7 +24,6 @@
     dtype as python_type_to_ibis_type,
 )
 import bigframes_vendored.ibis.expr.types as ibis_types
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
 import pandas as pd
@@ -75,7 +74,7 @@
         IBIS_GEO_TYPE,
         gpd.array.GeometryDtype(),
     ),
-    (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
+    (ibis_dtypes.json, bigframes.dtypes.JSON_DTYPE),
 )
 
 BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
index 684290bf81..68297df11e 100644
--- a/bigframes/core/utils.py
+++ b/bigframes/core/utils.py
@@ -224,6 +224,15 @@ def timedelta_to_micros(
     raise TypeError(f"Unrecognized input type: {type(timedelta)}")
 
 
+def _is_timedelta64_dtype(dtype: dtypes.Dtype) -> bool:
+    try:
+        return pdtypes.is_timedelta64_dtype(dtype)
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
+        return False
+
+
 def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     """
     Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored.
@@ -234,11 +243,11 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     updated_columns = []
 
     for col in dataframe.columns:
-        if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
+        if _is_timedelta64_dtype(dataframe[col].dtype):
             dataframe[col] = dataframe[col].apply(timedelta_to_micros)
             updated_columns.append(col)
 
-    if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
+    if _is_timedelta64_dtype(dataframe.index.dtype):
         dataframe.index = dataframe.index.map(timedelta_to_micros)
         updated_columns.append(dataframe.index.name)
 
@@ -249,8 +258,6 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
     """
     Searches recursively for JSON array type within a PyArrow DataType.
     """
-    if arrow_type == dtypes.JSON_ARROW_TYPE:
-        return True
     if pa.types.is_list(arrow_type):
         return _search_for_nested_json_type(arrow_type.value_type)
     if pa.types.is_struct(arrow_type):
@@ -258,6 +265,8 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
             if _search_for_nested_json_type(arrow_type.field(i).type):
                 return True
         return False
+    if dtypes.is_json_arrow_type(arrow_type):
+        return True
     return False
 
 
@@ -272,7 +281,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
 
     for col in dataframe.columns:
         column_type = dataframe[col].dtype
-        if column_type == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(column_type):
             dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
             updated_columns.append(col)
         elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
@@ -283,7 +292,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
                 f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
             )
 
-    if dataframe.index.dtype == dtypes.JSON_DTYPE:
+    if dtypes.is_json_type(dataframe.index.dtype):
         dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
         updated_columns.append(dataframe.index.name)
     elif isinstance(
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 22cc521e8e..525b5d3782 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -62,8 +62,9 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
-JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+JSON_ARROW_TYPE = (
+    pa.json_(pa.string()) if hasattr(pa, "JsonType") else db_dtypes.JSONArrowType()
+)
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
     pa.struct(
@@ -169,7 +170,7 @@ class SimpleDtypeInfo:
     ),
     SimpleDtypeInfo(
         dtype=JSON_DTYPE,
-        arrow_dtype=db_dtypes.JSONArrowType(),
+        arrow_dtype=JSON_ARROW_TYPE,
         type_kind=("JSON",),
         orderable=False,
         clusterable=False,
@@ -330,8 +331,18 @@ def is_struct_like(type_: ExpressionType) -> bool:
     )
 
 
+def is_json_arrow_type(type_: pa.DataType) -> bool:
+    return (hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType)) or (
+        not hasattr(pa, "JsonType") and isinstance(type_, db_dtypes.JSONArrowType)
+    )
+
+
+def is_json_type(type_: ExpressionType) -> bool:
+    return isinstance(type_, pd.ArrowDtype) and is_json_arrow_type(type_.pyarrow_dtype)
+
+
 def is_json_like(type_: ExpressionType) -> bool:
-    return type_ == JSON_DTYPE or type_ == STRING_DTYPE  # Including JSON string
+    return is_json_type(type_) or type_ == STRING_DTYPE  # Including JSON string
 
 
 def is_json_encoding_type(type_: ExpressionType) -> bool:
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index dfee41c90b..d8ed11f035 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -814,7 +814,12 @@ def _read_pandas_inline(
     ) -> dataframe.DataFrame:
         import bigframes.dataframe as dataframe
 
-        memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        try:
+            memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        except NotImplementedError:  # TODO: add unit test
+            # Workaround the known issue in pandas:
+            # https://github.com/pandas-dev/pandas/issues/60958
+            raise ValueError("Could not determine the DataFrame's memory usage.")
         if memory_usage > MAX_INLINE_DF_BYTES:
             raise ValueError(
                 f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed "
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index acd267aaf8..1d18f0b310 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -17,7 +17,6 @@
 import re
 import tempfile
 
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import numpy
 from packaging.version import Version
@@ -384,9 +383,9 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):
 
 
 def test_get_column_w_json(json_df, json_pandas_df):
     series = json_df["json_col"]
+    assert dtypes.is_json_type(series.dtype)
     # Until b/401630655 is resolved, json not compatible with allow_large_results=False
     series_pandas = series.to_pandas(allow_large_results=True)
-    assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
     assert series_pandas.shape[0] == json_pandas_df.shape[0]
 
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index e286c40450..413f5f4157 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -933,7 +933,11 @@ def test_read_pandas_json_dataframes(session, write_engine):
     if write_engine == "bigquery_streaming":
         expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64")
 
-    pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_frame_equal(
+        actual_result, expected_df, check_index_type=False, check_exact=False
+    )
 
 
 @pytest.mark.parametrize(
@@ -953,8 +957,10 @@ def test_read_pandas_json_series(session, write_engine):
     actual_result = session.read_pandas(
         expected_series, write_engine=write_engine
     ).to_pandas(allow_large_results=True)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_series_equal(
-        actual_result, expected_series, check_index_type=False
+        actual_result, expected_series, check_index_type=False, check_exact=False
     )
 
 
From f6e85e3e63d1ad1e1cf457525dba42d07544f1f1 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Mon, 24 Mar 2025 20:52:46 +0000
Subject: [PATCH 2/2] fix tests

---
 tests/system/small/test_dataframe.py | 16 +++++++++++-----
 tests/system/small/test_session.py   | 17 ++++++++++-------
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 8cc3be1577..fd83a95b79 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -4607,22 +4607,28 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub
     ],
 )
 def test_df_drop_duplicates_w_json(json_df, keep):
-    bf_df = json_df.drop_duplicates(keep=keep).to_pandas(allow_large_results=True)
+    bf_df = json_df.drop_duplicates(keep=keep)
+    assert dtypes.is_json_type(bf_df.dtypes["json_col"])
 
+    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+    json_pandas_df = json_df.to_pandas(allow_large_results=True)
     # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible
     # with Arrow string extension types. Temporary conversion to standard Pandas
     # strings is required.
-    # allow_large_results=True for b/401630655
-    json_pandas_df = json_df.to_pandas(allow_large_results=True)
     json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
         pd.StringDtype(storage="pyarrow")
     )
-
     pd_df = json_pandas_df.drop_duplicates(keep=keep)
     pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
+    assert dtypes.is_json_type(pd_df.dtypes["json_col"])
+
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_frame_equal(
         pd_df,
-        bf_df,
+        # TODO(b/401630655): JSON is not compatible with allow_large_results=False
+        bf_df.to_pandas(allow_large_results=True),
+        check_exact=False,
     )
 
 
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 413f5f4157..a66a737235 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -22,7 +22,6 @@
 import warnings
 
 import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq
-import db_dtypes  # type:ignore
 import google
 import google.cloud.bigquery as bigquery
 import numpy as np
@@ -633,7 +632,7 @@ def test_read_gbq_w_json(session):
 
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True)
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(df.dtypes["json_col"])
 
     assert df["json_col"][0] == '{"boolean":true}'
     assert df["json_col"][1] == '{"int":100}'
@@ -649,7 +648,7 @@ def test_read_gbq_w_json_and_compare_w_pandas_json(session):
     df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
 
-    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(df.dtypes["json_col"])
 
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     result = df.to_pandas(allow_large_results=True)
 
@@ -657,11 +656,15 @@ def test_read_gbq_w_json_and_compare_w_pandas_json(session):
     # These JSON strings are compatible with BigQuery's JSON storage,
     pd_df = pd.DataFrame(
         {"json_col": ['{"bar":true,"foo":10}']},
-        dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()),
+        dtype=bigframes.dtypes.JSON_DTYPE,
     )
     pd_df.index = pd_df.index.astype("Int64")
     pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes)
-    pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"])
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_series_equal(
+        result["json_col"], pd_df["json_col"], check_exact=False
+    )
 
 
 def test_read_gbq_w_json_in_struct(session):
@@ -695,7 +698,7 @@ def test_read_gbq_w_json_in_struct(session):
     assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType)
 
     data = df["struct_col"].struct.field("data")
-    assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(data.dtype)
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     data = data.to_pandas(allow_large_results=True)
 
@@ -736,7 +739,7 @@ def test_read_gbq_w_json_in_array(session):
     data = df["array_col"]
 
     assert data.list.len()[0] == 7
-    assert data.list[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
+    assert bigframes.dtypes.is_json_type(data.list[0].dtype)
     # TODO(b/401630655): JSON is not compatible with allow_large_results=False
     pd_data = data.to_pandas(allow_large_results=True)