Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

fix: Raise error for large inline DataFrames in read_pandas #1525

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 20 additions & 22 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,13 +794,14 @@ def _read_pandas(
)

if write_engine == "default":
inline_df = self._read_pandas_inline(pandas_dataframe, should_raise=False)
if inline_df is not None:
try:
inline_df = self._read_pandas_inline(pandas_dataframe)
return inline_df
except ValueError:
pass
return self._read_pandas_load_job(pandas_dataframe, api_name)
elif write_engine == "bigquery_inline":
# Regarding the type: ignore, with should_raise=True, this should never return None.
return self._read_pandas_inline(pandas_dataframe, should_raise=True) # type: ignore
return self._read_pandas_inline(pandas_dataframe)
elif write_engine == "bigquery_load":
return self._read_pandas_load_job(pandas_dataframe, api_name)
elif write_engine == "bigquery_streaming":
Expand All @@ -809,12 +810,16 @@ def _read_pandas(
raise ValueError(f"Got unexpected write_engine '{write_engine}'")

def _read_pandas_inline(
self, pandas_dataframe: pandas.DataFrame, should_raise=False
) -> Optional[dataframe.DataFrame]:
self, pandas_dataframe: pandas.DataFrame
) -> dataframe.DataFrame:
import bigframes.dataframe as dataframe

if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES:
return None
memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
if memory_usage > MAX_INLINE_DF_BYTES:
raise ValueError(
f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed "
f"for inline data ({MAX_INLINE_DF_BYTES} bytes)."
)

try:
local_block = blocks.Block.from_local(pandas_dataframe, self)
Expand All @@ -825,29 +830,22 @@ def _read_pandas_inline(
ValueError, # Thrown by ibis for some unhandled types
TypeError, # Not all types handleable by local code path
) as exc:
if should_raise:
raise ValueError(
f"Could not convert with a BigQuery type: `{exc}`. "
) from exc
else:
return None

inline_types = inline_df._block.expr.schema.dtypes
raise ValueError(
f"Could not convert with a BigQuery type: `{exc}`. "
) from exc

# Make sure all types are inlinable to avoid escaping errors.
inline_types = inline_df._block.expr.schema.dtypes
noninlinable_types = [
dtype for dtype in inline_types if dtype not in INLINABLE_DTYPES
]
if len(noninlinable_types) == 0:
return inline_df

if should_raise:
if len(noninlinable_types) != 0:
raise ValueError(
f"Could not inline with a BigQuery type: `{noninlinable_types}`. "
f"{constants.FEEDBACK_LINK}"
)
else:
return None

return inline_df

def _read_pandas_load_job(
self,
Expand Down
33 changes: 33 additions & 0 deletions tests/unit/session/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import google.api_core.exceptions
import google.cloud.bigquery
import google.cloud.bigquery.table
import pandas as pd
import pyarrow as pa
import pytest

import bigframes
Expand Down Expand Up @@ -458,3 +460,34 @@ def today(cls):

with pytest.warns(bigframes.exceptions.ObsoleteVersionWarning):
resources.create_bigquery_session()


@mock.patch("bigframes.session.MAX_INLINE_DF_BYTES", 1)
def test_read_pandas_inline_exceeds_limit_raises_error():
    """With the inline byte ceiling patched down to 1 byte, any non-empty
    DataFrame must be rejected by the "bigquery_inline" write engine."""
    bq_session = resources.create_bigquery_session()
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
    # Regex must match the exact ValueError text raised by _read_pandas_inline.
    expected_message = r"DataFrame size \(.* bytes\) exceeds the maximum allowed for inline data \(1 bytes\)\."
    with pytest.raises(ValueError, match=expected_message):
        bq_session.read_pandas(frame, write_engine="bigquery_inline")


def test_read_pandas_inline_w_interval_type_raises_error():
    """An interval-typed column cannot be converted to a BigQuery type, so the
    "bigquery_inline" write engine surfaces a ValueError instead of silently
    falling back to a load job."""
    bq_session = resources.create_bigquery_session()
    interval_frame = pd.DataFrame(
        pd.arrays.IntervalArray.from_breaks([0, 10, 20, 30, 40, 50])
    )
    with pytest.raises(ValueError, match="Could not convert with a BigQuery type: "):
        bq_session.read_pandas(interval_frame, write_engine="bigquery_inline")


def test_read_pandas_inline_w_noninlineable_type_raises_error():
    """A list-of-int Arrow dtype converts to a BigQuery type but is not in the
    inlinable set, so "bigquery_inline" must raise rather than emit invalid SQL.

    NOTE(review): assumes list dtypes are absent from INLINABLE_DTYPES — the
    error-message prefix asserted below is what that code path emits.
    """
    bq_session = resources.create_bigquery_session()
    list_values = [
        [1, 2, 3],
        [4, 5],
        None,  # include a missing entry to exercise nullable handling
        [6, 7, 8, 9],
    ]
    series = pd.Series(list_values, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
    with pytest.raises(ValueError, match="Could not inline with a BigQuery type:"):
        bq_session.read_pandas(series, write_engine="bigquery_inline")