From 26b50754d72784b4da0701207f13764b66fac1f2 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 30 Nov 2023 00:08:45 +0000 Subject: [PATCH 1/2] feat: add DataFrame from_dict and from_records methods --- bigframes/dataframe.py | 26 +++++++ tests/system/small/test_dataframe.py | 48 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 75 +++++++++++++++++++ 3 files changed, 149 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f7796291b9..fe5a18b3bd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2368,6 +2368,32 @@ def _split( blocks = self._block._split(ns=ns, fracs=fracs, random_state=random_state) return [DataFrame(block) for block in blocks] + @classmethod + def from_dict( + cls, + data: dict, + orient: str = "columns", + dtype=None, + columns=None, + ) -> DataFrame: + return cls(pandas.DataFrame.from_dict(data, orient, dtype, columns)) + + @classmethod + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + ) -> DataFrame: + return cls( + pandas.DataFrame.from_records( + data, index, exclude, columns, coerce_float, nrows + ) + ) + def to_csv( self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True ) -> None: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9744d3f6e9..b6834dd2b5 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3289,6 +3289,54 @@ def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) +def test_df_from_dict_columns_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="columns") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_index_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict( + data, orient="index", columns=["col1", "col2"] + ).to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_tight_orient(): + data = { + "index": [("i1", "i2"), ("i3", "i4")], + "columns": ["col1", "col2"], + "data": [[1, 2.6], [3, 4.5]], + "index_names": ["in1", "in2"], + "column_names": ["column_axis"], + } + + bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="tight") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_records(): + records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d")) + + bf_result = dataframe.DataFrame.from_records( + records, columns=["c1", "c2"] + ).to_pandas() + pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): unsupported = ["numeric_col"] # formatted differently bf_result = scalars_df_index.drop(columns=unsupported).to_dict() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 099d8b8e66..384036681a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -160,6 +160,81 @@ def memory_usage(self, index: bool = True): # ---------------------------------------------------------------------- # IO methods (to / from other formats) + @classmethod + def from_dict( + cls, + data: dict, + orient="columns", + dtype=None, + columns=None, + ) -> DataFrame: + """ + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Args: + data (dict): + Of the form {field : array-like} or {field : dict}. + orient ({'columns', 'index', 'tight'}, default 'columns'): + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + If 'tight', assume a dict with keys ['index', 'columns', 'data', + 'index_names', 'column_names']. + dtype (dtype, default None): + Data type to force after DataFrame construction, otherwise infer. + columns (list, default None): + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'`` or ``orient='tight'``. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + ) -> DataFrame: + """ + Convert structured or record ndarray to DataFrame. + + Creates a DataFrame object from a structured ndarray, sequence of + tuples or dicts, or DataFrame. + + Args: + data (structured ndarray, sequence of tuples or dicts): + Structured input data. + index (str, list of fields, array-like): + Field of array to use as the index, alternately a specific set of + input labels to use. + exclude (sequence, default None): + Columns or fields to exclude. + columns (sequence, default None): + Column names to use. If the passed data do not have names + associated with them, this argument provides names for the + columns. Otherwise this argument indicates the order of the columns + in the result (any names not found in the data will become all-NA + columns). + coerce_float (bool, default False): + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + nrows (int, default None): + Number of rows to read if data is an iterator. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: """ Convert the DataFrame to a NumPy array. From 8e7e5e238ca2426c2d0b78fe4e5ebd2adbdb5629 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 30 Nov 2023 17:32:11 +0000 Subject: [PATCH 2/2] ignore mypy error --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index fe5a18b3bd..79714b02e5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2376,7 +2376,7 @@ def from_dict( dtype=None, columns=None, ) -> DataFrame: - return cls(pandas.DataFrame.from_dict(data, orient, dtype, columns)) + return cls(pandas.DataFrame.from_dict(data, orient, dtype, columns)) # type: ignore @classmethod def from_records(