From 88f9dac80a9e285ca82aeae1e72d7eec87f30c91 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 29 Nov 2023 19:26:00 +0000
Subject: [PATCH] feat: add DataFrame.select_dtypes method

---
Review notes (free-form commentary placed after the three-dash line; it is
not part of the commit message and not part of any hunk):

  * bigframes/dataframe.py: DataFrame.select_dtypes builds an empty pandas
    DataFrame keyed by self._block.value_columns with the dtypes from
    self._block.dtypes, calls pandas' select_dtypes(include=, exclude=) on
    it, and wraps self._block.select_columns(<surviving columns>) in a new
    DataFrame.
  * tests/system/small/test_dataframe.py: parametrized test that compares
    the bigframes result (after to_pandas()) with pandas' own result using
    pd.testing.assert_frame_equal.
  * third_party/bigframes_vendored/pandas/core/frame.py: documentation stub
    whose body raises NotImplementedError.
  * The doctest examples mark the empty line of the expected output with
    the doctest blank-line marker; a literal empty line would end the
    expected output before the "[N rows x M columns]" footer.
  * NOTE(review): line breaks and leading whitespace inside the hunks were
    reconstructed from a whitespace-collapsed copy of this patch. Verify
    indentation against the target files before applying.

 bigframes/dataframe.py                        | 13 +++++++
 tests/system/small/test_dataframe.py          | 20 +++++++++++
 .../bigframes_vendored/pandas/core/frame.py   | 36 +++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index f7796291b9..c6b28f1b01 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -434,6 +434,19 @@ def info(
         # TODO: Convert to different units (kb, mb, etc.)
         obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")
 
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        # Create empty pandas dataframe with same schema and then leverage actual pandas implementation
+        as_pandas = pandas.DataFrame(
+            {
+                col_id: pandas.Series([], dtype=dtype)
+                for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
+            }
+        )
+        selected_columns = tuple(
+            as_pandas.select_dtypes(include=include, exclude=exclude).columns
+        )
+        return DataFrame(self._block.select_columns(selected_columns))
+
     def _set_internal_query_job(self, query_job: bigquery.QueryJob):
         self._query_job = query_job
 
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 9744d3f6e9..5940df590c 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -297,6 +297,26 @@ def test_df_info(scalars_dfs):
     assert expected == bf_result.getvalue()
 
 
+@pytest.mark.parametrize(
+    ("include", "exclude"),
+    [
+        ("Int64", None),
+        (["int"], None),
+        ("number", None),
+        ([pd.Int64Dtype(), pd.BooleanDtype()], None),
+        (None, [pd.Int64Dtype(), pd.BooleanDtype()]),
+        ("Int64", ["boolean"]),
+    ],
+)
+def test_select_dtypes(scalars_dfs, include, exclude):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude)
+    bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas()
+
+    pd.testing.assert_frame_equal(pd_result, bf_result)
+
+
 def test_drop_index(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
 
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 099d8b8e66..3bd90be2e4 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -158,6 +158,42 @@ def memory_usage(self, index: bool = True):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        """
+        Return a subset of the DataFrame's columns based on the column dtypes.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': ["hello", "world"], 'col3': [True, False]})
+            >>> df.select_dtypes(include=['Int64'])
+               col1
+            0     1
+            1     2
+            <BLANKLINE>
+            [2 rows x 1 columns]
+
+            >>> df.select_dtypes(exclude=['Int64'])
+                col2   col3
+            0  hello   True
+            1  world  False
+            <BLANKLINE>
+            [2 rows x 2 columns]
+
+
+        Args:
+            include (scalar or list-like):
+                A selection of dtypes or strings to be included.
+            exclude (scalar or list-like):
+                A selection of dtypes or strings to be excluded.
+
+        Returns:
+            DataFrame: The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     # ----------------------------------------------------------------------
     # IO methods (to / from other formats)
     def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray: