From edd53e1c71f2ae6b1af6cd0ca308ffd8fb928af9 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 14 Dec 2023 00:40:32 +0000 Subject: [PATCH 1/5] fix: fix DataFrameGroupby.agg() issue with as_index=False --- bigframes/core/block_transforms.py | 1 - bigframes/core/blocks.py | 51 +++++++++--------------------- bigframes/core/groupby/__init__.py | 15 ++++++--- bigframes/series.py | 16 +++++----- tests/system/small/test_groupby.py | 17 +++++++--- 5 files changed, 46 insertions(+), 54 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index df84f70859..6654892287 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -332,7 +332,6 @@ def value_counts( by_column_ids=columns, aggregations=[(dummy, agg_ops.count_op)], dropna=dropna, - as_index=True, ) count_id = agg_ids[0] if normalize: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 34913872e7..90801be2cb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -926,7 +926,6 @@ def aggregate( by_column_ids: typing.Sequence[str] = (), aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp]] = (), *, - as_index: bool = True, dropna: bool = True, ) -> typing.Tuple[Block, typing.Sequence[str]]: """ @@ -947,40 +946,21 @@ def aggregate( aggregate_labels = self._get_labels_for_columns( [agg[0] for agg in aggregations] ) - if as_index: - names: typing.List[Label] = [] - for by_col_id in by_column_ids: - if by_col_id in self.value_columns: - names.append(self.col_id_to_label[by_col_id]) - else: - names.append(self.col_id_to_index_name[by_col_id]) - return ( - Block( - result_expr, - index_columns=by_column_ids, - column_labels=aggregate_labels, - index_labels=names, - ), - output_col_ids, - ) - else: # as_index = False - # If as_index=False, drop grouping levels, but keep grouping value columns - by_value_columns = [ - col for col in by_column_ids if col in self.value_columns - ] - by_column_labels = self._get_labels_for_columns(by_value_columns) - labels = (*by_column_labels, *aggregate_labels) - offsets_id = guid.generate_guid() - result_expr_pruned = result_expr.select_columns( - [*by_value_columns, *output_col_ids] - ).promote_offsets(offsets_id) - - return ( - Block( - result_expr_pruned, index_columns=[offsets_id], column_labels=labels - ), - output_col_ids, - ) + names: typing.List[Label] = [] + for by_col_id in by_column_ids: + if by_col_id in self.value_columns: + names.append(self.col_id_to_label[by_col_id]) + else: + names.append(self.col_id_to_index_name[by_col_id]) + return ( + Block( + result_expr, + index_columns=by_column_ids, + column_labels=aggregate_labels, + index_labels=names, + ), + output_col_ids, + ) def get_stat(self, column_id: str, stat: agg_ops.AggregateOp): """Gets aggregates immediately, and caches it""" @@ -1309,7 +1289,6 @@ def pivot( result_block, _ = block.aggregate( by_column_ids=self.index_columns, aggregations=aggregations, - as_index=True, dropna=True, ) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index a8b8afdae7..32080bd53e 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -263,9 +263,10 @@ def _agg_string(self, func: str) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) + if not self._as_index: + agg_block = agg_block.reset_index() return df.DataFrame(agg_block) def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: @@ -285,7 +286,6 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) if want_aggfunc_level: @@ -297,6 +297,8 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: ) else: agg_block = agg_block.with_column_labels(pd.Index(column_labels)) + if not self._as_index: + agg_block = agg_block.reset_index() return df.DataFrame(agg_block) def _agg_list(self, func: typing.Sequence) -> df.DataFrame: @@ -311,7 +313,6 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) agg_block = agg_block.with_column_labels( @@ -319,6 +320,8 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: column_labels, names=[*self._block.column_labels.names, None] ) ) + if not self._as_index: + agg_block = agg_block.reset_index() return df.DataFrame(agg_block) def _agg_named(self, **kwargs) -> df.DataFrame: @@ -339,10 +342,11 @@ def _agg_named(self, **kwargs) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) agg_block = agg_block.with_column_labels(column_labels) + if not self._as_index: + agg_block = agg_block.reset_index() return df.DataFrame(agg_block) aggregate = agg @@ -379,9 +383,10 @@ def _aggregate_all( result_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) + if not self._as_index: + result_block = result_block.reset_index(drop=False) return df.DataFrame(result_block) def _apply_window_op( diff --git a/bigframes/series.py b/bigframes/series.py index c929775a00..d20b171ecf 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -816,7 +816,6 @@ def mode(self) -> Series: block, agg_ids = block.aggregate( by_column_ids=[self._value_column], aggregations=((self._value_column, agg_ops.count_op),), - as_index=False, ) value_count_col_id = agg_ids[0] block, max_value_count_col_id = block.apply_window_op( @@ -830,14 +829,15 @@ def mode(self) -> Series: ops.eq_op, ) block = block.filter(is_mode_col_id) - mode_values_series = Series( - block.select_column(self._value_column).assign_label( - self._value_column, self.name - ) - ) - return typing.cast( - Series, mode_values_series.sort_values().reset_index(drop=True) + # use temporary name for reset_index to avoid collision, restore after dropping extra columns + block = ( + block.with_index_labels(["mode_temp_internal"]) + .order_by([OrderingColumnReference(self._value_column)]) + .reset_index(drop=False) ) + block = block.select_column(self._value_column).with_column_labels([self.name]) + mode_values_series = Series(block.select_column(self._value_column)) + return typing.cast(Series, mode_values_series) def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 5214905186..2919c167ef 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -122,23 +122,32 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +@pytest.mark.parametrize( + ("as_index"), + [ + (True), + (False), + ], +) def test_dataframe_groupby_agg_dict_with_list( - scalars_df_index, scalars_pandas_df_index + scalars_df_index, scalars_pandas_df_index, as_index ): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( scalars_df_index[col_names] - .groupby("string_col") + .groupby("string_col", as_index=as_index) .agg({"int64_too": ["mean", "max"], "string_col": "count"}) ) pd_result = ( scalars_pandas_df_index[col_names] - .groupby("string_col") + .groupby("string_col", as_index=as_index) .agg({"int64_too": ["mean", "max"], "string_col": "count"}) ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + pd.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False, check_index_type=False + ) def test_dataframe_groupby_agg_dict_no_lists(scalars_df_index, scalars_pandas_df_index): From 32901b12cbe12f68c01015f3b222b7b97525dcb4 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 14 Dec 2023 02:11:13 +0000 Subject: [PATCH 2/5] fix _aggregate_all to properly drop columns --- bigframes/core/groupby/__init__.py | 2 +- tests/system/small/test_multiindex.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 32080bd53e..8aa2d2c8e4 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -386,7 +386,7 @@ def _aggregate_all( dropna=self._dropna, ) if not self._as_index: - result_block = result_block.reset_index(drop=False) + result_block = result_block.reset_index() return df.DataFrame(result_block) def _apply_window_op( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index e7e93849c6..337a7ae6bc 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -367,6 +367,10 @@ def test_multi_index_dataframe_groupby_level_aggregate( .groupby(level=level, as_index=as_index) .mean(numeric_only=True) ) + print("pandas") + print(pd_result.to_string()) + print("bigframes") + print(bf_result.to_string()) # Pandas will have int64 index, while bigquery will have Int64 when resetting pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) From 05d1a4d3e44e1b896285b99b561da86fa17dde0a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 14 Dec 2023 02:28:13 +0000 Subject: [PATCH 3/5] remove unwanted print statements --- tests/system/small/test_multiindex.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 337a7ae6bc..e7e93849c6 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -367,10 +367,6 @@ def test_multi_index_dataframe_groupby_level_aggregate( .groupby(level=level, as_index=as_index) .mean(numeric_only=True) ) - print("pandas") - print(pd_result.to_string()) - print("bigframes") - print(bf_result.to_string()) # Pandas will have int64 index, while bigquery will have Int64 when resetting pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) From 587d83257870759e8d9a4e1b04b36fa4100d1ca6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 15 Dec 2023 19:35:47 +0000 Subject: [PATCH 4/5] more closely match pandas groupby --- bigframes/core/blocks.py | 2 +- bigframes/core/groupby/__init__.py | 35 +++++++++++++++------------ bigframes/dataframe.py | 6 ++--- tests/system/small/test_multiindex.py | 11 +++++++-- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 90801be2cb..53dc664b87 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -66,7 +66,7 @@ _MONOTONIC_DECREASING = "monotonic_decreasing" -LevelType = typing.Union[str, int] +LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 8aa2d2c8e4..3ee46ef675 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -265,9 +265,8 @@ def _agg_string(self, func: str) -> df.DataFrame: aggregations=aggregations, dropna=self._dropna, ) - if not self._as_index: - agg_block = agg_block.reset_index() - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: aggregations: typing.List[typing.Tuple[str, agg_ops.AggregateOp]] = [] @@ -297,9 +296,8 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: ) else: agg_block = agg_block.with_column_labels(pd.Index(column_labels)) - if not self._as_index: - agg_block = agg_block.reset_index() - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _agg_list(self, func: typing.Sequence) -> df.DataFrame: aggregations = [ @@ -320,9 +318,8 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: column_labels, names=[*self._block.column_labels.names, None] ) ) - if not self._as_index: - agg_block = agg_block.reset_index() - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _agg_named(self, **kwargs) -> df.DataFrame: aggregations = [] @@ -345,9 +342,18 @@ def _agg_named(self, **kwargs) -> df.DataFrame: dropna=self._dropna, ) agg_block = agg_block.with_column_labels(column_labels) - if not self._as_index: - agg_block = agg_block.reset_index() - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) + + def _convert_index(self, dataframe: df.DataFrame): + """Convert index levels to columns except where names conflict.""" + levels_to_drop = [ + level for level in dataframe.index.names if level in dataframe.columns + ] + + if len(levels_to_drop) == dataframe.index.nlevels: + return dataframe.reset_index(drop=True) + return dataframe.droplevel(levels_to_drop).reset_index(drop=False) aggregate = agg @@ -385,9 +391,8 @@ def _aggregate_all( aggregations=aggregations, dropna=self._dropna, ) - if not self._as_index: - result_block = result_block.reset_index() - return df.DataFrame(result_block) + dataframe = df.DataFrame(result_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _apply_window_op( self, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3b0fd7008a..ba98a4c6e5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -71,7 +71,7 @@ # TODO(tbergeron): Convert to bytes-based limit MAX_INLINE_DF_SIZE = 5000 -LevelType = typing.Union[str, int] +LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] SingleItemValue = Union[bigframes.series.Series, int, float, Callable] @@ -1940,7 +1940,7 @@ def _stack_mono(self): def _stack_multi(self, level: LevelsType = -1): n_levels = self.columns.nlevels - if isinstance(level, int) or isinstance(level, str): + if not utils.is_list_like(level): level = [level] level_indices = [] for level_ref in level: @@ -1966,7 +1966,7 @@ def _stack_multi(self, level: LevelsType = -1): return DataFrame(block) def unstack(self, level: LevelsType = -1): - if isinstance(level, int) or isinstance(level, str): + if not utils.is_list_like(level): level = [level] block = self._block diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index e7e93849c6..1708735f4c 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -356,17 +356,24 @@ def test_multi_index_dataframe_groupby(scalars_df_index, scalars_pandas_df_index def test_multi_index_dataframe_groupby_level_aggregate( scalars_df_index, scalars_pandas_df_index, level, as_index ): + index_cols = ["int64_too", "bool_col"] bf_result = ( - scalars_df_index.set_index(["int64_too", "bool_col"]) + scalars_df_index.set_index(index_cols) .groupby(level=level, as_index=as_index) .mean(numeric_only=True) .to_pandas() ) pd_result = ( - scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) + scalars_pandas_df_index.set_index(index_cols) .groupby(level=level, as_index=as_index) .mean(numeric_only=True) ) + # For as_index=False, pandas will drop index levels used as groupings + # In the future, it will include this in the result, bigframes already does this behavior + if not as_index: + for col in index_cols: + if col in bf_result.columns: + bf_result = bf_result.drop(col, axis=1) # Pandas will have int64 index, while bigquery will have Int64 when resetting pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) From 71617ef7601caa015430520d998b55b9b93729f5 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 15 Dec 2023 20:42:38 +0000 Subject: [PATCH 5/5] fix mypy error --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ba98a4c6e5..5fe7e36b61 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1950,7 +1950,7 @@ def _stack_multi(self, level: LevelsType = -1): else: level_indices.append(level_ref) else: # str - level_indices.append(self.columns.names.index(level_ref)) + level_indices.append(self.columns.names.index(level_ref)) # type: ignore new_order = [ *[i for i in range(n_levels) if i not in level_indices],