From c4ba6148d0eed921ad783b4e6818ba501bd026dc Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 5 Jan 2024 02:00:22 +0000 Subject: [PATCH 1/2] fix: handle multi-level columns for df aggregates properly --- bigframes/core/blocks.py | 12 ++++++++++-- tests/system/small/test_multiindex.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e88326795c..b5277935cd 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -855,13 +855,21 @@ def aggregate_all_and_stack( aggregations = [ (col_id, operation, col_id) for col_id in self.value_columns ] + index_col_ids = [ + guid.generate_guid() for i in range(self.column_labels.nlevels) + ] result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( row_labels=self.column_labels.to_list(), - index_col_ids=["index"], + index_col_ids=index_col_ids, unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]), dtype=dtype, ) - return Block(result_expr, index_columns=["index"], column_labels=[None]) + return Block( + result_expr, + index_columns=index_col_ids, + column_labels=[None], + index_labels=self.column_labels.names, + ) else: # axis_n == 1 # using offsets as identity to group on. # TODO: Allow to promote identity/total_order columns instead for better perf diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 2d4e1f0204..2813089f37 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -682,6 +682,25 @@ def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index) pandas.testing.assert_series_equal(bf_result, pd_result) +def test_column_multi_index_any(): + columns = pandas.MultiIndex.from_tuples( + [("col0", "col00"), ("col0", "col00"), ("col1", "col11")] + ) + pd_df = pandas.DataFrame( + [[0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2]], columns=columns + ) + bf_df = bpd.DataFrame(pd_df) + + pd_result = pd_df.isna().any() + bf_result = bf_df.isna().any().to_pandas() + + pandas.testing.assert_frame_equal( + bf_result.reset_index(drop=False), + pd_result.reset_index(drop=False), + check_dtype=False, + ) + + def test_column_multi_index_agg(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "float64_col"] multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) From c09e5b7f8015fee66636cbe5481f059e2bfc4677 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 5 Jan 2024 18:05:03 +0000 Subject: [PATCH 2/2] skip legacy pandas for new test --- tests/system/small/test_multiindex.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 2813089f37..c6e61eced0 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -16,7 +16,7 @@ import pytest import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal +from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas # Row Multi-index tests @@ -682,6 +682,7 @@ def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index) pandas.testing.assert_series_equal(bf_result, pd_result) +@skip_legacy_pandas def test_column_multi_index_any(): columns = pandas.MultiIndex.from_tuples( [("col0", "col00"), ("col0", "col00"), ("col1", "col11")]