From 181fd1667b3e37483eeedc8748c48e4b3dcf8a10 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 5 Oct 2023 00:10:40 +0000 Subject: [PATCH 1/3] feat: add level param to DataFrame.stack --- bigframes/core/blocks.py | 12 +++---- bigframes/dataframe.py | 45 +++++++++++++++++++++------ tests/system/small/test_dataframe.py | 2 +- tests/system/small/test_multiindex.py | 38 ++++++++++++++-------- 4 files changed, 68 insertions(+), 29 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0161d17361..f9b36eec7e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1226,20 +1226,20 @@ def pivot( return result_block.with_column_labels(column_index) - def stack(self, how="left", dropna=True, sort=True, levels: int = 1): + def stack(self, how="left", levels: int = 1): """Unpivot last column axis level into row axis""" + if levels == 0: + return self + # These are the values that will be turned into rows col_labels, row_labels = utils.split_index(self.column_labels, levels=levels) - if dropna: - row_labels = row_labels.drop_duplicates() - if sort: - row_labels = row_labels.sort_values() + row_labels = row_labels.drop_duplicates() row_label_tuples = utils.index_as_tuples(row_labels) if col_labels is not None: - result_index = col_labels.drop_duplicates().sort_values().dropna(how="all") + result_index = col_labels.drop_duplicates().dropna(how="all") result_col_labels = utils.index_as_tuples(result_index) else: result_index = pd.Index([None]) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index eea8beb130..7ac2d3b92a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1708,16 +1708,43 @@ def pivot( ) return DataFrame(pivot_block) - def stack(self): - # TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack. - # TODO: match impl to pandas future_stack as described in pandas 2.1 release notes - stack_block = self._block.stack() - result_block = block_ops.dropna( - stack_block, stack_block.value_columns, how="all" - ) + def stack(self, level: LevelsType = -1): if not isinstance(self.columns, pandas.MultiIndex): - return bigframes.series.Series(result_block) - return DataFrame(result_block) + if level not in [0, -1, self.columns.name]: + raise IndexError(f"Invalid level {level} for single-level index") + return self._stack_mono() + return self._stack_multi(level) + + def _stack_mono(self): + result_block = self._block.stack() + return bigframes.series.Series(result_block) + + def _stack_multi(self, level: LevelsType = -1): + n_levels = self.columns.nlevels + if isinstance(level, int) or isinstance(level, str): + level = [level] + level_indices = [] + for level_ref in level: + if isinstance(level_ref, int): + if level_ref < 0: + level_indices.append(n_levels + level_ref) + else: + level_indices.append(level_ref) + else: # str + level_indices.append(self.columns.names.index(level_ref)) + + new_order = [ + *[i for i in range(n_levels) if i not in level_indices], + *level_indices, + ] + + original_columns = typing.cast(pandas.MultiIndex, self.columns) + new_columns = original_columns.reorder_levels(new_order) + + block = self._block.with_column_labels(new_columns) + + block = block.stack(levels=len(level)) + return DataFrame(block) def unstack(self): block = self._block diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b8616a54d6..eacb733a0b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1838,7 +1838,7 @@ def test_df_stack(scalars_dfs): columns = ["int64_col", "int64_too", "rowindex_2"] bf_result = scalars_df[columns].stack().to_pandas() - pd_result = scalars_pandas_df[columns].stack() + pd_result = scalars_pandas_df[columns].stack(future_stack=True) # Pandas produces NaN, where bq dataframes produces pd.NA pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 19f1c557ef..a132676770 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -718,25 +718,37 @@ def test_column_multi_index_cumsum(scalars_df_index, scalars_pandas_df_index): pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) -def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index): - columns = ["int64_too", "int64_col", "rowindex_2"] +@pytest.mark.parametrize( + ("level",), + [(["l3", "l1"],), ([-2, -1],), (["l3"],), ("l2",), (-3,)], +) +def test_column_multi_index_stack(level): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") + level1 = pandas.Index(["b", "a", "b"]) - # Need resulting column to be pyarrow string rather than object dtype - level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") - multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) - bf_df = scalars_df_index[columns].copy() - bf_df.columns = multi_columns - pd_df = scalars_pandas_df_index[columns].copy() - pd_df.columns = multi_columns + level2 = pandas.Index(["a", "b", "b"]) + level3 = pandas.Index(["b", "b", "a"]) - bf_result = bf_df.stack().to_pandas() - # Shifting sort behavior in stack - pd_result = pd_df.stack() + multi_columns = pandas.MultiIndex.from_arrays( + [level1, level2, level3], names=["l1", "l2", "l3"] + ) + pd_df = pandas.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=[5, 2, None], + columns=multi_columns, + dtype="Int64", + ) + bf_df = bpd.DataFrame(pd_df) + + bf_result = bf_df.stack(level=level).to_pandas() + # BigFrames emulates future_stack impl + pd_result = pd_df.stack(level=level, future_stack=True) # Pandas produces NaN, where bq dataframes produces pd.NA # Column ordering seems to depend on pandas version pandas.testing.assert_frame_equal( - bf_result.sort_index(axis=1), pd_result.sort_index(axis=1), check_dtype=False + bf_result, pd_result, check_dtype=False, check_index_type=False ) From cea8232876e67b783c8c0689afa4a7cf120b606e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 5 Oct 2023 18:23:19 +0000 Subject: [PATCH 2/3] fix test failures --- bigframes/dataframe.py | 4 +--- tests/system/small/test_dataframe.py | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7ac2d3b92a..0327894263 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1750,9 +1750,7 @@ def unstack(self): block = self._block # Special case, unstack with mono-index transpose into a series if self.index.nlevels == 1: - block = block.stack( - how="right", dropna=False, sort=False, levels=self.columns.nlevels - ) + block = block.stack(how="right", levels=self.columns.nlevels) return bigframes.series.Series(block) # Pivot by last level of index diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index eacb733a0b..f7a64373aa 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1830,6 +1830,8 @@ def test_df_describe(scalars_dfs): def test_df_stack(scalars_dfs): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") scalars_df, scalars_pandas_df = scalars_dfs # To match bigquery dataframes scalars_pandas_df = scalars_pandas_df.copy() From f85db721b353cbdd2b94fbd1c0c00147c05d0283 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 10 Oct 2023 01:02:20 +0000 Subject: [PATCH 3/3] fix merge issue --- bigframes/core/block_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 4c30d7631d..904da7f312 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -53,7 +53,7 @@ def equals(block1: blocks.Block, block2: blocks.Block) -> bool: joined_block = joined_block.select_columns(equality_ids).with_column_labels( list(range(len(equality_ids))) ) - stacked_block = joined_block.stack(dropna=False, sort=False) + stacked_block = joined_block.stack() result = stacked_block.get_stat(stacked_block.value_columns[0], agg_ops.all_op) return typing.cast(bool, result)