diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index eab4645477..e8a3968b3d 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -67,6 +67,10 @@
 _MONOTONIC_DECREASING = "monotonic_decreasing"
 
 
+LevelType = typing.Union[str, int]
+LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
+
+
 class BlockHolder(typing.Protocol):
     """Interface for mutable objects with state represented by a block value object."""
 
@@ -1423,9 +1427,7 @@ def _get_unique_values(
             raise ValueError(f"Too many unique values: {pd_values}")
 
         if len(columns) > 1:
-            return pd.MultiIndex.from_frame(
-                pd_values.sort_values(by=list(pd_values.columns), na_position="first")
-            )
+            return pd.MultiIndex.from_frame(pd_values)
         else:
             return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first"))
 
@@ -1611,6 +1613,40 @@ def cached(self) -> Block:
             index_labels=self.index_labels,
         )
 
+    def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]:
+        """Resolve index level reference(s) to index column ids.
+
+        Args:
+            level:
+                A single level reference or a list-like of them. An int is
+                used as a position into ``self.index_columns``; any other
+                hashable is looked up as a level name.
+
+        Returns:
+            The matching index column ids, in the order requested.
+
+        Raises:
+            ValueError:
+                If a level name matches zero or several index columns, or
+                if a reference is neither an int nor hashable.
+        """
+        if utils.is_list_like(level):
+            levels = list(level)
+        else:
+            levels = [level]
+        resolved_level_ids = []
+        for level_ref in levels:
+            if isinstance(level_ref, int):
+                resolved_level_ids.append(self.index_columns[level_ref])
+            elif isinstance(level_ref, typing.Hashable):
+                matching_ids = self.index_name_to_col_id.get(level_ref, [])
+                if len(matching_ids) != 1:
+                    raise ValueError("level name cannot be found or is ambiguous")
+                resolved_level_ids.append(matching_ids[0])
+            else:
+                raise ValueError(f"Unexpected level: {level_ref}")
+        return resolved_level_ids
+
     def _is_monotonic(
         self, column_ids: typing.Union[str, Sequence[str]], increasing: bool
     ) -> bool:
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 5c0d9b78e1..869075a970 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1038,22 +1038,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
             raise ValueError("Columns must be a multiindex to reorder levels.")
 
     def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
-        if utils.is_list_like(level):
-            levels = list(level)
-        else:
-            levels = [level]
-        resolved_level_ids = []
-        for level_ref in levels:
-            if isinstance(level_ref, int):
-                resolved_level_ids.append(self._block.index_columns[level_ref])
-            elif isinstance(level_ref, typing.Hashable):
-                matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
-                if len(matching_ids) != 1:
-                    raise ValueError("level name cannot be found or is ambiguous")
-                resolved_level_ids.append(matching_ids[0])
-            else:
-                raise ValueError(f"Unexpected level: {level_ref}")
-        return resolved_level_ids
+        return self._block.resolve_index_level(level)
 
     def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame:
         block = self._block.rename(columns=columns)
@@ -1802,20 +1787,27 @@ def _stack_multi(self, level: LevelsType = -1):
         block = block.stack(levels=len(level))
         return DataFrame(block)
 
-    def unstack(self):
+    def unstack(self, level: LevelsType = -1):
+        # Normalize a single level reference to a one-element list.
+        if isinstance(level, (int, str)):
+            level = [level]
+
         block = self._block
         # Special case, unstack with mono-index transpose into a series
         if self.index.nlevels == 1:
             block = block.stack(how="right", levels=self.columns.nlevels)
             return bigframes.series.Series(block)
 
-        # Pivot by last level of index
-        index_ids = block.index_columns
+        # Pivot by the requested index levels; the levels that are not
+        # unstacked stay in the row index, in their original order.
+        unstack_ids = self._resolve_levels(level)
         block = block.reset_index(drop=False)
-        block = block.set_index(index_ids[:-1])
+        block = block.set_index(
+            [col for col in self._block.index_columns if col not in unstack_ids]
+        )
 
         pivot_block = block.pivot(
-            columns=[index_ids[-1]],
+            columns=unstack_ids,
             values=self._block.value_columns,
             values_in_index=True,
         )
diff --git a/bigframes/series.py b/bigframes/series.py
index 49df8ab61e..c191452783 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -352,22 +352,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
         return Series(self._block.reorder_levels(resolved_level_ids))
 
     def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
-        if _is_list_like(level):
-            levels = list(level)
-        else:
-            levels = [level]
-        resolved_level_ids = []
-        for level_ref in levels:
-            if isinstance(level_ref, int):
-                resolved_level_ids.append(self._block.index_columns[level_ref])
-            elif isinstance(level_ref, typing.Hashable):
-                matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
-                if len(matching_ids) != 1:
-                    raise ValueError("level name cannot be found or is ambiguous")
-                resolved_level_ids.append(matching_ids[0])
-            else:
-                raise ValueError(f"Unexpected level: {level_ref}")
-        return resolved_level_ids
+        return self._block.resolve_index_level(level)
 
     def between(self, left, right, inclusive="both"):
         if inclusive not in ["both", "neither", "left", "right"]:
@@ -918,6 +903,31 @@ def argmin(self) -> int:
             scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
         )
 
+    def unstack(self, level: LevelsType = -1):
+        # Normalize a single level reference to a one-element list.
+        if isinstance(level, (int, str)):
+            level = [level]
+
+        block = self._block
+
+        if self.index.nlevels == 1:
+            raise ValueError("Series must have multi-index to unstack")
+
+        # Pivot by the requested index levels; the levels that are not
+        # unstacked stay in the row index, in their original order.
+        unstack_ids = self._resolve_levels(level)
+        block = block.reset_index(drop=False)
+        block = block.set_index(
+            [col for col in self._block.index_columns if col not in unstack_ids]
+        )
+
+        pivot_block = block.pivot(
+            columns=unstack_ids,
+            values=self._block.value_columns,
+            values_in_index=False,
+        )
+        return bigframes.dataframe.DataFrame(pivot_block)
+
     def idxmax(self) -> blocks.Label:
         block = self._block.order_by(
             [
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index cb664302a8..8885b03d34 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -400,7 +400,11 @@ def hockey_df(
     hockey_table_id: str, session: bigframes.Session
 ) -> bigframes.dataframe.DataFrame:
     """DataFrame pointing at test data."""
-    return session.read_gbq(hockey_table_id)
+    return (
+        session.read_gbq(hockey_table_id)
+        .set_index(["player_name", "season"])
+        .sort_index()
+    )
 
 
 @pytest.fixture(scope="session")
@@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame:
             "season": pd.Int64Dtype(),
         },
     )
-    df.index = df.index.astype("Int64")
+    df = df.set_index(["player_name", "season"]).sort_index()
     return df
 
 
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 84e8def83b..a746a1867c 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns):
     ],
 )
 def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
-    bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas()
-    pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns)
+    bf_result = (
+        hockey_df.reset_index()
+        .pivot(values=values, index=index, columns=columns)
+        .to_pandas()
+    )
+    pd_result = hockey_pandas_df.reset_index().pivot(
+        values=values, index=index, columns=columns
+    )
 
     # Pandas produces NaN, where bq dataframes produces pd.NA
     pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index b5c78de69c..a87dacae04 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -909,13 +909,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
-def test_multi_index_unstack(hockey_df, hockey_pandas_df):
+@pytest.mark.parametrize(
+    ("level",),
+    [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
+)
+def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level):
     bf_result = (
-        hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas()
+        hockey_df.set_index(["team_name", "position"], append=True)
+        .unstack(level=level)
+        .to_pandas()
     )
     pd_result = hockey_pandas_df.set_index(
-        ["team_name", "season", "position"]
-    ).unstack()
+        ["team_name", "position"], append=True
+    ).unstack(level=level)
+
+    pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("level",),
+    [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
+)
+def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level):
+    bf_result = (
+        hockey_df.set_index(["team_name", "position"], append=True)["number"]
+        .unstack(level=level)
+        .to_pandas()
+    )
+    pd_result = hockey_pandas_df.set_index(["team_name", "position"], append=True)[
+        "number"
+    ].unstack(level=level)
 
     pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
 
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index bd1f9a9a18..f0e13e16f5 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1654,6 +1654,19 @@ def clip(self):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def unstack(self, level=-1):
+        """
+        Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
+
+        Args:
+            level (int, str, or list of these, default last level):
+                Level(s) to unstack, can pass level name.
+
+        Returns:
+            DataFrame: Unstacked Series.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def argmax(self):
         """
         Return int position of the smallest value in the Series.