From e452ed93ad9d08d053d2bc0e279f096375c21b1b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 15 Sep 2025 19:59:31 +0000 Subject: [PATCH] feat: Add rank(pct=True) support --- bigframes/core/block_transforms.py | 7 ++ bigframes/core/groupby/dataframe_group_by.py | 7 +- bigframes/core/groupby/series_group_by.py | 7 +- bigframes/dataframe.py | 5 +- bigframes/series.py | 5 +- tests/system/small/test_dataframe.py | 15 ++-- tests/system/small/test_groupby.py | 73 +++++-------------- tests/system/small/test_series.py | 44 ++++++++++- .../bigframes_vendored/pandas/core/generic.py | 4 + .../pandas/core/groupby/__init__.py | 2 + 10 files changed, 100 insertions(+), 69 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 279643b91d..2ee3dc38b3 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -417,6 +417,7 @@ def rank( ascending: bool = True, grouping_cols: tuple[str, ...] = (), columns: tuple[str, ...] = (), + pct: bool = False, ): if method not in ["average", "min", "max", "first", "dense"]: raise ValueError( @@ -459,6 +460,12 @@ def rank( ), skip_reproject_unsafe=(col != columns[-1]), ) + if pct: + block, max_id = block.apply_window_op( + rownum_id, agg_ops.max_op, windows.unbound(grouping_keys=grouping_cols) + ) + block, rownum_id = block.project_expr(ops.div_op.as_expr(rownum_id, max_id)) + rownum_col_ids.append(rownum_id) # Step 2: Apply aggregate to groups of like input values. 
diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index 7d3d3ada69..21f49fe563 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -181,7 +181,11 @@ def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFr return self._aggregate_all(agg_ops.median_op, numeric_only=True) def rank( - self, method="average", ascending: bool = True, na_option: str = "keep" + self, + method="average", + ascending: bool = True, + na_option: str = "keep", + pct: bool = False, ) -> df.DataFrame: return df.DataFrame( block_ops.rank( @@ -191,6 +195,7 @@ def rank( ascending, grouping_cols=tuple(self._by_col_ids), columns=tuple(self._selected_cols), + pct=pct, ) ) diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py index 041cc1b3dd..8ab39d27cc 100644 --- a/bigframes/core/groupby/series_group_by.py +++ b/bigframes/core/groupby/series_group_by.py @@ -100,7 +100,11 @@ def mean(self, *args) -> series.Series: return self._aggregate(agg_ops.mean_op) def rank( - self, method="average", ascending: bool = True, na_option: str = "keep" + self, + method="average", + ascending: bool = True, + na_option: str = "keep", + pct: bool = False, ) -> series.Series: return series.Series( block_ops.rank( @@ -110,6 +114,7 @@ def rank( ascending, grouping_cols=tuple(self._by_col_ids), columns=(self._value_column,), + pct=pct, ) ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ff730be4a8..371f69e713 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4990,9 +4990,12 @@ def rank( numeric_only=False, na_option: str = "keep", ascending=True, + pct: bool = False, ) -> DataFrame: df = self._drop_non_numeric() if numeric_only else self - return DataFrame(block_ops.rank(df._block, method, na_option, ascending)) + return DataFrame( + block_ops.rank(df._block, method, na_option, ascending, pct=pct) + ) def 
first_valid_index(self): return diff --git a/bigframes/series.py b/bigframes/series.py index e44cf417ab..da2f3f07c4 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -851,8 +851,11 @@ def rank( numeric_only=False, na_option: str = "keep", ascending: bool = True, + pct: bool = False, ) -> Series: - return Series(block_ops.rank(self._block, method, na_option, ascending)) + return Series( + block_ops.rank(self._block, method, na_option, ascending, pct=pct) + ) def fillna(self, value=None) -> Series: return self._apply_binary_op(value, ops.fillna_op) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 95aec9906f..bad90d0562 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5442,13 +5442,13 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): @pytest.mark.parametrize( - ("na_option", "method", "ascending", "numeric_only"), + ("na_option", "method", "ascending", "numeric_only", "pct"), [ - ("keep", "average", True, True), - ("top", "min", False, False), - ("bottom", "max", False, False), - ("top", "first", False, False), - ("bottom", "dense", False, False), + ("keep", "average", True, True, True), + ("top", "min", False, False, False), + ("bottom", "max", False, False, True), + ("top", "first", False, False, False), + ("bottom", "dense", False, False, True), ], ) def test_df_rank_with_nulls( @@ -5458,6 +5458,7 @@ def test_df_rank_with_nulls( method, ascending, numeric_only, + pct, ): unsupported_columns = ["geography_col"] bf_result = ( @@ -5467,6 +5468,7 @@ def test_df_rank_with_nulls( method=method, ascending=ascending, numeric_only=numeric_only, + pct=pct, ) .to_pandas() ) @@ -5477,6 +5479,7 @@ def test_df_rank_with_nulls( method=method, ascending=ascending, numeric_only=numeric_only, + pct=pct, ) .astype(pd.Float64Dtype()) ) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index dba8d46676..553a12a14a 
100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -96,41 +96,22 @@ def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q @pytest.mark.parametrize( - ("na_option", "method", "ascending"), + ("na_option", "method", "ascending", "pct"), [ ( "keep", "average", True, - ), - ( - "top", - "min", - False, - ), - ( - "bottom", - "max", - False, - ), - ( - "top", - "first", - False, - ), - ( - "bottom", - "dense", False, ), + ("top", "min", False, False), + ("bottom", "max", False, False), + ("top", "first", False, True), + ("bottom", "dense", False, True), ], ) def test_dataframe_groupby_rank( - scalars_df_index, - scalars_pandas_df_index, - na_option, - method, - ascending, + scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") @@ -138,21 +119,13 @@ def test_dataframe_groupby_rank( bf_result = ( scalars_df_index[col_names] .groupby("string_col") - .rank( - na_option=na_option, - method=method, - ascending=ascending, - ) + .rank(na_option=na_option, method=method, ascending=ascending, pct=pct) ).to_pandas() pd_result = ( ( scalars_pandas_df_index[col_names] .groupby("string_col") - .rank( - na_option=na_option, - method=method, - ascending=ascending, - ) + .rank(na_option=na_option, method=method, ascending=ascending, pct=pct) ) .astype("float64") .astype("Float64") @@ -737,41 +710,37 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): @pytest.mark.parametrize( - ("na_option", "method", "ascending"), + ("na_option", "method", "ascending", "pct"), [ - ( - "keep", - "average", - True, - ), + ("keep", "average", True, False), ( "top", "min", False, + True, ), ( "bottom", "max", False, + True, ), ( "top", "first", False, + True, ), ( "bottom", "dense", False, + False, ), ], ) def test_series_groupby_rank( - scalars_df_index, - 
scalars_pandas_df_index, - na_option, - method, - ascending, + scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") @@ -779,21 +748,13 @@ def test_series_groupby_rank( bf_result = ( scalars_df_index[col_names] .groupby("string_col")["int64_col"] - .rank( - na_option=na_option, - method=method, - ascending=ascending, - ) + .rank(na_option=na_option, method=method, ascending=ascending, pct=pct) ).to_pandas() pd_result = ( ( scalars_pandas_df_index[col_names] .groupby("string_col")["int64_col"] - .rank( - na_option=na_option, - method=method, - ascending=ascending, - ) + .rank(na_option=na_option, method=method, ascending=ascending, pct=pct) ) .astype("float64") .astype("Float64") diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index ca08f8dece..0a761a3a3a 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2704,10 +2704,48 @@ def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): ) -def test_rank_ints(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("na_option", "method", "ascending", "numeric_only", "pct"), + [ + ("keep", "average", True, True, False), + ("top", "min", False, False, True), + ("bottom", "max", False, False, False), + ("top", "first", False, False, True), + ("bottom", "dense", False, False, False), + ], +) +def test_series_rank( + scalars_df_index, + scalars_pandas_df_index, + na_option, + method, + ascending, + numeric_only, + pct, +): col_name = "int64_too" - bf_result = scalars_df_index[col_name].rank().to_pandas() - pd_result = scalars_pandas_df_index[col_name].rank().astype(pd.Float64Dtype()) + bf_result = ( + scalars_df_index[col_name] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .to_pandas() + ) + pd_result = ( + 
scalars_pandas_df_index[col_name] + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .astype(pd.Float64Dtype()) + ) pd.testing.assert_series_equal( bf_result, diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 4c9d1338f4..48f33c67fd 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -1042,6 +1042,10 @@ def rank( ascending (bool, default True): Whether or not the elements should be ranked in ascending order. + pct (bool, default False): + Whether or not to display the returned rankings in percentile + form. + Returns: bigframes.pandas.DataFrame or bigframes.pandas.Series: Return a Series or DataFrame with data ranks as values. diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index f0bc6348f8..b6b91388e3 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -428,6 +428,8 @@ def rank( * keep: leave NA values where they are. * top: smallest rank if ascending. * bottom: smallest rank if descending. + pct (bool, default False): + Compute percentage rank of data within each group. Returns: DataFrame with ranking of values within each group