Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ def rank(
ascending: bool = True,
grouping_cols: tuple[str, ...] = (),
columns: tuple[str, ...] = (),
pct: bool = False,
):
if method not in ["average", "min", "max", "first", "dense"]:
raise ValueError(
Expand Down Expand Up @@ -459,6 +460,12 @@ def rank(
),
skip_reproject_unsafe=(col != columns[-1]),
)
if pct:
block, max_id = block.apply_window_op(
rownum_id, agg_ops.max_op, windows.unbound(grouping_keys=grouping_cols)
)
block, rownum_id = block.project_expr(ops.div_op.as_expr(rownum_id, max_id))

rownum_col_ids.append(rownum_id)

# Step 2: Apply aggregate to groups of like input values.
Expand Down
7 changes: 6 additions & 1 deletion bigframes/core/groupby/dataframe_group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,11 @@ def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFr
return self._aggregate_all(agg_ops.median_op, numeric_only=True)

def rank(
self, method="average", ascending: bool = True, na_option: str = "keep"
self,
method="average",
ascending: bool = True,
na_option: str = "keep",
pct: bool = False,
) -> df.DataFrame:
return df.DataFrame(
block_ops.rank(
Expand All @@ -191,6 +195,7 @@ def rank(
ascending,
grouping_cols=tuple(self._by_col_ids),
columns=tuple(self._selected_cols),
pct=pct,
)
)

Expand Down
7 changes: 6 additions & 1 deletion bigframes/core/groupby/series_group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,11 @@ def mean(self, *args) -> series.Series:
return self._aggregate(agg_ops.mean_op)

def rank(
self, method="average", ascending: bool = True, na_option: str = "keep"
self,
method="average",
ascending: bool = True,
na_option: str = "keep",
pct: bool = False,
) -> series.Series:
return series.Series(
block_ops.rank(
Expand All @@ -110,6 +114,7 @@ def rank(
ascending,
grouping_cols=tuple(self._by_col_ids),
columns=(self._value_column,),
pct=pct,
)
)

Expand Down
5 changes: 4 additions & 1 deletion bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4990,9 +4990,12 @@ def rank(
numeric_only=False,
na_option: str = "keep",
ascending=True,
pct: bool = False,
) -> DataFrame:
df = self._drop_non_numeric() if numeric_only else self
return DataFrame(block_ops.rank(df._block, method, na_option, ascending))
return DataFrame(
block_ops.rank(df._block, method, na_option, ascending, pct=pct)
)

def first_valid_index(self):
return
Expand Down
5 changes: 4 additions & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -851,8 +851,11 @@ def rank(
numeric_only=False,
na_option: str = "keep",
ascending: bool = True,
pct: bool = False,
) -> Series:
return Series(block_ops.rank(self._block, method, na_option, ascending))
return Series(
block_ops.rank(self._block, method, na_option, ascending, pct=pct)
)

def fillna(self, value=None) -> Series:
return self._apply_binary_op(value, ops.fillna_op)
Expand Down
15 changes: 9 additions & 6 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5442,13 +5442,13 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):


@pytest.mark.parametrize(
("na_option", "method", "ascending", "numeric_only"),
("na_option", "method", "ascending", "numeric_only", "pct"),
[
("keep", "average", True, True),
("top", "min", False, False),
("bottom", "max", False, False),
("top", "first", False, False),
("bottom", "dense", False, False),
("keep", "average", True, True, True),
("top", "min", False, False, False),
("bottom", "max", False, False, True),
("top", "first", False, False, False),
("bottom", "dense", False, False, True),
],
)
def test_df_rank_with_nulls(
Expand All @@ -5458,6 +5458,7 @@ def test_df_rank_with_nulls(
method,
ascending,
numeric_only,
pct,
):
unsupported_columns = ["geography_col"]
bf_result = (
Expand All @@ -5467,6 +5468,7 @@ def test_df_rank_with_nulls(
method=method,
ascending=ascending,
numeric_only=numeric_only,
pct=pct,
)
.to_pandas()
)
Expand All @@ -5477,6 +5479,7 @@ def test_df_rank_with_nulls(
method=method,
ascending=ascending,
numeric_only=numeric_only,
pct=pct,
)
.astype(pd.Float64Dtype())
)
Expand Down
73 changes: 17 additions & 56 deletions tests/system/small/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,63 +96,36 @@ def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q


@pytest.mark.parametrize(
("na_option", "method", "ascending"),
("na_option", "method", "ascending", "pct"),
[
(
"keep",
"average",
True,
),
(
"top",
"min",
False,
),
(
"bottom",
"max",
False,
),
(
"top",
"first",
False,
),
(
"bottom",
"dense",
False,
),
("top", "min", False, False),
("bottom", "max", False, False),
("top", "first", False, True),
("bottom", "dense", False, True),
],
)
def test_dataframe_groupby_rank(
scalars_df_index,
scalars_pandas_df_index,
na_option,
method,
ascending,
scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct
):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
col_names = ["int64_too", "float64_col", "int64_col", "string_col"]
bf_result = (
scalars_df_index[col_names]
.groupby("string_col")
.rank(
na_option=na_option,
method=method,
ascending=ascending,
)
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
).to_pandas()
pd_result = (
(
scalars_pandas_df_index[col_names]
.groupby("string_col")
.rank(
na_option=na_option,
method=method,
ascending=ascending,
)
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
)
.astype("float64")
.astype("Float64")
Expand Down Expand Up @@ -737,63 +710,51 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):


@pytest.mark.parametrize(
("na_option", "method", "ascending"),
("na_option", "method", "ascending", "pct"),
[
(
"keep",
"average",
True,
),
("keep", "average", True, False),
(
"top",
"min",
False,
True,
),
(
"bottom",
"max",
False,
True,
),
(
"top",
"first",
False,
True,
),
(
"bottom",
"dense",
False,
False,
),
],
)
def test_series_groupby_rank(
scalars_df_index,
scalars_pandas_df_index,
na_option,
method,
ascending,
scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct
):
# TODO: supply a reason why this isn't compatible with pandas 1.x
pytest.importorskip("pandas", minversion="2.0.0")
col_names = ["int64_col", "string_col"]
bf_result = (
scalars_df_index[col_names]
.groupby("string_col")["int64_col"]
.rank(
na_option=na_option,
method=method,
ascending=ascending,
)
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
).to_pandas()
pd_result = (
(
scalars_pandas_df_index[col_names]
.groupby("string_col")["int64_col"]
.rank(
na_option=na_option,
method=method,
ascending=ascending,
)
.rank(na_option=na_option, method=method, ascending=ascending, pct=pct)
)
.astype("float64")
.astype("Float64")
Expand Down
44 changes: 41 additions & 3 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2704,10 +2704,48 @@ def test_series_nsmallest(scalars_df_index, scalars_pandas_df_index, keep):
)


def test_rank_ints(scalars_df_index, scalars_pandas_df_index):
@pytest.mark.parametrize(
("na_option", "method", "ascending", "numeric_only", "pct"),
[
("keep", "average", True, True, False),
("top", "min", False, False, True),
("bottom", "max", False, False, False),
("top", "first", False, False, True),
("bottom", "dense", False, False, False),
],
)
def test_series_rank(
scalars_df_index,
scalars_pandas_df_index,
na_option,
method,
ascending,
numeric_only,
pct,
):
col_name = "int64_too"
bf_result = scalars_df_index[col_name].rank().to_pandas()
pd_result = scalars_pandas_df_index[col_name].rank().astype(pd.Float64Dtype())
bf_result = (
scalars_df_index[col_name]
.rank(
na_option=na_option,
method=method,
ascending=ascending,
numeric_only=numeric_only,
pct=pct,
)
.to_pandas()
)
pd_result = (
scalars_pandas_df_index[col_name]
.rank(
na_option=na_option,
method=method,
ascending=ascending,
numeric_only=numeric_only,
pct=pct,
)
.astype(pd.Float64Dtype())
)

pd.testing.assert_series_equal(
bf_result,
Expand Down
4 changes: 4 additions & 0 deletions third_party/bigframes_vendored/pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,10 @@ def rank(
ascending (bool, default True):
Whether or not the elements should be ranked in ascending order.

pct (bool, default False):
Whether or not to display the returned rankings in percentile
form.

Returns:
bigframes.pandas.DataFrame or bigframes.pandas.Series:
Return a Series or DataFrame with data ranks as values.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,8 @@ def rank(
* keep: leave NA values where they are.
* top: smallest rank if ascending.
* bottom: smallest rank if descending.
pct (bool, default False):
Compute the percentage rank of data within each group.

Returns:
DataFrame with ranking of values within each group
Expand Down