Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
9 changes: 6 additions & 3 deletions bigframes/core/reshape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,15 @@ def cut(
if bins.is_overlapping:
raise ValueError("Overlapping IntervalIndex is not accepted.")

if labels is not False:
if labels is not None and labels is not False:
raise NotImplementedError(
f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
"The 'labels' parameter must be either False or None. "
"Please provide a valid value for 'labels'."
)

return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec())
return x._apply_window_op(
agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec()
)


def qcut(
Expand Down
41 changes: 34 additions & 7 deletions bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def skips_nulls(self):


class CutOp(WindowOp):
def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
def __init__(self, bins: typing.Union[int, pd.IntervalIndex], labels=None):
if isinstance(bins, int):
if not bins > 0:
raise ValueError("`bins` should be a positive integer.")
Expand All @@ -239,6 +239,8 @@ def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
self._bins_int = 0
self._bins = bins

self._labels = labels

def _as_ibis(self, x: ibis_types.Column, window=None):
out = ibis.case()

Expand All @@ -247,12 +249,37 @@ def _as_ibis(self, x: ibis_types.Column, window=None):
col_max = _apply_window_if_present(x.max(), window)
bin_width = (col_max - col_min) / self._bins

for this_bin in range(self._bins_int - 1):
out = out.when(
x <= (col_min + (this_bin + 1) * bin_width),
dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()),
)
out = out.when(x.notnull(), self._bins - 1)
if self._labels is False:
for this_bin in range(self._bins_int - 1):
out = out.when(
x <= (col_min + (this_bin + 1) * bin_width),
dtypes.literal_to_ibis_scalar(
this_bin, force_dtype=Int64Dtype()
),
)
out = out.when(x.notnull(), self._bins - 1)
else:
interval_struct = None
adj = (col_max - col_min) * 0.001
for this_bin in range(self._bins_int):
left_edge = (
col_min + this_bin * bin_width - (0 if this_bin > 0 else adj)
)
right_edge = col_min + (this_bin + 1) * bin_width
interval_struct = ibis.struct(
{
"left_exclusive": left_edge,
"right_inclusive": right_edge,
}
)

if this_bin < self._bins_int - 1:
out = out.when(
x <= (col_min + (this_bin + 1) * bin_width),
interval_struct,
)
else:
out = out.when(x.notnull(), interval_struct)
else:
for interval in self._bins:
condition = (x > interval.left) & (x <= interval.right)
Expand Down
25 changes: 24 additions & 1 deletion tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,30 @@ def test_cut(scalars_dfs):
pd.testing.assert_series_equal(bf_result, pd_result)


def test_cut_default_labels(scalars_dfs):
    """Compare ``bpd.cut`` with default labels against ``pd.cut``.

    Both libraries bin ``float64_col`` into 5 equal-width bins. The pandas
    result is rewritten as one ``{"left_exclusive", "right_inclusive"}`` dict
    per row (or ``pd.NA`` for missing values) before the two series are
    compared, ignoring index and dtype differences.
    """
    bf_df, pandas_df = scalars_dfs

    expected = pd.cut(pandas_df["float64_col"], 5)
    actual = bpd.cut(bf_df["float64_col"], 5).to_pandas()

    # Look up the interval assigned to every row through its category code.
    # Rows that pandas left unbinned are filtered by the notna check below,
    # so whatever interval the lookup yields for them is never used.
    row_intervals = expected.cat.categories[expected.cat.codes]

    # Convert to match data format
    converted = []
    for value, interval in zip(expected, row_intervals):
        if pd.notna(value):
            converted.append(
                {
                    "left_exclusive": interval.left,
                    "right_inclusive": interval.right,
                }
            )
        else:
            converted.append(pd.NA)
    expected_converted = pd.Series(converted, name=expected.name)

    pd.testing.assert_series_equal(
        actual, expected_converted, check_index=False, check_dtype=False
    )


@pytest.mark.parametrize(
("bins",),
[
Expand Down Expand Up @@ -424,7 +448,6 @@ def test_cut_with_interval(scalars_dfs, bins):
],
name=pd_result.name,
)
pd_result.index = pd_result.index.astype("Int64")

pd.testing.assert_series_equal(
bf_result, pd_result_converted, check_index=False, check_dtype=False
Expand Down
5 changes: 4 additions & 1 deletion tests/unit/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@ def test_method_matches_session(method_name: str):


def test_cut_raises_with_labels():
with pytest.raises(NotImplementedError, match="Only labels=False"):
with pytest.raises(
NotImplementedError,
match="The 'labels' parameter must be either False or None.",
):
mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"])

Expand Down
17 changes: 12 additions & 5 deletions third_party/bigframes_vendored/pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ def cut(

Cut with an integer (equal-width bins):

>>> bpd.cut(s, bins=4)
0 {'left_exclusive': -0.01, 'right_inclusive': 2.5}
1 {'left_exclusive': -0.01, 'right_inclusive': 2.5}
2 {'left_exclusive': 2.5, 'right_inclusive': 5.0}
3 {'left_exclusive': 7.5, 'right_inclusive': 10.0}
dtype: struct<left_exclusive: double, right_inclusive: double>[pyarrow]

Cut with an integer (equal-width bins) and labels=False:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems labels=True is also acceptable. Can we include a code sample for that?

Copy link
Collaborator Author

@Genesis929 Genesis929 Jan 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally added this to avoid the confusing behavior in pandas where labels can be False but cannot be True, so I made labels=True work the same as labels=None. On reflection, that appears to be even more confusing. I have changed the code back, so labels can no longer be True, matching pandas.


>>> bpd.cut(s, bins=4, labels=False)
0 0
1 0
Expand All @@ -50,7 +59,7 @@ def cut(
>>> import pandas as pd

>>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
>>> bpd.cut(s, bins=interval_index, labels=False)
>>> bpd.cut(s, bins=interval_index)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 {'left_exclusive': 1, 'right_inclusive': 5}
Expand All @@ -60,7 +69,7 @@ def cut(
Cut with an iterable of tuples:

>>> bins_tuples = [(0, 1), (1, 4), (5, 20)]
>>> bpd.cut(s, bins=bins_tuples, labels=False)
>>> bpd.cut(s, bins=bins_tuples)
0 <NA>
1 {'left_exclusive': 0, 'right_inclusive': 1}
2 <NA>
Expand All @@ -82,9 +91,7 @@ def cut(
labels (None):
Specifies the labels for the returned bins. Must be the same length as
the resulting bins. If False, returns only integer indicators of the
bins. This affects the type of the output container (see below).
If True, raises an error. When `ordered=False`, labels must be
provided.
bins. This affects the type of the output container.

Returns:
Series: A Series representing the respective bin for each value
Expand Down