googleapis · Genesis929 · Jan 23, 2024 · Jan 20, 2024 · Jan 20, 2024 · Jan 20, 2024
@@ -129,12 +129,15 @@ def cut(
         if bins.is_overlapping:
             raise ValueError("Overlapping IntervalIndex is not accepted.")
 
-    if labels is not False:
+    if labels is not None and labels is not False:
         raise NotImplementedError(
-            f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
+            "The 'labels' parameter must be either False or None. "
+            "Please provide a valid value for 'labels'."
         )
 
-    return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec())
+    return x._apply_window_op(
+        agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec()
+    )
 
 
 def qcut(

@@ -229,7 +229,7 @@ def skips_nulls(self):
 
 
 class CutOp(WindowOp):
-    def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
+    def __init__(self, bins: typing.Union[int, pd.IntervalIndex], labels=None):
         if isinstance(bins, int):
             if not bins > 0:
                 raise ValueError("`bins` should be a positive integer.")
@@ -239,6 +239,8 @@ def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
             self._bins_int = 0
             self._bins = bins
 
+        self._labels = labels
+
     def _as_ibis(self, x: ibis_types.Column, window=None):
         out = ibis.case()
 
@@ -247,12 +249,37 @@ def _as_ibis(self, x: ibis_types.Column, window=None):
             col_max = _apply_window_if_present(x.max(), window)
             bin_width = (col_max - col_min) / self._bins
 
-            for this_bin in range(self._bins_int - 1):
-                out = out.when(
-                    x <= (col_min + (this_bin + 1) * bin_width),
-                    dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()),
-                )
-            out = out.when(x.notnull(), self._bins - 1)
+            if self._labels is False:
+                for this_bin in range(self._bins_int - 1):
+                    out = out.when(
+                        x <= (col_min + (this_bin + 1) * bin_width),
+                        dtypes.literal_to_ibis_scalar(
+                            this_bin, force_dtype=Int64Dtype()
+                        ),
+                    )
+                out = out.when(x.notnull(), self._bins - 1)
+            else:
+                interval_struct = None
+                adj = (col_max - col_min) * 0.001
+                for this_bin in range(self._bins_int):
+                    left_edge = (
+                        col_min + this_bin * bin_width - (0 if this_bin > 0 else adj)
+                    )
+                    right_edge = col_min + (this_bin + 1) * bin_width
+                    interval_struct = ibis.struct(
+                        {
+                            "left_exclusive": left_edge,
+                            "right_inclusive": right_edge,
+                        }
+                    )
+
+                    if this_bin < self._bins_int - 1:
+                        out = out.when(
+                            x <= (col_min + (this_bin + 1) * bin_width),
+                            interval_struct,
+                        )
+                    else:
+                        out = out.when(x.notnull(), interval_struct)
         else:
             for interval in self._bins:
                 condition = (x > interval.left) & (x <= interval.right)

@@ -397,6 +397,30 @@ def test_cut(scalars_dfs):
     pd.testing.assert_series_equal(bf_result, pd_result)
 
 
+def test_cut_default_labels(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], 5)
+    bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas()
+
+    # Convert to match data format
+    pd_result_converted = pd.Series(
+        [
+            {"left_exclusive": interval.left, "right_inclusive": interval.right}
+            if pd.notna(val)
+            else pd.NA
+            for val, interval in zip(
+                pd_result, pd_result.cat.categories[pd_result.cat.codes]
+            )
+        ],
+        name=pd_result.name,
+    )
+
+    pd.testing.assert_series_equal(
+        bf_result, pd_result_converted, check_index=False, check_dtype=False
+    )
+
+
 @pytest.mark.parametrize(
     ("bins",),
     [
@@ -424,7 +448,6 @@ def test_cut_with_interval(scalars_dfs, bins):
         ],
         name=pd_result.name,
     )
-    pd_result.index = pd_result.index.astype("Int64")
 
     pd.testing.assert_series_equal(
         bf_result, pd_result_converted, check_index=False, check_dtype=False

@@ -85,7 +85,10 @@ def test_method_matches_session(method_name: str):
 
 
 def test_cut_raises_with_labels():
-    with pytest.raises(NotImplementedError, match="Only labels=False"):
+    with pytest.raises(
+        NotImplementedError,
+        match="The 'labels' parameter must be either False or None.",
+    ):
         mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
         bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"])
 

@@ -38,6 +38,15 @@ def cut(
 
     Cut with an integer (equal-width bins):
 
+        >>> bpd.cut(s, bins=4)
+        0    {'left_exclusive': -0.01, 'right_inclusive': 2.5}
+        1    {'left_exclusive': -0.01, 'right_inclusive': 2.5}
+        2      {'left_exclusive': 2.5, 'right_inclusive': 5.0}
+        3     {'left_exclusive': 7.5, 'right_inclusive': 10.0}
+        dtype: struct<left_exclusive: double, right_inclusive: double>[pyarrow]
+
+    Cut with an integer (equal-width bins) and labels=False:
+
         >>> bpd.cut(s, bins=4, labels=False)
         0    0
         1    0
@@ -50,7 +59,7 @@ def cut(
         >>> import pandas as pd
 
         >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
-        >>> bpd.cut(s, bins=interval_index, labels=False)
+        >>> bpd.cut(s, bins=interval_index)
         0                                            <NA>
         1     {'left_exclusive': 0, 'right_inclusive': 1}
         2     {'left_exclusive': 1, 'right_inclusive': 5}
@@ -60,7 +69,7 @@ def cut(
     Cut with an iterable of tuples:
 
         >>> bins_tuples = [(0, 1), (1, 4), (5, 20)]
-        >>> bpd.cut(s, bins=bins_tuples, labels=False)
+        >>> bpd.cut(s, bins=bins_tuples)
         0                                            <NA>
         1     {'left_exclusive': 0, 'right_inclusive': 1}
         2                                            <NA>
@@ -82,9 +91,7 @@ def cut(
         labels (None):
             Specifies the labels for the returned bins. Must be the same length as
             the resulting bins. If False, returns only integer indicators of the
-            bins. This affects the type of the output container (see below).
-            If True, raises an error. When `ordered=False`, labels must be
-            provided.
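+            bins. This affects the type of the output container. Only
+            ``None`` and ``False`` are currently supported; any other value
+            raises ``NotImplementedError``.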
 
     Returns:
         Series: A Series representing the respective bin for each value