From cc7d3c9200f8126ca27fe0e012d2932b03867322 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 20 Jan 2024 00:58:54 +0000 Subject: [PATCH 1/8] test ver. --- bigframes/core/reshape/__init__.py | 11 +++++--- bigframes/operations/aggregations.py | 42 ++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index cadd8e5145..f1f61ae1bf 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -129,12 +129,15 @@ def cut( if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") - if labels is not False: - raise NotImplementedError( - f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + if labels is not None and not isinstance(labels, bool): + raise ValueError( + "The 'labels' parameter must be either a boolean value or None. " + "Please provide a valid value for 'labels'." ) - return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) + return x._apply_window_op( + agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec() + ) def qcut( diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 452abf047c..87418247e5 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -19,8 +19,10 @@ import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -from pandas import Int64Dtype +import numpy as np +from pandas import Float64Dtype, Int64Dtype import pandas as pd +import pandas.core.algorithms as algos import bigframes.constants as constants import bigframes.dtypes as dtypes @@ -229,7 +231,7 @@ def skips_nulls(self): class CutOp(WindowOp): - def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): + def __init__(self, bins: typing.Union[int, pd.IntervalIndex], labels=None): if isinstance(bins, int): if not bins > 0: raise 
ValueError("`bins` should be a positive integer.") @@ -239,6 +241,8 @@ def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): self._bins_int = 0 self._bins = bins + self._labels = labels + def _as_ibis(self, x: ibis_types.Column, window=None): out = ibis.case() @@ -247,12 +251,34 @@ def _as_ibis(self, x: ibis_types.Column, window=None): col_max = _apply_window_if_present(x.max(), window) bin_width = (col_max - col_min) / self._bins - for this_bin in range(self._bins_int - 1): - out = out.when( - x <= (col_min + (this_bin + 1) * bin_width), - dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), - ) - out = out.when(x.notnull(), self._bins - 1) + if self._labels == False: + for this_bin in range(self._bins_int - 1): + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + dtypes.literal_to_ibis_scalar( + this_bin, force_dtype=Int64Dtype() + ), + ) + out = out.when(x.notnull(), self._bins - 1) + else: + interval_struct = None + for this_bin in range(self._bins_int): + left_edge = col_min + this_bin * bin_width + right_edge = col_min + (this_bin + 1) * bin_width + interval_struct = ibis.struct( + { + "left_exclusive": left_edge, + "right_inclusive": right_edge, + } + ) + + if this_bin < self._bins_int - 1: + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + interval_struct, + ) + else: + out = out.when(x.notnull(), interval_struct) else: for interval in self._bins: condition = (x > interval.left) & (x <= interval.right) From 5775d32024da66edd8c3b842046aad0a8c43c8da Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 20 Jan 2024 01:15:05 +0000 Subject: [PATCH 2/8] add test and adjustment --- bigframes/operations/aggregations.py | 5 ++++- tests/system/small/test_pandas.py | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 87418247e5..26b02a6e15 100644 --- a/bigframes/operations/aggregations.py +++ 
b/bigframes/operations/aggregations.py @@ -262,8 +262,11 @@ def _as_ibis(self, x: ibis_types.Column, window=None): out = out.when(x.notnull(), self._bins - 1) else: interval_struct = None + adj = (col_max - col_min) * 0.001 for this_bin in range(self._bins_int): - left_edge = col_min + this_bin * bin_width + left_edge = ( + col_min + this_bin * bin_width - (0 if this_bin > 0 else adj) + ) right_edge = col_min + (this_bin + 1) * bin_width interval_struct = ibis.struct( { diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a79ddb64cd..3103f45ae6 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -397,6 +397,29 @@ def test_cut(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +def test_cut_default_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5) + bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas() + + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + + @pytest.mark.parametrize( ("bins",), [ From 40b8b27c99fee694edda735d8de4c68cc0acb171 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 20 Jan 2024 01:24:45 +0000 Subject: [PATCH 3/8] update test and docstring. 
--- tests/system/small/test_pandas.py | 2 +- third_party/bigframes_vendored/pandas/core/reshape/tile.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 3103f45ae6..0910c0b7e2 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -403,6 +403,7 @@ def test_cut_default_labels(scalars_dfs): pd_result = pd.cut(scalars_pandas_df["float64_col"], 5) bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas() + # Convert to match data format pd_result_converted = pd.Series( [ {"left_exclusive": interval.left, "right_inclusive": interval.right} @@ -447,7 +448,6 @@ def test_cut_with_interval(scalars_dfs, bins): ], name=pd_result.name, ) - pd_result.index = pd_result.index.astype("Int64") pd.testing.assert_series_equal( bf_result, pd_result_converted, check_index=False, check_dtype=False diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 55975c3fc1..8d54924217 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -82,9 +82,7 @@ def cut( labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the - bins. This affects the type of the output container (see below). - If True, raises an error. When `ordered=False`, labels must be - provided. + bins. This affects the type of the output container. Returns: Series: A Series representing the respective bin for each value From d2846e7651528b353a89333faa76d97f92da9cf9 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 20 Jan 2024 01:28:35 +0000 Subject: [PATCH 4/8] remove unused import. 
--- bigframes/operations/aggregations.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 26b02a6e15..f6db7d3f8d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -19,10 +19,8 @@ import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -import numpy as np -from pandas import Float64Dtype, Int64Dtype +from pandas import Int64Dtype import pandas as pd -import pandas.core.algorithms as algos import bigframes.constants as constants import bigframes.dtypes as dtypes From 0d966af01cce0fffba73f7c70292b88272236394 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 20 Jan 2024 01:43:52 +0000 Subject: [PATCH 5/8] update code examples. --- .../bigframes_vendored/pandas/core/reshape/tile.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 8d54924217..7f5e26dff7 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -38,6 +38,15 @@ def cut( Cut with an integer (equal-width bins): + >>> bpd.cut(s, bins=4) + 0 {'left_exclusive': -0.01, 'right_inclusive': 2.5} + 1 {'left_exclusive': -0.01, 'right_inclusive': 2.5} + 2 {'left_exclusive': 2.5, 'right_inclusive': 5.0} + 3 {'left_exclusive': 7.5, 'right_inclusive': 10.0} + dtype: struct[pyarrow] + + Cut with an integer (equal-width bins) and labels=False: + >>> bpd.cut(s, bins=4, labels=False) 0 0 1 0 @@ -50,7 +59,7 @@ def cut( >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) - >>> bpd.cut(s, bins=interval_index, labels=False) + >>> bpd.cut(s, bins=interval_index) 0 1 {'left_exclusive': 0, 'right_inclusive': 1} 2 {'left_exclusive': 1, 'right_inclusive': 5} @@ -60,7 
+69,7 @@ def cut( Cut with an iterable of tuples: >>> bins_tuples = [(0, 1), (1, 4), (5, 20)] - >>> bpd.cut(s, bins=bins_tuples, labels=False) + >>> bpd.cut(s, bins=bins_tuples) 0 1 {'left_exclusive': 0, 'right_inclusive': 1} 2 From 84ff982c450988d80bb9e22033ac68ce8025a7ed Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 20 Jan 2024 01:48:48 +0000 Subject: [PATCH 6/8] Code formatted. --- third_party/bigframes_vendored/pandas/core/reshape/tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 7f5e26dff7..fbd1d2d052 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -44,7 +44,7 @@ def cut( 2 {'left_exclusive': 2.5, 'right_inclusive': 5.0} 3 {'left_exclusive': 7.5, 'right_inclusive': 10.0} dtype: struct[pyarrow] - + Cut with an integer (equal-width bins) and labels=False: >>> bpd.cut(s, bins=4, labels=False) From 1cac47a5bbe9673e251b6f6431616b1bda4a6e1e Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Sat, 20 Jan 2024 01:56:33 +0000 Subject: [PATCH 7/8] Update error and unittest. --- bigframes/core/reshape/__init__.py | 2 +- bigframes/operations/aggregations.py | 2 +- tests/unit/test_pandas.py | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index f1f61ae1bf..4400988d97 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -130,7 +130,7 @@ def cut( raise ValueError("Overlapping IntervalIndex is not accepted.") if labels is not None and not isinstance(labels, bool): - raise ValueError( + raise NotImplementedError( "The 'labels' parameter must be either a boolean value or None. " "Please provide a valid value for 'labels'." 
) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index f6db7d3f8d..cc2e79c100 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -249,7 +249,7 @@ def _as_ibis(self, x: ibis_types.Column, window=None): col_max = _apply_window_if_present(x.max(), window) bin_width = (col_max - col_min) / self._bins - if self._labels == False: + if self._labels is False: for this_bin in range(self._bins_int - 1): out = out.when( x <= (col_min + (this_bin + 1) * bin_width), diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 4835a24dc7..0f001104eb 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -85,7 +85,10 @@ def test_method_matches_session(method_name: str): def test_cut_raises_with_labels(): - with pytest.raises(NotImplementedError, match="Only labels=False"): + with pytest.raises( + NotImplementedError, + match="The 'labels' parameter must be either a boolean value or None.", + ): mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) From 42324c6603baadf0f9d54d38caa848393f4affa8 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 23 Jan 2024 20:23:26 +0000 Subject: [PATCH 8/8] Update labels selections. --- bigframes/core/reshape/__init__.py | 4 ++-- tests/unit/test_pandas.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 4400988d97..4a3bb16a39 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -129,9 +129,9 @@ def cut( if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") - if labels is not None and not isinstance(labels, bool): + if labels is not None and labels is not False: raise NotImplementedError( - "The 'labels' parameter must be either a boolean value or None. 
" + "The 'labels' parameter must be either False or None. " "Please provide a valid value for 'labels'." ) diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 0f001104eb..d6af223456 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -87,7 +87,7 @@ def test_method_matches_session(method_name: str): def test_cut_raises_with_labels(): with pytest.raises( NotImplementedError, - match="The 'labels' parameter must be either a boolean value or None.", + match="The 'labels' parameter must be either False or None.", ): mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"])