diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index 6e99a7c774..30c7902981 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -17,6 +17,7 @@
 
 import pandas as pd
 
+import bigframes.constants as constants
 import bigframes.core as core
 import bigframes.core.blocks as blocks
 import bigframes.core.ordering as ordering
@@ -576,3 +577,69 @@ def align_columns(
     left_final = left_block.select_columns(left_column_ids)
     right_final = right_block.select_columns(right_column_ids)
     return left_final, right_final
+
+
+def idxmin(block: blocks.Block) -> blocks.Block:
+    """Return the index label of each value column's minimum."""
+    return _idx_extrema(block, "min")
+
+
+def idxmax(block: blocks.Block) -> blocks.Block:
+    """Return the index label of each value column's maximum."""
+    return _idx_extrema(block, "max")
+
+
+def _idx_extrema(
+    block: blocks.Block, min_or_max: typing.Literal["min", "max"]
+) -> blocks.Block:
+    """Locate the index label of the extreme value in each value column.
+
+    Args:
+        block: Block to search; must have exactly one index column.
+        min_or_max: ``"min"`` orders values ascending, anything else
+            orders them descending.
+
+    Returns:
+        The per-column results stacked into a single-column block.
+
+    Raises:
+        NotImplementedError: If the block does not have exactly one index
+            column.
+    """
+    if len(block.index_columns) != 1:
+        # TODO: Need support for tuple dtype
+        raise NotImplementedError(
+            f"idx{min_or_max} is not supported for multi-index. "
+            f"{constants.FEEDBACK_LINK}"
+        )
+
+    original_block = block
+    # Neither the sort direction nor the single index column depends on the
+    # value column, so compute them once instead of on every iteration.
+    direction = (
+        ordering.OrderingDirection.ASC
+        if min_or_max == "min"
+        else ordering.OrderingDirection.DESC
+    )
+    idx_col = original_block.index_columns[0]
+    result_cols = []
+    for value_col in original_block.value_columns:
+        # Order by this value column first, then by the index to break ties.
+        order_refs = [
+            ordering.OrderingColumnReference(value_col, direction),
+            ordering.OrderingColumnReference(idx_col),
+        ]
+        window_spec = core.WindowSpec(ordering=order_refs)
+        block, result_col = block.apply_window_op(
+            idx_col, agg_ops.first_op, window_spec
+        )
+        result_cols.append(result_col)
+
+    block = block.select_columns(result_cols).with_column_labels(
+        original_block.column_labels
+    )
+    # Stack the entire column axis to produce single-column result
+    # Assumption: uniform dtype for stackability
+    return block.aggregate_all_and_stack(
+        agg_ops.AnyValueOp(), dtype=block.dtypes[0]
+    ).with_column_labels([original_block.index.name])
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 345afbe6e6..eea8beb130 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1642,6 +1642,12 @@ def agg(
 
     aggregate = agg
 
+    def idxmin(self) -> bigframes.series.Series:
+        return bigframes.series.Series(block_ops.idxmin(self._block))
+
+    def idxmax(self) -> bigframes.series.Series:
+        return bigframes.series.Series(block_ops.idxmax(self._block))
+
     def describe(self) -> DataFrame:
         df_numeric = self._drop_non_numeric(keep_bool=False)
         if len(df_numeric.columns) == 0:
diff --git a/bigframes/series.py b/bigframes/series.py
index 84d737210e..8815a6abde 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -887,6 +887,34 @@ def argmin(self) -> int:
             scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
         )
 
+    def idxmax(self) -> blocks.Label:
+        block = self._block.order_by(
+            [
+                OrderingColumnReference(
+                    self._value_column, direction=OrderingDirection.DESC
+                ),
+                *[
+                    OrderingColumnReference(idx_col)
+                    for idx_col in self._block.index_columns
+                ],
+            ]
+        )
+        block = block.slice(0, 1)
+        return indexes.Index._from_block(block).to_pandas()[0]
+
+    def idxmin(self) -> blocks.Label:
+        block = self._block.order_by(
+            [
+                OrderingColumnReference(self._value_column),
+                *[
+                    OrderingColumnReference(idx_col)
+                    for idx_col in self._block.index_columns
+                ],
+            ]
+        )
+        block = block.slice(0, 1)
+        return indexes.Index._from_block(block).to_pandas()[0]
+
     @property
     def is_monotonic_increasing(self) -> bool:
         return typing.cast(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 2880932d10..f3e7a0c0e9 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -1292,6 +1292,34 @@ def test_df_update(overwrite, filter_func):
     pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1)
 
 
+def test_df_idxmin():
+    pd_df = pd.DataFrame(
+        {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"]
+    )
+    bf_df = dataframe.DataFrame(pd_df)
+
+    bf_result = bf_df.idxmin().to_pandas()
+    pd_result = pd_df.idxmin()
+
+    pd.testing.assert_series_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_idxmax():
+    pd_df = pd.DataFrame(
+        {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"]
+    )
+    bf_df = dataframe.DataFrame(pd_df)
+
+    bf_result = bf_df.idxmax().to_pandas()
+    pd_result = pd_df.idxmax()
+
+    pd.testing.assert_series_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
+
+
 @pytest.mark.parametrize(
     ("join", "axis"),
     [
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 10f2a74b21..19f1c557ef 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -41,6 +41,17 @@ def test_reset_multi_index(scalars_df_index, scalars_pandas_df_index):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_series_multi_index_idxmin(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.set_index(["bool_col", "int64_too"])[
+        "float64_col"
+    ].idxmin()
+    pd_result = scalars_pandas_df_index.set_index(["bool_col", "int64_too"])[
+        "float64_col"
+    ].idxmin()
+
+    assert bf_result == pd_result
+
+
 def test_binop_series_series_matching_multi_indices(
     scalars_df_index, scalars_pandas_df_index
 ):
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 588dcc2c83..645638fb3a 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2468,6 +2468,18 @@ def test_argmax(scalars_df_index, scalars_pandas_df_index):
     assert bf_result == pd_result
 
 
+def test_series_idxmin(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.string_col.idxmin()
+    pd_result = scalars_pandas_df_index.string_col.idxmin()
+    assert bf_result == pd_result
+
+
+def test_series_idxmax(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.int64_too.idxmax()
+    pd_result = scalars_pandas_df_index.int64_too.idxmax()
+    assert bf_result == pd_result
+
+
 def test_getattr_attribute_error_when_pandas_has(scalars_df_index):
     # asof is implemented in pandas but not in bigframes
     with pytest.raises(AttributeError):
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 11aaf84b14..17d941fbdd 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1805,6 +1805,28 @@ def nsmallest(self, n: int, columns, keep: str = "first"):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def idxmin(self):
+        """
+        Return index of first occurrence of minimum over requested axis.
+
+        NA/null values are excluded.
+
+        Returns:
+            Series: Indexes of minima along the specified axis.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def idxmax(self):
+        """
+        Return index of first occurrence of maximum over requested axis.
+
+        NA/null values are excluded.
+
+        Returns:
+            Series: Indexes of maxima along the specified axis.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def nunique(self):
         """
         Count number of distinct elements in specified axis.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index d58c1ccc3b..a41a3454ca 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -8,7 +8,6 @@
 import numpy as np
 from pandas._libs import lib
 from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer
-import pandas.io.formats.format as fmt
 
 from bigframes import constants
 from third_party.bigframes_vendored.pandas.core.generic import NDFrame
@@ -151,21 +150,6 @@ def to_string(
             str or None: String representation of Series if ``buf=None``,
                 otherwise None.
         """
-        formatter = fmt.SeriesFormatter(
-            self,
-            name=name,
-            length=length,
-            header=header,
-            index=index,
-            dtype=dtype,
-            na_rep=na_rep,
-            float_format=float_format,
-            min_rows=min_rows,
-            max_rows=max_rows,
-        )
-        result = formatter.to_string()
-
-        # catch contract violations
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
     def to_markdown(
@@ -475,6 +459,30 @@ def duplicated(self, keep="first") -> Series:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def idxmin(self) -> Hashable:
+        """
+        Return the row label of the minimum value.
+
+        If multiple values equal the minimum, the first row label with that
+        value is returned.
+
+        Returns:
+            Index: Label of the minimum value.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def idxmax(self) -> Hashable:
+        """
+        Return the row label of the maximum value.
+
+        If multiple values equal the maximum, the first row label with that
+        value is returned.
+
+        Returns:
+            Index: Label of the maximum value.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def round(self, decimals: int = 0) -> Series:
         """
         Round each value in a Series to the given number of decimals.