diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c4127c5fd5..b53c2212c1 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -822,22 +822,54 @@ def filter(self, column_id: str, keep_null: bool = False): index_labels=self.index.names, ) - def aggregate_all_and_pivot( + def aggregate_all_and_stack( self, operation: agg_ops.AggregateOp, *, + axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, dtype=pd.Float64Dtype(), ) -> Block: - aggregations = [(col_id, operation, col_id) for col_id in self.value_columns] - result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( - row_labels=self.column_labels.to_list(), - index_col_id="index", - unpivot_columns=[(value_col_id, self.value_columns)], - dtype=dtype, - ) - return Block(result_expr, index_columns=["index"], column_labels=[None]) + axis_n = utils.get_axis_number(axis) + if axis_n == 0: + aggregations = [ + (col_id, operation, col_id) for col_id in self.value_columns + ] + result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( + row_labels=self.column_labels.to_list(), + index_col_id="index", + unpivot_columns=[(value_col_id, self.value_columns)], + dtype=dtype, + ) + return Block(result_expr, index_columns=["index"], column_labels=[None]) + else: # axis_n == 1 + # using offsets as identity to group on. 
+ # TODO: Allow to promote identity/total_order columns instead for better perf + expr_with_offsets, offset_col = self.expr.promote_offsets() + stacked_expr = expr_with_offsets.unpivot( + row_labels=self.column_labels.to_list(), + index_col_id=guid.generate_guid(), + unpivot_columns=[(value_col_id, self.value_columns)], + passthrough_columns=[*self.index_columns, offset_col], + dtype=dtype, + ) + index_aggregations = [ + (col_id, agg_ops.AnyValueOp(), col_id) + for col_id in [*self.index_columns] + ] + main_aggregation = (value_col_id, operation, value_col_id) + result_expr = stacked_expr.aggregate( + [*index_aggregations, main_aggregation], + by_column_ids=[offset_col], + dropna=dropna, + ) + return Block( + result_expr.drop_columns([offset_col]), + self.index_columns, + column_labels=[None], + index_labels=self.index_labels, + ) def select_column(self, id: str) -> Block: return self.select_columns([id]) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0d357e7c3d..e4e22e0306 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1462,41 +1462,48 @@ def dropna( def any( self, *, + axis: typing.Union[str, int] = 0, bool_only: bool = False, ) -> bigframes.series.Series: if not bool_only: frame = self._raise_on_non_boolean("any") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_pivot( - agg_ops.any_op, dtype=pandas.BooleanDtype() + block = frame._block.aggregate_all_and_stack( + agg_ops.any_op, dtype=pandas.BooleanDtype(), axis=axis ) return bigframes.series.Series(block.select_column("values")) - def all(self, *, bool_only: bool = False) -> bigframes.series.Series: + def all( + self, axis: typing.Union[str, int] = 0, *, bool_only: bool = False + ) -> bigframes.series.Series: if not bool_only: frame = self._raise_on_non_boolean("all") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_pivot( - agg_ops.all_op, dtype=pandas.BooleanDtype() + block = frame._block.aggregate_all_and_stack( 
+ agg_ops.all_op, dtype=pandas.BooleanDtype(), axis=axis ) return bigframes.series.Series(block.select_column("values")) - def sum(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def sum( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("sum") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.sum_op) + block = frame._block.aggregate_all_and_stack(agg_ops.sum_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def mean(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def mean( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("mean") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.mean_op) + block = frame._block.aggregate_all_and_stack(agg_ops.mean_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def median( @@ -1510,47 +1517,57 @@ def median( frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.median_op) + block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) - def std(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def std( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("std") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.std_op) + block = frame._block.aggregate_all_and_stack(agg_ops.std_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def var(self, *, numeric_only: bool = False) -> bigframes.series.Series: 
+ def var( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("var") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.var_op) + block = frame._block.aggregate_all_and_stack(agg_ops.var_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def min(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def min( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("min") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.min_op) + block = frame._block.aggregate_all_and_stack(agg_ops.min_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def max(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def max( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("max") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.max_op) + block = frame._block.aggregate_all_and_stack(agg_ops.max_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def prod(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def prod( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("prod") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.product_op) + block = frame._block.aggregate_all_and_stack(agg_ops.product_op, axis=axis) return bigframes.series.Series(block.select_column("values")) product = prod @@ -1560,11 +1577,11 @@ def count(self, *, numeric_only: bool = False) -> 
bigframes.series.Series: frame = self else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.count_op) + block = frame._block.aggregate_all_and_stack(agg_ops.count_op) return bigframes.series.Series(block.select_column("values")) def nunique(self) -> bigframes.series.Series: - block = self._block.aggregate_all_and_pivot(agg_ops.nunique_op) + block = self._block.aggregate_all_and_stack(agg_ops.nunique_op) return bigframes.series.Series(block.select_column("values")) def agg( @@ -1587,7 +1604,7 @@ def agg( ) else: return bigframes.series.Series( - self._block.aggregate_all_and_pivot( + self._block.aggregate_all_and_stack( agg_ops.lookup_agg_func(typing.cast(str, func)) ) ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b6ca958c03..adf17848ee 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1999,6 +1999,29 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op): pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.sum(axis=1, numeric_only=True)), + (lambda x: x.mean(axis=1, numeric_only=True)), + (lambda x: x.min(axis=1, numeric_only=True)), + (lambda x: x.max(axis=1, numeric_only=True)), + (lambda x: x.std(axis=1, numeric_only=True)), + (lambda x: x.var(axis=1, numeric_only=True)), + ], + ids=["sum", "mean", "min", "max", "std", "var"], +) +def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): + col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] + bf_result = op(scalars_df_index[col_names]).to_pandas() + pd_result = op(scalars_pandas_df_index[col_names]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_result, bf_result, 
check_index_type=False) + + def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] bf_result = scalars_df_index[col_names].median(numeric_only=True).to_pandas() @@ -2019,11 +2042,16 @@ def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): [ (lambda x: x.all(bool_only=True)), (lambda x: x.any(bool_only=True)), + (lambda x: x.all(axis=1, bool_only=True)), + (lambda x: x.any(axis=1, bool_only=True)), ], - ids=["all", "any"], + ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], ) def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later + scalars_df_index = scalars_df_index.assign( + bool_col=scalars_df_index.bool_col.fillna(False) + ) scalars_pandas_df_index = scalars_pandas_df_index.assign( bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 9d26938e08..6ce11cd7e9 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -11,7 +11,7 @@ """ from __future__ import annotations -from typing import Iterable, Literal, Mapping, Optional, Sequence, Union +from typing import Literal, Mapping, Optional, Sequence, Union import numpy as np @@ -1457,7 +1457,7 @@ def apply(self, func, *, args=(), **kwargs): # ---------------------------------------------------------------------- # ndarray-like stats methods - def any(self, *, bool_only: bool = False): + def any(self, *, axis=0, bool_only: bool = False): """ Return whether any element is True, potentially over an axis. @@ -1466,6 +1466,9 @@ def any(self, *, bool_only: bool = False): non-empty). Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. 
+ For Series this parameter is unused and defaults to 0. bool_only (bool. default False): Include only boolean columns. @@ -1474,7 +1477,7 @@ def any(self, *, bool_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def all(self, *, bool_only: bool = False): + def all(self, axis=0, *, bool_only: bool = False): """ Return whether all elements are True, potentially over an axis. @@ -1483,6 +1486,9 @@ def all(self, *, bool_only: bool = False): empty). Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. bool_only (bool. default False): Include only boolean columns. @@ -1491,11 +1497,14 @@ def all(self, *, bool_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def prod(self, *, numeric_only: bool = False): + def prod(self, axis=0, *, numeric_only: bool = False): """ Return the product of the values over the requested axis. Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. numeric_only (bool. default False): Include only float, int, boolean columns. @@ -1504,13 +1513,16 @@ def prod(self, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def min(self, *, numeric_only: bool = False): + def min(self, axis=0, *, numeric_only: bool = False): """Return the minimum of the values over the requested axis. If you want the *index* of the minimum, use ``idxmin``. This is the equivalent of the ``numpy.ndarray`` method ``argmin``. Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. numeric_only (bool, default False): Default False. Include only float, int, boolean columns. 
@@ -1519,13 +1531,16 @@ def min(self, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def max(self, *, numeric_only: bool = False): + def max(self, axis=0, *, numeric_only: bool = False): """Return the maximum of the values over the requested axis. If you want the *index* of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``. Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. numeric_only (bool. default False): Default False. Include only float, int, boolean columns. @@ -1534,12 +1549,15 @@ def max(self, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def sum(self, *, numeric_only: bool = False): + def sum(self, axis=0, *, numeric_only: bool = False): """Return the sum of the values over the requested axis. This is equivalent to the method ``numpy.sum``. Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. numeric_only (bool. default False): Default False. Include only float, int, boolean columns. @@ -1548,10 +1566,13 @@ def sum(self, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def mean(self, *, numeric_only: bool = False): + def mean(self, axis=0, *, numeric_only: bool = False): """Return the mean of the values over the requested axis. Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. numeric_only (bool. default False): Default False. Include only float, int, boolean columns. 
@@ -1575,12 +1596,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def var(self, *, numeric_only: bool = False): + def var(self, axis=0, *, numeric_only: bool = False): """Return unbiased variance over requested axis. Normalized by N-1 by default. Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. numeric_only (bool. default False): Default False. Include only float, int, boolean columns.