diff --git a/bigframes/core/compile/ibis_compiler/aggregate_compiler.py b/bigframes/core/compile/ibis_compiler/aggregate_compiler.py
index 5e9cba7f8c..1907078690 100644
--- a/bigframes/core/compile/ibis_compiler/aggregate_compiler.py
+++ b/bigframes/core/compile/ibis_compiler/aggregate_compiler.py
@@ -676,6 +676,32 @@ def _(
     ).to_expr()
 
 
+@compile_ordered_unary_agg.register
+def _(
+    op: agg_ops.StringAggOp,
+    column: ibis_types.Column,
+    window=None,
+    order_by: typing.Sequence[ibis_types.Value] = (),
+) -> ibis_types.StringValue:
+    """Compile StringAggOp into an ibis StringAgg joined by ``op.sep``."""
+    if window is not None:
+        raise NotImplementedError(
+            f"StringAgg with windowing is not supported. {constants.FEEDBACK_LINK}"
+        )
+
+    return (
+        ibis_ops.StringAgg(
+            column,  # type: ignore
+            sep=op.sep,  # type: ignore
+            order_by=order_by,  # type: ignore
+        )
+        .to_expr()
+        # Replace a NULL aggregate result with "" so the output is never
+        # NULL.
+        .fill_null(ibis_types.literal(""))
+    )
+
+
 @compile_binary_agg.register
 def _(
     op: agg_ops.CorrOp, left: ibis_types.Column, right: ibis_types.Column, window=None
diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
index 969ae2659d..044fc90306 100644
--- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
+++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
@@ -1216,11 +1216,18 @@ def to_arry_op_impl(*values: ibis_types.Value):
 def array_reduce_op_impl(x: ibis_types.Value, op: ops.ArrayReduceOp):
     import bigframes.core.compile.ibis_compiler.aggregate_compiler as agg_compilers
 
+    # Order-dependent aggregations are registered on a separate dispatcher,
+    # so choose the compiler once instead of duplicating the reduce() call.
+    compile_agg = (
+        agg_compilers.compile_unary_agg
+        if op.aggregation.order_independent
+        else agg_compilers.compile_ordered_unary_agg
+    )
     return typing.cast(ibis_types.ArrayValue, x).reduce(
-        lambda arr_vals: agg_compilers.compile_unary_agg(
+        lambda arr_vals: compile_agg(
             op.aggregation, typing.cast(ibis_types.Column, arr_vals)
         )
     )
 
 
 # JSON Ops
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
index 81ab18272c..0ee80fd74b 100644
--- a/bigframes/operations/aggregations.py
+++ b/bigframes/operations/aggregations.py
@@ -379,9 +379,28 @@ def skips_nulls(self):
         return True
 
     def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
-        return pd.ArrowDtype(
-            pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_types[0]))
-        )
+        return dtypes.list_type(input_types[0])
+
+
+@dataclasses.dataclass(frozen=True)
+class StringAggOp(UnaryAggregateOp):
+    """Order-dependent aggregation of string values into one string using ``sep``."""
+
+    name: ClassVar[str] = "string_agg"
+    sep: str = ","
+
+    @property
+    def order_independent(self):
+        return False
+
+    @property
+    def skips_nulls(self):
+        return True
+
+    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        if input_types[0] != dtypes.STRING_DTYPE:
+            raise TypeError(f"Type {input_types[0]} is not string-like")
+        return dtypes.STRING_DTYPE
 
 
 @dataclasses.dataclass(frozen=True)
diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py
index 9022a1665e..4743483954 100644
--- a/bigframes/operations/strings.py
+++ b/bigframes/operations/strings.py
@@ -24,6 +24,7 @@
 import bigframes.dataframe as df
 import bigframes.operations as ops
 from bigframes.operations._op_converters import convert_index, convert_slice
+import bigframes.operations.aggregations as agg_ops
 import bigframes.operations.base
 import bigframes.series as series
 
@@ -295,6 +296,11 @@ def cat(
     ) -> series.Series:
         return self._apply_binary_op(others, ops.strconcat_op, alignment=join)
 
+    def join(self, sep: str) -> series.Series:
+        return self._apply_unary_op(
+            ops.ArrayReduceOp(aggregation=agg_ops.StringAggOp(sep=sep))
+        )
+
     def to_blob(self, connection: Optional[str] = None) -> series.Series:
         """Create a BigFrames Blob series from a series of URIs.
 
diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py
index a720614892..afd1a74dff 100644
--- a/tests/system/small/operations/test_strings.py
+++ b/tests/system/small/operations/test_strings.py
@@ -736,3 +736,14 @@ def test_getitem_w_struct_array():
     expected = bpd.Series(expected_data, dtype=bpd.ArrowDtype((pa_struct)))
 
     assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_string_join(session):
+    pd_series = pd.Series([["a", "b", "c"], ["100"], ["hello", "world"], []])
+    bf_series = session.read_pandas(pd_series)
+
+    pd_result = pd_series.str.join("--")
+    bf_result = bf_series.str.join("--").to_pandas()
+
+    pd_result = pd_result.astype("string[pyarrow]")
+    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
index 61bafeeca2..9af2a4afe4 100644
--- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
+++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
@@ -1088,6 +1088,22 @@ def visit_ArrayAggregate(self, op, *, arg, order_by, where):
             expr = arg
         return sge.IgnoreNulls(this=self.agg.array_agg(expr, where=where))
 
+    def visit_StringAgg(self, op, *, arg, sep, order_by, where):
+        if len(order_by) > 0:
+            expr = sge.Order(
+                this=arg,
+                expressions=[
+                    # Avoid adding NULLS FIRST / NULLS LAST in SQL, which is
+                    # unsupported in STRING_AGG by reconstructing the node as
+                    # plain SQL text.
+                    f"({order_column.args['this'].sql(dialect='bigquery')}) {'DESC' if order_column.args.get('desc') else 'ASC'}"
+                    for order_column in order_by
+                ],
+            )
+        else:
+            expr = arg
+        return self.agg.string_agg(expr, sep, where=where)
+
     def visit_FirstNonNullValue(self, op, *, arg):
         return sge.IgnoreNulls(this=sge.FirstValue(this=arg))
 
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py
index 34f6406e0c..c3f2a03223 100644
--- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py
+++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py
@@ -401,3 +401,20 @@ class ArrayAggregate(Filterable, Reduction):
     @attribute
     def dtype(self):
         return dt.Array(self.arg.dtype)
+
+
+@public
+class StringAgg(Filterable, Reduction):
+    """
+    Collects the elements of this expression into a string. Similar to
+    the ibis `GroupConcat`, but adds an `order_by` parameter.
+    """
+
+    arg: Column
+    sep: Value[dt.String]
+
+    order_by: VarTuple[Value] = ()
+
+    @attribute
+    def dtype(self):
+        return dt.string
diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py
index 9b5b461ea5..fe94bf3049 100644
--- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py
+++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py
@@ -1298,3 +1298,42 @@ def center(
             bigframes.series.Series: Returns Series or Index with minimum number of char in object.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def join(self, sep: str):
+        """
+        Join lists contained as elements in the Series/Index with passed delimiter.
+
+        If the elements of a Series are lists themselves, join the content of these
+        lists using the delimiter passed to the function.
+        This function is an equivalent to :meth:`str.join`.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+        Example with lists of string elements.
+
+            >>> s = bpd.Series([['lion', 'elephant', 'zebra'],
+            ...                 ['dragon'],
+            ...                 ['duck', 'swan', 'fish', 'guppy']])
+            >>> s
+            0       ['lion' 'elephant' 'zebra']
+            1                        ['dragon']
+            2    ['duck' 'swan' 'fish' 'guppy']
+            dtype: list<item: string>[pyarrow]
+
+            >>> s.str.join('-')
+            0     lion-elephant-zebra
+            1                  dragon
+            2    duck-swan-fish-guppy
+            dtype: string
+
+        Args:
+            sep (str):
+                Delimiter to use between list entries.
+
+        Returns:
+            bigframes.series.Series: The list entries concatenated by intervening occurrences of the delimiter.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)