diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index d62173b7d6..b2d9d10107 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -27,7 +27,6 @@
 import functools
 import itertools
 import random
-import textwrap
 import typing
 from typing import (
     Iterable,
@@ -54,7 +53,6 @@
 from bigframes.core import agg_expressions, local_data
 import bigframes.core as core
 import bigframes.core.agg_expressions as ex_types
-import bigframes.core.compile.googlesql as googlesql
 import bigframes.core.expression as ex
 import bigframes.core.expression as scalars
 import bigframes.core.guid as guid
@@ -62,8 +60,6 @@
 import bigframes.core.join_def as join_defs
 import bigframes.core.ordering as ordering
 import bigframes.core.pyarrow_utils as pyarrow_utils
-import bigframes.core.schema as bf_schema
-import bigframes.core.sql as sql
 import bigframes.core.utils as utils
 import bigframes.core.window_spec as windows
 import bigframes.dtypes
@@ -2776,14 +2772,6 @@ def _throw_if_null_index(self, opname: str):
             )
 
     def _get_rows_as_json_values(self) -> Block:
-        # We want to preserve any ordering currently present before turning to
-        # direct SQL manipulation. We will restore the ordering when we rebuild
-        # expression.
-        # TODO(shobs): Replace direct SQL manipulation by structured expression
-        # manipulation
-        expr, ordering_column_name = self.expr.promote_offsets()
-        expr_sql = self.session._executor.to_sql(expr)
-
         # Names of the columns to serialize for the row.
         # We will use the repr-eval pattern to serialize a value here and
         # deserialize in the cloud function. Let's make sure that would work.
@@ -2799,93 +2787,44 @@ def _get_rows_as_json_values(self) -> Block:
             )
             column_names.append(serialized_column_name)
 
-        column_names_csv = sql.csv(map(sql.simple_literal, column_names))
-
-        # index columns count
-        index_columns_count = len(self.index_columns)
-
         # column references to form the array of values for the row
         column_types = list(self.index.dtypes) + list(self.dtypes)
         column_references = []
         for type_, col in zip(column_types, self.expr.column_ids):
-            if isinstance(type_, pd.ArrowDtype) and pa.types.is_binary(
-                type_.pyarrow_dtype
-            ):
-                column_references.append(sql.to_json_string(col))
+            if type_ == bigframes.dtypes.BYTES_DTYPE:
+                column_references.append(ops.ToJSONString().as_expr(col))
+            elif type_ == bigframes.dtypes.BOOL_DTYPE:
+                # cast operator produces True/False, but function template expects lower case
+                column_references.append(
+                    ops.lower_op.as_expr(
+                        ops.AsTypeOp(bigframes.dtypes.STRING_DTYPE).as_expr(col)
+                    )
+                )
             else:
-                column_references.append(sql.cast_as_string(col))
-
-        column_references_csv = sql.csv(column_references)
-
-        # types of the columns to serialize for the row
-        column_types_csv = sql.csv(
-            [sql.simple_literal(str(typ)) for typ in column_types]
-        )
+                column_references.append(
+                    ops.AsTypeOp(bigframes.dtypes.STRING_DTYPE).as_expr(col)
+                )
 
         # row dtype to use for deserializing the row as pandas series
         pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
         if pandas_row_dtype is None:
             pandas_row_dtype = "object"
-        pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype))
-
-        # create a json column representing row through SQL manipulation
-        row_json_column_name = guid.generate_guid()
-        select_columns = (
-            [ordering_column_name] + list(self.index_columns) + [row_json_column_name]
-        )
-        select_columns_csv = sql.csv(
-            [googlesql.identifier(col) for col in select_columns]
-        )
-        json_sql = f"""\
-With T0 AS (
-{textwrap.indent(expr_sql, "    ")}
-),
-T1 AS (
-    SELECT *,
-        TO_JSON_STRING(JSON_OBJECT(
-            "names", [{column_names_csv}],
-            "types", [{column_types_csv}],
-            "values", [{column_references_csv}],
-            "indexlength", {index_columns_count},
-            "dtype", {pandas_row_dtype}
-        )) AS {googlesql.identifier(row_json_column_name)}
-    FROM T0
-)
-SELECT {select_columns_csv} FROM T1
-"""
-        # The only ways this code is used is through df.apply(axis=1) cope path
-        destination, query_job = self.session._loader._query_to_destination(
-            json_sql, cluster_candidates=[ordering_column_name]
-        )
-        if not destination:
-            raise ValueError(f"Query job {query_job} did not produce result table")
-
-        new_schema = (
-            self.expr.schema.select([*self.index_columns])
-            .append(
-                bf_schema.SchemaItem(
-                    row_json_column_name, bigframes.dtypes.STRING_DTYPE
-                )
-            )
-            .append(
-                bf_schema.SchemaItem(ordering_column_name, bigframes.dtypes.INT_DTYPE)
-            )
-        )
+        pandas_row_dtype = str(pandas_row_dtype)
 
-        dest_table = self.session.bqclient.get_table(destination)
-        expr = core.ArrayValue.from_table(
-            dest_table,
-            schema=new_schema,
-            session=self.session,
-            offsets_col=ordering_column_name,
-            n_rows=dest_table.num_rows,
-        ).drop_columns([ordering_column_name])
-        block = Block(
-            expr,
-            index_columns=self.index_columns,
-            column_labels=[row_json_column_name],
-            index_labels=self._index_labels,
+        struct_op = ops.StructOp(
+            column_names=("names", "types", "values", "indexlength", "dtype")
         )
-        return block
+        names_val = ex.const(tuple(column_names))
+        types_val = ex.const(tuple(map(str, column_types)))
+        values_val = ops.ToArrayOp().as_expr(*column_references)
+        indexlength_val = ex.const(len(self.index_columns))
+        dtype_val = ex.const(str(pandas_row_dtype))
+        struct_expr = struct_op.as_expr(
+            names_val, types_val, values_val, indexlength_val, dtype_val
+        )
+        block, col_id = self.project_expr(ops.ToJSONString().as_expr(struct_expr))
+        return block.select_column(col_id)
 
 
 class BlockIndexProperties:
diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
index 044fc90306..a37d390b51 100644
--- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
+++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
@@ -1301,8 +1301,8 @@ def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON):
 
 
 @scalar_op_compiler.register_unary_op(ops.ToJSONString)
-def to_json_string_op_impl(json_obj: ibis_types.Value):
-    return to_json_string(json_obj=json_obj)
+def to_json_string_op_impl(x: ibis_types.Value):
+    return to_json_string(value=x)
 
 
 @scalar_op_compiler.register_unary_op(ops.JSONValue, pass_op=True)
@@ -2069,9 +2069,9 @@ def json_extract_string_array(  # type: ignore[empty-body]
 
 @ibis_udf.scalar.builtin(name="to_json_string")
 def to_json_string(  # type: ignore[empty-body]
-    json_obj: ibis_dtypes.JSON,
+    value,
 ) -> ibis_dtypes.String:
-    """Convert JSON to STRING."""
+    """Convert value to JSON-formatted string."""
 
 
 @ibis_udf.scalar.builtin(name="json_value")
diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py
index 0a61be716a..25b59d4582 100644
--- a/bigframes/core/compile/ibis_types.py
+++ b/bigframes/core/compile/ibis_types.py
@@ -386,10 +386,6 @@ def literal_to_ibis_scalar(
     ibis_dtype = bigframes_dtype_to_ibis_dtype(force_dtype) if force_dtype else None
 
     if pd.api.types.is_list_like(literal):
-        if validate:
-            raise ValueError(
-                f"List types can't be stored in BigQuery DataFrames. {constants.FEEDBACK_LINK}"
-            )
         # "correct" way would be to use ibis.array, but this produces invalid BQ SQL syntax
         return tuple(literal)
 
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index ae68dbe7d3..2c4cccefd2 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -671,8 +671,7 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
     if pd.api.types.is_list_like(literal):
         element_types = [infer_literal_type(i) for i in literal]
         common_type = lcd_type(*element_types)
-        as_arrow = bigframes_dtype_to_arrow_dtype(common_type)
-        return pd.ArrowDtype(as_arrow)
+        return list_type(common_type)
     if pd.api.types.is_dict_like(literal):
         fields = []
         for key in literal.keys():
diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py
index b1f4f2f689..d3f62fb4f2 100644
--- a/bigframes/operations/json_ops.py
+++ b/bigframes/operations/json_ops.py
@@ -107,12 +107,6 @@ class ToJSONString(base_ops.UnaryOp):
     name: typing.ClassVar[str] = "to_json_string"
 
     def output_type(self, *input_types):
-        input_type = input_types[0]
-        if not dtypes.is_json_like(input_type):
-            raise TypeError(
-                "Input type must be a valid JSON object or JSON-formatted string type."
-                + f" Received type: {input_type}"
-            )
         return dtypes.STRING_DTYPE
 
 
diff --git a/bigframes/operations/struct_ops.py b/bigframes/operations/struct_ops.py
index 0926142b17..de51efd8a4 100644
--- a/bigframes/operations/struct_ops.py
+++ b/bigframes/operations/struct_ops.py
@@ -43,7 +43,7 @@ def output_type(self, *input_types):
 @dataclasses.dataclass(frozen=True)
 class StructOp(base_ops.NaryOp):
     name: typing.ClassVar[str] = "struct"
-    column_names: tuple[str]
+    column_names: tuple[str, ...]
 
     def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
         num_input_types = len(input_types)
diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py
index cd23614bbf..b72a781e56 100644
--- a/tests/unit/core/test_dtypes.py
+++ b/tests/unit/core/test_dtypes.py
@@ -267,13 +267,6 @@ def test_literal_to_ibis_scalar_converts(literal, ibis_scalar):
     )
 
 
-def test_literal_to_ibis_scalar_throws_on_incompatible_literal():
-    with pytest.raises(
-        ValueError,
-    ):
-        bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"})
-
-
 @pytest.mark.parametrize(
     ["scalar", "expected_dtype"],
     [
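
Reviewer note: the refactor above replaces the hand-built JSON_OBJECT SQL with a projected
expression, ToJSONString(StructOp(...)), whose struct fields are "names", "types", "values",
"indexlength", and "dtype". Below is a minimal sketch of the per-row payload shape and of how a
consumer could rebuild a pandas Series from it, assuming labels are serialized with repr() (the
"repr-eval pattern" comment kept in _get_rows_as_json_values); the deserialize_row helper is
hypothetical for illustration, not the actual cloud function code.

    import json

    import pandas as pd

    # One row's payload, shaped like the struct built in _get_rows_as_json_values.
    # The concrete labels and values here are made up for illustration.
    row_json = json.dumps(
        {
            "names": ["'idx'", "'col_a'"],  # repr() of each index/column label
            "types": ["Int64", "string"],
            "values": ["7", "hello"],  # every value is cast to STRING by the expression
            "indexlength": 1,  # the first entry of names/values belongs to the index
            "dtype": "object",  # lcd dtype used to deserialize the row
        }
    )

    def deserialize_row(payload: str) -> pd.Series:
        obj = json.loads(payload)
        labels = [eval(name) for name in obj["names"]]  # repr-eval pattern
        return pd.Series(obj["values"], index=labels, dtype=obj["dtype"])

    print(deserialize_row(row_json))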