diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py
index 9d73fd43c1..c65947f53f 100644
--- a/bigframes/operations/ai.py
+++ b/bigframes/operations/ai.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import re
 import typing
-from typing import List, Optional
+from typing import Dict, List, Optional
 import warnings
 
 import numpy as np
@@ -34,7 +36,13 @@ def __init__(self, df) -> None:
         self._df: bigframes.dataframe.DataFrame = df
 
-    def filter(self, instruction: str, model, ground_with_google_search: bool = False):
+    def filter(
+        self,
+        instruction: str,
+        model,
+        ground_with_google_search: bool = False,
+        attach_logprobs: bool = False,
+    ):
         """
         Filters the DataFrame with the semantics of the user instruction.
 
@@ -74,6 +82,10 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
                 page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
                 The default is `False`.
 
+            attach_logprobs (bool, default False):
+                Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the confidence level
+                of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinity and 0.
+
         Returns:
             bigframes.pandas.DataFrame: DataFrame filtered by the instruction.
 
@@ -82,72 +94,27 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
             ValueError: when the instruction refers to a non-existing column, or when no
                 columns are referred to.
         """
-        import bigframes.dataframe
-        import bigframes.series
-
         self._validate_model(model)
-        columns = self._parse_columns(instruction)
-        for column in columns:
-            if column not in self._df.columns:
-                raise ValueError(f"Column {column} not found.")
+        answer_col = "answer"
 
-        if ground_with_google_search:
-            msg = exceptions.format_message(
-                "Enables Grounding with Google Search may impact billing cost. See pricing "
-                "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
-            )
-            warnings.warn(msg, category=UserWarning)
-
-        self._confirm_operation(len(self._df))
-
-        df: bigframes.dataframe.DataFrame = self._df[columns].copy()
-        has_blob_column = False
-        for column in columns:
-            if df[column].dtype == dtypes.OBJ_REF_DTYPE:
-                # Don't cast blob columns to string
-                has_blob_column = True
-                continue
-
-            if df[column].dtype != dtypes.STRING_DTYPE:
-                df[column] = df[column].astype(dtypes.STRING_DTYPE)
-
-        user_instruction = self._format_instruction(instruction, columns)
-        output_instruction = "Based on the provided context, reply to the following claim by only True or False:"
-
-        if has_blob_column:
-            results = typing.cast(
-                bigframes.dataframe.DataFrame,
-                model.predict(
-                    df,
-                    prompt=self._make_multimodel_prompt(
-                        df, columns, user_instruction, output_instruction
-                    ),
-                    temperature=0.0,
-                    ground_with_google_search=ground_with_google_search,
-                ),
-            )
-        else:
-            results = typing.cast(
-                bigframes.dataframe.DataFrame,
-                model.predict(
-                    self._make_text_prompt(
-                        df, columns, user_instruction, output_instruction
-                    ),
-                    temperature=0.0,
-                    ground_with_google_search=ground_with_google_search,
-                ),
-            )
+        output_schema = {answer_col: "bool"}
+        result = self.map(
+            instruction,
+            model,
+            output_schema,
+            ground_with_google_search,
+            attach_logprobs,
+        )
 
-        return self._df[
-            results["ml_generate_text_llm_result"].str.lower().str.contains("true")
-        ]
+        return result[result[answer_col]].drop(answer_col, axis=1)
 
     def map(
         self,
         instruction: str,
-        output_column: str,
         model,
+        output_schema: Dict[str, str] | None = None,
         ground_with_google_search: bool = False,
+        attach_logprobs=False,
     ):
         """
         Maps the DataFrame with the semantics of the user instruction.
@@ -163,7 +130,7 @@ def map(
         >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")
         >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
-        >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model)
+        >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"})
           ingredient_1 ingredient_2    food
         0   Burger Bun   Beef Patty  Burger
@@ -180,12 +147,14 @@ def map(
                 in the instructions like:
                 "Get the ingredients of {food}."
 
-            output_column (str):
-                The column name of the mapping result.
-
             model (bigframes.ml.llm.GeminiTextGenerator):
                 A GeminiTextGenerator provided by Bigframes ML package.
 
+            output_schema (Dict[str, str] or None, default None):
+                The schema used to generate structured output as a bigframes DataFrame. The schema is a string key-value pair of <column_name>:<type>.
+                Supported types are int64, float64, bool, string, array<type> and struct<column type>. If None, generate string result under the column
+                "ml_generate_text_llm_result".
+
             ground_with_google_search (bool, default False):
                 Enables Grounding with Google Search for the GeminiTextGenerator model.
                 When set to True, the model incorporates relevant information from Google
@@ -194,6 +163,11 @@ def map(
                 page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
                 The default is `False`.
 
+            attach_logprobs (bool, default False):
+                Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the confidence level
+                of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinity and 0.
+
+
         Returns:
             bigframes.pandas.DataFrame: DataFrame with attached mapping results.
 
@@ -236,6 +210,9 @@ def map(
             "Based on the provided contenxt, answer the following instruction:"
         )
 
+        if output_schema is None:
+            output_schema = {"ml_generate_text_llm_result": "string"}
+
         if has_blob_column:
             results = typing.cast(
                 bigframes.series.Series,
@@ -246,7 +223,8 @@ def map(
                     ),
                     temperature=0.0,
                     ground_with_google_search=ground_with_google_search,
-                )["ml_generate_text_llm_result"],
+                    output_schema=output_schema,
+                ),
             )
         else:
             results = typing.cast(
@@ -257,12 +235,28 @@ def map(
                     ),
                     temperature=0.0,
                     ground_with_google_search=ground_with_google_search,
-                )["ml_generate_text_llm_result"],
+                    output_schema=output_schema,
+                ),
+            )
+
+        attach_columns = [results[col] for col, _ in output_schema.items()]
+
+        def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series:
+            from bigframes import bigquery as bbq
+
+            logprob_jsons = bbq.json_extract_array(s, "$.candidates").list[0]
+            logprobs = bbq.json_extract(logprob_jsons, "$.avg_logprobs").astype(
+                "Float64"
             )
+            logprobs.name = "logprob"
+            return logprobs
+
+        if attach_logprobs:
+            attach_columns.append(extract_logprob(results["full_response"]))
 
         from bigframes.core.reshape.api import concat
 
-        return concat([self._df, results.rename(output_column)], axis=1)
+        return concat([self._df, *attach_columns], axis=1)
 
     def join(
         self,
@@ -270,6 +264,7 @@ def join(
         instruction: str,
         model,
         ground_with_google_search: bool = False,
+        attach_logprobs=False,
    ):
        """
        Joines two dataframes by applying the instruction over each pair of rows from
@@ -313,10 +308,6 @@ def join(
             model (bigframes.ml.llm.GeminiTextGenerator):
                 A GeminiTextGenerator provided by Bigframes ML package.
 
-            max_rows (int, default 1000):
-                The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
-                call will end early with an error.
-
             ground_with_google_search (bool, default False):
                 Enables Grounding with Google Search for the GeminiTextGenerator model.
                 When set to True, the model incorporates relevant information from Google
@@ -325,6 +316,10 @@ def join(
                 page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
                 The default is `False`.
 
+            attach_logprobs (bool, default False):
+                Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the confidence level
+                of the LLM for their responses. Higher values indicate more confidence. The value is in the range between negative infinity and 0.
+
         Returns:
             bigframes.pandas.DataFrame: The joined dataframe.
@@ -400,7 +395,10 @@ def join(
         joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))
 
         return joined_df.ai.filter(
-            instruction, model, ground_with_google_search=ground_with_google_search
+            instruction,
+            model,
+            ground_with_google_search=ground_with_google_search,
+            attach_logprobs=attach_logprobs,
         ).reset_index(drop=True)
 
     def search(
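The ai.py changes above collapse filter (and, through it, join) into map's new structured-output path. A minimal sketch of the resulting call surface, assembled from the updated docstrings — it assumes a GeminiTextGenerator named `model` and assumes the AI-operator experiment option is enabled, as the notebook and tests below arrange:

import bigframes.pandas as bpd
from bigframes.ml import llm

model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")
df = bpd.DataFrame(
    {
        "ingredient_1": ["Burger Bun", "Soy Bean"],
        "ingredient_2": ["Beef Patty", "Bittern"],
    }
)

# One typed output column per output_schema entry; with output_schema=None the
# result lands in the default "ml_generate_text_llm_result" string column.
mapped = df.ai.map(
    "What is the food made from {ingredient_1} and {ingredient_2}? One word only.",
    model=model,
    output_schema={"food": "string"},
    attach_logprobs=True,  # adds a Float64 "logprob" column, negative infinity to 0
)

# filter() now asks map() for a hidden boolean "answer" column and keeps only
# the rows where the model answered True.
filtered = df.ai.filter("{ingredient_1} is a type of bread", model)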
diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb
index 9f35d3864a..49a9d798e2 100644
--- a/notebooks/experimental/ai_operators.ipynb
+++ b/notebooks/experimental/ai_operators.ipynb
@@ -139,7 +139,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:54: PreviewWarning: AI operators are still under experiments, and are subject to change in\n",
+     "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: AI operators are still under experiments, and are subject to change in\n",
     "the future.\n",
     "  warnings.warn(msg, category=bfe.PreviewWarning)\n"
    ]
@@ -160,13 +160,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {
     "id": "vCkraKOeqJFl"
    },
    "outputs": [],
    "source": [
-    "bpd.options.bigquery.project = 'YOUR_PROJECT_ID'\n",
+    "bpd.options.bigquery.project = 'bigframes-dev'\n",
     "bpd.options.bigquery.location = 'US'"
   ]
  },
@@ -227,7 +227,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {
     "id": "F4dZm4b7iouR"
    },
@@ -248,7 +248,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {
     "id": "BoUK-cpbiouS"
    },
@@ -403,7 +403,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n",
+     "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n",
     "`db_dtypes` is a preview feature and subject to change.\n",
     "  warnings.warn(msg, bfe.PreviewWarning)\n"
    ]
@@ -575,12 +575,108 @@
    "id": "VFObP2aFiouS"
   },
   "source": [
-    "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the column name by setting the `output_column` parameter to hold the mapping results."
+    "Now, you ask the LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the output column name."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you are using BigFrames version `2.5.0` or later, the column name is specified with the `output_schema` parameter. This parameter expects a dictionary input in the form of `{'col_name': 'type_name'}`."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n",
+      "`db_dtypes` is a preview feature and subject to change.\n",
+      "  warnings.warn(msg, bfe.PreviewWarning)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ingredient_1</th>\n",
+       "      <th>ingredient_2</th>\n",
+       "      <th>food</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Bun</td>\n",
+       "      <td>Beef Patty</td>\n",
+       "      <td>Hamburger</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Soy Bean</td>\n",
+       "      <td>Bittern</td>\n",
+       "      <td>Tofu</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Sausage</td>\n",
+       "      <td>Long Bread</td>\n",
+       "      <td>Hotdog</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3 rows × 3 columns</p>\n",
+       "</div>[3 rows x 3 columns in total]"
+      ],
+      "text/plain": [
+       "  ingredient_1 ingredient_2       food\n",
+       "0          Bun   Beef Patty  Hamburger\n",
+       "1     Soy Bean      Bittern       Tofu\n",
+       "2      Sausage   Long Bread     Hotdog\n",
+       "\n",
+       "[3 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", model=gemini_model, output_schema={\"food\": \"string\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you are using BigFrames version `2.4.0` or prior, the column name is specified with the `output_column` parameter. The outputs are always strings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -667,7 +763,7 @@
     }
    ],
    "source": [
-    "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)"
+    "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)"
    ]
   },
@@ -3170,7 +3266,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.17"
   }
  },
 "nbformat": 4,
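The notebook exercises only a single string column, but the schema grammar documented on map also admits typed columns (int64, float64, bool, array<type>, struct<column type>). A hypothetical multi-column variant — the column names here are illustrative and not part of this change:

df.ai.map(
    "Name the dish made from {ingredient_1} and {ingredient_2}, say whether it is vegetarian, and estimate its calories.",
    model=gemini_model,
    output_schema={
        "food": "string",           # one output column per entry
        "is_vegetarian": "bool",
        "calories": "int64",
    },
)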
diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py
index c2797e39ee..1b1d3a3376 100644
--- a/tests/system/large/operations/test_ai.py
+++ b/tests/system/large/operations/test_ai.py
@@ -66,6 +66,31 @@ def test_filter(session, gemini_flash_model):
     )
 
 
+def test_filter_attach_logprob(session, gemini_flash_model):
+    df = dataframe.DataFrame(
+        data={
+            "number_1": [1, 2],
+            "number_2": [2, 1],
+            "col": [0, 0],
+        },
+        session=session,
+    )
+
+    with bigframes.option_context(
+        AI_OP_EXP_OPTION,
+        True,
+        THRESHOLD_OPTION,
+        10,
+    ):
+        actual_df = df.ai.filter(
+            "{number_1} is greater than {number_2}",
+            gemini_flash_model,
+            attach_logprobs=True,
+        ).to_pandas()
+
+        assert "logprob" in actual_df.columns
+
+
 def test_filter_multi_model(session, gemini_flash_model):
     with bigframes.option_context(
         AI_OP_EXP_OPTION,
@@ -186,7 +211,14 @@ def test_filter_invalid_model_raise_error():
         df.ai.filter("{city} is the capital of {country}", None)
 
 
-def test_map(session, gemini_flash_model):
+@pytest.mark.parametrize(
+    ("output_schema", "output_col"),
+    [
+        pytest.param(None, "ml_generate_text_llm_result", id="default_schema"),
+        pytest.param({"food": "string"}, "food", id="non_default_schema"),
+    ],
+)
+def test_map(session, gemini_flash_model, output_schema, output_col):
     df = dataframe.DataFrame(
         data={
             "ingredient_1": ["Burger Bun", "Soy Bean"],
             "ingredient_2": ["Beef Patty", "Bittern"],
@@ -204,18 +236,18 @@ def test_map(session, gemini_flash_model):
     ):
         actual_df = df.ai.map(
             "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.",
-            "food",
             gemini_flash_model,
+            output_schema=output_schema,
         ).to_pandas()
 
         # Result sanitation
-        actual_df["food"] = actual_df["food"].str.strip().str.lower()
+        actual_df[output_col] = actual_df[output_col].str.strip().str.lower()
 
         expected_df = pd.DataFrame(
             {
                 "ingredient_1": ["Burger Bun", "Soy Bean"],
                 "ingredient_2": ["Beef Patty", "Bittern"],
                 "gluten-free": [True, True],
-                "food": ["burger", "tofu"],
+                output_col: ["burger", "tofu"],
             }
         )
         pandas.testing.assert_frame_equal(
@@ -227,6 +259,31 @@ def test_map(session, gemini_flash_model):
     )
 
 
+def test_map_attach_logprob(session, gemini_flash_model):
+    df = dataframe.DataFrame(
+        data={
+            "ingredient_1": ["Burger Bun", "Soy Bean"],
+            "ingredient_2": ["Beef Patty", "Bittern"],
+            "gluten-free": [True, True],
+        },
+        session=session,
+    )
+
+    with bigframes.option_context(
+        AI_OP_EXP_OPTION,
+        True,
+        THRESHOLD_OPTION,
+        10,
+    ):
+        actual_df = df.ai.map(
+            "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.",
+            gemini_flash_model,
+            attach_logprobs=True,
+        ).to_pandas()
+
+        assert "logprob" in actual_df.columns
+
+
 def test_map_multimodel(session, gemini_flash_model):
     with bigframes.option_context(
         AI_OP_EXP_OPTION,
@@ -244,8 +301,8 @@ def test_map_multimodel(session, gemini_flash_model):
         )
         result = df.ai.map(
             "What is the object in {image} combined with {scenario}? One word only.",
-            "object",
             gemini_flash_model,
+            output_schema={"object": "string"},
         ).to_pandas()
         assert len(result) == len(df)
 
@@ -279,7 +336,6 @@ def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch):
     ):
         df.ai.map(
             "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.",
-            "food",
             gemini_flash_model,
         )
 
@@ -319,7 +375,7 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model):
         THRESHOLD_OPTION,
         10,
     ), pytest.raises(ValueError):
-        df.ai.map(instruction, "food", gemini_flash_model)
+        df.ai.map(instruction, gemini_flash_model, output_schema={"food": "string"})
 
 
 def test_map_invalid_model_raise_error():
@@ -338,7 +394,6 @@ def test_map_invalid_model_raise_error():
     ), pytest.raises(TypeError):
         df.ai.map(
             "What is the food made from {ingredient_1} and {ingredient_2}? One word only.",
-            "food",
             None,
         )
 
 
@@ -396,6 +451,34 @@ def test_join(instruction, session, gemini_flash_model):
     )
 
 
+def test_join_attach_logprob(session, gemini_flash_model):
+    cities = dataframe.DataFrame(
+        data={
+            "city": ["Seattle", "Berlin"],
+        },
+        session=session,
+    )
+    countries = dataframe.DataFrame(
+        data={"country": ["USA", "UK", "Germany"]},
+        session=session,
+    )
+
+    with bigframes.option_context(
+        AI_OP_EXP_OPTION,
+        True,
+        THRESHOLD_OPTION,
+        10,
+    ):
+        actual_df = cities.ai.join(
+            countries,
+            "{city} is in {country}",
+            gemini_flash_model,
+            attach_logprobs=True,
+        ).to_pandas()
+
+        assert "logprob" in actual_df.columns
+
+
 @pytest.mark.parametrize(
     ("reply"),
     [
diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py
index de6ba4b86c..25d411bef8 100644
--- a/tests/system/small/operations/test_ai.py
+++ b/tests/system/small/operations/test_ai.py
@@ -51,7 +51,11 @@ def test_filter(session):
     df = dataframe.DataFrame({"col": ["A", "B"]}, session=session)
     model = FakeGeminiTextGenerator(
         dataframe.DataFrame(
-            {"ml_generate_text_llm_result": ["true", "false"]}, session=session
+            {
+                "answer": [True, False],
+                "full_response": _create_dummy_full_response(2),
+            },
+            session=session,
         ),
     )
 
@@ -77,7 +81,11 @@ def test_map(session):
     df = dataframe.DataFrame({"col": ["A", "B"]}, session=session)
     model = FakeGeminiTextGenerator(
         dataframe.DataFrame(
-            {"ml_generate_text_llm_result": ["true", "false"]}, session=session
+            {
+                "output": ["true", "false"],
+                "full_response": _create_dummy_full_response(2),
+            },
+            session=session,
        ),
    )
 
@@ -87,7 +95,9 @@ def test_map(session):
         THRESHOLD_OPTION,
         50,
     ):
-        result = df.ai.map("map {col}", model=model, output_column="output").to_pandas()
+        result = df.ai.map(
+            "map {col}", model=model, output_schema={"output": "string"}
+        ).to_pandas()
 
     pandas.testing.assert_frame_equal(
         result,
@@ -102,7 +112,13 @@ def test_join(session):
     left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session)
     right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session)
     model = FakeGeminiTextGenerator(
-        dataframe.DataFrame({"ml_generate_text_llm_result": ["true"]}, session=session),
+        dataframe.DataFrame(
+            {
+                "answer": [True],
+                "full_response": _create_dummy_full_response(1),
+            },
+            session=session,
+        ),
     )
 
     with bigframes.option_context(
@@ -139,3 +155,9 @@ def test_top_k(session):
         result = df.ai.top_k("top k of {col}", model, k=1).to_pandas()
 
     assert len(result) == 1
+
+
+def _create_dummy_full_response(row_count: int) -> pd.Series:
+    entry = """{"candidates": [{"avg_logprobs": -0.5}]}"""
+
+    return pd.Series([entry] * row_count)
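The dummy payload above mirrors exactly the slice of the model's full_response JSON that extract_logprob in map reads: the avg_logprobs field of the first entry under $.candidates. A plain-Python restatement of that parsing contract, for reference:

import json

entry = """{"candidates": [{"avg_logprobs": -0.5}]}"""

# Counterpart of bbq.json_extract_array(s, "$.candidates").list[0]
first_candidate = json.loads(entry)["candidates"][0]

# Counterpart of bbq.json_extract(..., "$.avg_logprobs").astype("Float64")
assert float(first_candidate["avg_logprobs"]) == -0.5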