feat!: add structured output for ai map, ai filter and ai join #1746

Open · wants to merge 3 commits into main
140 changes: 69 additions & 71 deletions bigframes/operations/ai.py
@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import re
import typing
from typing import List, Optional
from typing import Dict, List, Optional
import warnings

import numpy as np
@@ -34,7 +36,13 @@ def __init__(self, df) -> None:

self._df: bigframes.dataframe.DataFrame = df

def filter(self, instruction: str, model, ground_with_google_search: bool = False):
def filter(
self,
instruction: str,
model,
ground_with_google_search: bool = False,
attach_logprobs: bool = False,
):
"""
Filters the DataFrame with the semantics of the user instruction.

@@ -74,6 +82,10 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

attach_logprobs (bool, default False):
Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the LLM's confidence
in its responses. Higher values indicate higher confidence. Values range from negative infinity to 0.

Returns:
bigframes.pandas.DataFrame: DataFrame filtered by the instruction.

@@ -82,72 +94,27 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
ValueError: when the instruction refers to a non-existing column, or when no
columns are referred to.
"""
import bigframes.dataframe
import bigframes.series

self._validate_model(model)
columns = self._parse_columns(instruction)
for column in columns:
if column not in self._df.columns:
raise ValueError(f"Column {column} not found.")
answer_col = "answer"

if ground_with_google_search:
msg = exceptions.format_message(
"Enables Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)
warnings.warn(msg, category=UserWarning)

self._confirm_operation(len(self._df))

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
has_blob_column = False
for column in columns:
if df[column].dtype == dtypes.OBJ_REF_DTYPE:
# Don't cast blob columns to string
has_blob_column = True
continue

if df[column].dtype != dtypes.STRING_DTYPE:
df[column] = df[column].astype(dtypes.STRING_DTYPE)

user_instruction = self._format_instruction(instruction, columns)
output_instruction = "Based on the provided context, reply to the following claim by only True or False:"

if has_blob_column:
results = typing.cast(
bigframes.dataframe.DataFrame,
model.predict(
df,
prompt=self._make_multimodel_prompt(
df, columns, user_instruction, output_instruction
),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)
else:
results = typing.cast(
bigframes.dataframe.DataFrame,
model.predict(
self._make_text_prompt(
df, columns, user_instruction, output_instruction
),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)
output_schema = {answer_col: "bool"}
result = self.map(
instruction,
model,
output_schema,
ground_with_google_search,
attach_logprobs,
)

return self._df[
results["ml_generate_text_llm_result"].str.lower().str.contains("true")
]
return result[result[answer_col]].drop(answer_col, axis=1)
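
# Sketch of the reworked filter in use (illustrative only; the DataFrame contents
# and the model variable are assumptions, not part of this diff). filter() now
# delegates to map() with a single-column bool schema and keeps only the rows whose
# "answer" is True; with attach_logprobs=True the kept rows also carry a "logprob"
# column.
#
# >>> df = bpd.DataFrame({"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]})
# >>> df.ai.filter("{city} is the capital of {country}", model=model, attach_logprobs=True)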

def map(
self,
instruction: str,
output_column: str,
model,
output_schema: Dict[str, str] | None = None,
ground_with_google_search: bool = False,
attach_logprobs=False,
):
"""
Maps the DataFrame with the semantics of the user instruction.
@@ -163,7 +130,7 @@ def map(
>>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")

>>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
>>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column="food", model=model)
>>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"})
ingredient_1 ingredient_2 food
0 Burger Bun Beef Patty Burger
<BLANKLINE>
@@ -180,12 +147,14 @@ def map(
in the instructions like:
"Get the ingredients of {food}."

output_column (str):
The column name of the mapping result.

model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

output_schema (Dict[str, str] or None, default None):
The schema used to generate structured output as a bigframes DataFrame. The schema is a mapping of <column_name> to <type>.
Supported types are int64, float64, bool, string, array<type> and struct<column type>. If None, a string result is generated under the column
"ml_generate_text_llm_result".

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
@@ -194,6 +163,11 @@ def map(
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

attach_logprobs (bool, default False):
Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the LLM's confidence
in its responses. Higher values indicate higher confidence. Values range from negative infinity to 0.

Returns:
bigframes.pandas.DataFrame: DataFrame with attached mapping results.

@@ -236,6 +210,9 @@ def map(
"Based on the provided contenxt, answer the following instruction:"
)

if output_schema is None:
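# No schema supplied: fall back to the legacy single string output column so the
# default map() behavior is unchanged for existing callers.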
output_schema = {"ml_generate_text_llm_result": "string"}

if has_blob_column:
results = typing.cast(
bigframes.series.Series,
@@ -246,7 +223,8 @@ def map(
),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
)["ml_generate_text_llm_result"],
output_schema=output_schema,
),
)
else:
results = typing.cast(
@@ -257,19 +235,36 @@ def map(
),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
)["ml_generate_text_llm_result"],
output_schema=output_schema,
),
)

attach_columns = [results[col] for col, _ in output_schema.items()]

def extract_logprob(s: bigframes.series.Series) -> bigframes.series.Series:
from bigframes import bigquery as bbq

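# The "full_response" column carries the raw model response JSON: take the first
# entry of "$.candidates" and read its "$.avg_logprobs" as the per-row logprob.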
logprob_jsons = bbq.json_extract_array(s, "$.candidates").list[0]
logprobs = bbq.json_extract(logprob_jsons, "$.avg_logprobs").astype(
"Float64"
)
logprobs.name = "logprob"
return logprobs

if attach_logprobs:
attach_columns.append(extract_logprob(results["full_response"]))

from bigframes.core.reshape.api import concat

return concat([self._df, results.rename(output_column)], axis=1)
return concat([self._df, *attach_columns], axis=1)
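
# Sketch of a structured-output call (illustrative only; the DataFrame, schema,
# and model variable are assumptions, not part of this diff). Each key in
# output_schema becomes a typed column in the result, and attach_logprobs=True
# appends a "logprob" column:
#
# >>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun"], "ingredient_2": ["Beef Patty"]})
# >>> df.ai.map(
# ...     "Describe the food made from {ingredient_1} and {ingredient_2}.",
# ...     model=model,
# ...     output_schema={"food": "string", "calories": "int64"},
# ...     attach_logprobs=True,
# ... )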

def join(
self,
other,
instruction: str,
model,
ground_with_google_search: bool = False,
attach_logprobs=False,
):
"""
Joins two dataframes by applying the instruction over each pair of rows from
@@ -313,10 +308,6 @@ def join(
model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

max_rows (int, default 1000):
The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
call will end early with an error.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
@@ -325,6 +316,10 @@ def join(
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

attach_logprobs (bool, default False):
Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the LLM's confidence
in its responses. Higher values indicate higher confidence. Values range from negative infinity to 0.

Returns:
bigframes.pandas.DataFrame: The joined dataframe.

@@ -400,7 +395,10 @@ def join(
joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))

return joined_df.ai.filter(
instruction, model, ground_with_google_search=ground_with_google_search
instruction,
model,
ground_with_google_search=ground_with_google_search,
attach_logprobs=attach_logprobs,
).reset_index(drop=True)
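
# Sketch of a join call (illustrative only; the DataFrames and model variable are
# assumptions, not part of this diff). join() cross-joins the two frames and reuses
# filter(), so attach_logprobs flows through and the joined rows carry a "logprob"
# column:
#
# >>> cities = bpd.DataFrame({"city": ["Seattle", "Ottawa"]})
# >>> countries = bpd.DataFrame({"country": ["USA", "Canada"]})
# >>> cities.ai.join(countries, "{city} is in {country}", model=model, attach_logprobs=True)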

def search(