From 86a35ac17651710f4d54bc45926be7fc6f4b89a5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 24 Mar 2025 23:00:49 +0000 Subject: [PATCH 1/7] fix: add deprecation warning to TextEmbeddingGenerator model, especially gemini-1.0-X and gemini-1.5-X --- bigframes/ml/llm.py | 9 ++++++++- tests/system/small/ml/test_llm.py | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 0117444f16..ecac240385 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -452,7 +452,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: @typing_extensions.deprecated( - "PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. ", + "PaLM2TextEmbeddingGenerator has been deprecated, textembedding-gecko are going to be deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. ", category=exceptions.ApiDeprecationWarning, ) @log_adapter.class_logger @@ -918,10 +918,17 @@ def to_gbq( return new_model.session.read_gbq_model(model_name) +@typing_extensions.deprecated( + "gemini-1.0-X and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. ", + category=exceptions.ApiDeprecationWarning, +) @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. + .. note:: + gemini-1.0-X and gemini-1.5-X are going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. + Args: model_name (str, Default to "gemini-pro"): The model for natural language tasks. 
Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001", "gemini-1.5-flash-002" and "gemini-2.0-flash-exp". Default to "gemini-pro". diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 1bcbd9cd8c..695da45303 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -1022,6 +1022,21 @@ def test_palm2_text_embedding_deprecated(): pass +@pytest.mark.parametrize( + "model_name", + ( + "gemini-1.5-pro-001", + "gemini-1.5-pro-002", + "gemini-1.5-flash-001", + "gemini-1.5-flash-002", + "gemini-1.0-pro", + ), +) +def test_gemini_text_generator_deprecated(model_name): + with pytest.warns(exceptions.ApiDeprecationWarning): + llm.GeminiTextGenerator(model_name=model_name) + + @pytest.mark.parametrize( "model_name", ( From 452503f507010fa713dde73ec8e0eb2cbd0610cd Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 25 Mar 2025 03:24:21 +0000 Subject: [PATCH 2/7] set GeminiTextGenerator model to gemini-2.0-flash-exp, since gemini-pro is being deprecated --- bigframes/ml/llm.py | 4 +- .../apps/synthetic_data_generation.ipynb | 849 +------------ .../bq_dataframes_llm_code_generation.ipynb | 2 +- .../bq_dataframes_llm_kmeans.ipynb | 2 +- .../bq_dataframes_llm_vector_search.ipynb | 1124 +++++++++-------- ...q_dataframes_ml_drug_name_generation.ipynb | 2 +- .../generative_ai/large_language_models.ipynb | 92 +- .../bq_dataframes_template.ipynb | 2 +- samples/snippets/gemini_model_test.py | 4 +- tests/system/load/test_llm.py | 3 +- tests/system/small/ml/test_llm.py | 38 +- 11 files changed, 684 insertions(+), 1438 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index ecac240385..750da0e914 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -452,7 +452,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator: @typing_extensions.deprecated( - 
"PaLM2TextEmbeddingGenerator has been deprecated, textembedding-gecko are going to be deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. ", + "PaLM2TextEmbeddingGenerator has been deprecated. Use TextEmbeddingGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.TextEmbeddingGenerator) instead. ", category=exceptions.ApiDeprecationWarning, ) @log_adapter.class_logger @@ -927,7 +927,7 @@ class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. .. note:: - gemini-1.0-X and gemini-1.5-X are going to be deprecated. Use GeminiTextGenerator(https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. + gemini-1.0-X and gemini-1.5-X are going to be deprecated. Use gemini-2.0-X (https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.GeminiTextGenerator) instead. 
Args: model_name (str, Default to "gemini-pro"): diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb index a6e8444aac..c6aafdc3a9 100644 --- a/notebooks/apps/synthetic_data_generation.ipynb +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -52,12 +52,12 @@ "output_type": "stream", "text": [ "Collecting faker\n", - " Downloading Faker-24.9.0-py3-none-any.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.4 in /usr/local/lib/python3.10/dist-packages (from faker) (2.8.2)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.4->faker) (1.16.0)\n", - "Installing collected packages: faker\n", - "Successfully installed faker-24.9.0\n" + " Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)\n", + "Requirement already satisfied: tzdata in /usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages (from faker) (2024.2)\n", + "Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)\n", + "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m55.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: faker\n", + "Successfully installed faker-37.1.0\n" ] } ], @@ -67,11 +67,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "id": "m3q1oeJALhsG" }, - "outputs": [], + "outputs": [ + 
{ + "ename": "NameError", + "evalue": "name 'PROJECT_ID' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mbigframes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mbpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m bpd\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mbigquery\u001b[38;5;241m.\u001b[39mproject \u001b[38;5;241m=\u001b[39m \u001b[43mPROJECT_ID\u001b[49m\n", + "\u001b[0;31mNameError\u001b[0m: name 'PROJECT_ID' is not defined" + ] + } + ], "source": [ "import bigframes.pandas as bpd\n", "bpd.options.bigquery.project = PROJECT_ID" @@ -95,32 +107,11 @@ "id": "lIYdn1woOS1n", "outputId": "be474338-44c2-4ce0-955e-d525b8b9c84b" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/bigframes/session/__init__.py:1907: UserWarning: No explicit location is set, so using location US for the session.\n", - " return Session(context)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 3e8423da-737c-42e2-a3d2-d2180ca18579 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator()" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-exp\")" ] }, { @@ -141,77 +132,7 @@ "id": "SSR-lLScLa95", "outputId": "cbaec34e-6fa6-45b4-e54a-f11ca06b61e1" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d651d0bf-300c-4b1d-9e3c-03310b71287c is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job c67b9bb9-2f3e-4b9e-b680-0b7b6e9d2279 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
prompt
0Write python code to generate a pandas datafra...
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " prompt\n", - "0 Write python code to generate a pandas datafra...\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "prompt = \"\"\"\\\n", "Write python code to generate a pandas dataframe based on the requirements:\n", @@ -248,73 +169,7 @@ "id": "miDe3K4GNvOo", "outputId": "f2039e80-5ad7-4551-f8b2-7ef714a89d63" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d5c0725d-9070-4712-adfd-8a9bd86eefc3 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4eb581a3-7f97-411a-bee1-91e8c150cef4 is DONE. 8 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job f3d5503d-a3e7-49ce-b985-5ffbdbd856e3 is DONE. 2 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8ef76041-f077-4a05-bc03-63e6983ef853 is DONE. 332 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "import pandas as pd\n", - "from faker import Faker\n", - "\n", - "fake = Faker('es_ES')\n", - "result_df = pd.DataFrame({\n", - " 'Name': [fake.name() for _ in range(100)],\n", - " 'Age': [fake.random_int(min=18, max=65) for _ in range(100)],\n", - " 'Gender': [fake.random_element(elements=['Male', 'Female', 'Non-binary']) for _ in range(100)]\n", - "})\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "max_tries = 5\n", "for i in range(max_tries):\n", @@ -366,342 +221,7 @@ "id": "GODcPwX2PBEu", "outputId": "dec4c872-c464-49e4-cd7f-9442fc977d18" }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"execution_context\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"Renata Pla Cases\",\n \"Guiomar Carnero-Paz\",\n \"Luciano Garmendia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 18,\n \"max\": 64,\n \"num_unique_values\": 39,\n \"samples\": [\n 56,\n 31,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Male\",\n \"Non-binary\",\n \"Female\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameAgeGender
0Pastora Acuña Company21Male
1León Reig-Salom39Non-binary
2Aura Tomás Llobet30Female
3Vicente Correa Palomar64Female
4Benito del Fuster34Female
............
95Eduardo Cabrera27Non-binary
96Nazaret de Izaguirre40Non-binary
97Manuela Agullo Bustamante27Female
98Eugenio Mateo Naranjo Blazquez36Non-binary
99Heriberto Vicens Baeza53Female
\n", - "

100 rows × 3 columns

\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " Name Age Gender\n", - "0 Pastora Acuña Company 21 Male\n", - "1 León Reig-Salom 39 Non-binary\n", - "2 Aura Tomás Llobet 30 Female\n", - "3 Vicente Correa Palomar 64 Female\n", - "4 Benito del Fuster 34 Female\n", - ".. ... ... ...\n", - "95 Eduardo Cabrera 27 Non-binary\n", - "96 Nazaret de Izaguirre 40 Non-binary\n", - "97 Manuela Agullo Bustamante 27 Female\n", - "98 Eugenio Mateo Naranjo Blazquez 36 Non-binary\n", - "99 Heriberto Vicens Baeza 53 Female\n", - "\n", - "[100 rows x 3 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "execution_context = {}\n", "exec(code, execution_context)\n", @@ -726,21 +246,7 @@ "id": "n-BsGciNqSwU", "outputId": "996e5639-a49c-4542-a0dc-ede450e0eb6d" }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'projects/bigframes-dev/locations/us-central1/functions/bigframes-19f2f35637098969770261a2974bef32'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "@bpd.remote_function([int], str, packages=['faker', 'pandas'])\n", "def data_generator(id):\n", @@ -770,20 +276,7 @@ "id": "Odkmev9nsYqA", "outputId": "4aa7a1fd-0c0d-4412-f326-a20e19f583b5" }, - "outputs": [ - { - "data": { - "text/html": [ - "Load job 40b9c3a8-27fc-40a8-9edf-4aa2e0fec332 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "desired_num_rows = 1_000_000 # 1 million rows\n", "batch_size = 100 # used in the prompt\n", @@ -803,20 +296,7 @@ "id": "UyBhlJFVsmQC", "outputId": "29748df5-673b-4320-bb1f-53abaace3b81" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 9dd49b50-2dbf-4351-b9ad-b17aeb627caf is DONE. 240.0 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "df[\"json_data\"] = df[\"row_id\"].apply(data_generator)" ] @@ -839,262 +319,7 @@ "id": "6p3eM21qvRvy", "outputId": "333f4e49-a555-4d2f-b527-02142782b3a7" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 3f8d2133-b01d-402d-a731-79592810ca1c is DONE. 63.7 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4a613aa3-6323-4914-8e34-93323885d458 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0deb03be-725b-40b4-a7a1-1023b0477f35 is DONE. 40.1 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NameAgeGender
0Eloy Santiago-Aragón31Male
1Amanda Mata Abril20Non-binary
2Danilo Velázquez Salcedo58Male
3Leyre Alba España61Female
4Paulina Amores Pastor41Male
5Jorge Cuadrado Mena50Female
6Chucho Catalán36Non-binary
7Vidal Benavente Lerma38Male
8Clementina Álamo32Female
9Petrona Roselló-Valls61Male
10Luís Camilo Sastre Marin45Male
11Gil Baudelio Carbajo Ordóñez58Non-binary
12David del Donoso44Female
13Dolores Arnau Ros21Non-binary
14Febe de León46Non-binary
15Ariadna Almazán34Female
16Blas Serna Aguiló24Non-binary
17Paulino Barreda Almeida59Female
18Eligio Valcárcel Tormo35Non-binary
19Toño Amador Torres Portillo48Female
20Florencia del Bejarano65Non-binary
21Clímaco Andreu Gómez18Male
22Xiomara Dominguez Solana35Female
23Leire Castilla Borrego19Non-binary
24Angelita Garmendia Carpio21Non-binary
\n", - "

25 rows × 3 columns

\n", - "
[1000000 rows x 3 columns in total]" - ], - "text/plain": [ - " Name Age Gender\n", - "0 Eloy Santiago-Aragón 31 Male\n", - "1 Amanda Mata Abril 20 Non-binary\n", - "2 Danilo Velázquez Salcedo 58 Male\n", - "3 Leyre Alba España 61 Female\n", - "4 Paulina Amores Pastor 41 Male\n", - "5 Jorge Cuadrado Mena 50 Female\n", - "6 Chucho Catalán 36 Non-binary\n", - "7 Vidal Benavente Lerma 38 Male\n", - "8 Clementina Álamo 32 Female\n", - "9 Petrona Roselló-Valls 61 Male\n", - "10 Luís Camilo Sastre Marin 45 Male\n", - "11 Gil Baudelio Carbajo Ordóñez 58 Non-binary\n", - "12 David del Donoso 44 Female\n", - "13 Dolores Arnau Ros 21 Non-binary\n", - "14 Febe de León 46 Non-binary\n", - "15 Ariadna Almazán 34 Female\n", - "16 Blas Serna Aguiló 24 Non-binary\n", - "17 Paulino Barreda Almeida 59 Female\n", - "18 Eligio Valcárcel Tormo 35 Non-binary\n", - "19 Toño Amador Torres Portillo 48 Female\n", - "20 Florencia del Bejarano 65 Non-binary\n", - "21 Clímaco Andreu Gómez 18 Male\n", - "22 Xiomara Dominguez Solana 35 Female\n", - "23 Leire Castilla Borrego 19 Non-binary\n", - "24 Angelita Garmendia Carpio 21 Non-binary\n", - "...\n", - "\n", - "[1000000 rows x 3 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "sql = f\"\"\"\n", "WITH T0 AS ({df.sql}),\n", @@ -1126,6 +351,18 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index db51afd412..d6f5936dd4 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ 
b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -430,7 +430,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator()" + "model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-exp\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 254ac65358..fcce7d5d1e 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -1614,7 +1614,7 @@ "source": [ "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "q_a_model = GeminiTextGenerator()" + "q_a_model = GeminiTextGenerator(model_name=\"gemini-2.0-flash-exp\")" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb index 20d5b4161d..a15209aae4 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb @@ -61,14 +61,14 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "vFMjpPBo9aVv" + }, "source": [ "**Author:** Sudipto Guha (Google)\n", "\n", "**Last updated:** March 16th 2025" - ], - "metadata": { - "id": "vFMjpPBo9aVv" - } + ] }, { "cell_type": "markdown", @@ -136,6 +136,9 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "GqLjnm1hsKGU" + }, "source": [ "## Setup & initialization\n", "\n", @@ -144,10 +147,7 @@ "For [Vector embedding generation](https://cloud.google.com/bigquery/docs/generate-text-embedding#required_roles)\n", "\n", "For [Vector Index creation](https://cloud.google.com/bigquery/docs/vector-index#roles_and_permissions)" - ], - "metadata": { - "id": "GqLjnm1hsKGU" - } + ] }, { "cell_type": "markdown", @@ -198,17 +198,17 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "id": "b8bKCfIiooEV", "executionInfo": { + "elapsed": 2, "status": "ok", "timestamp": 1742191597773, 
- "user_tz": -480, - "elapsed": 2, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": -480 + }, + "id": "b8bKCfIiooEV" }, "outputs": [], "source": [ @@ -284,23 +284,23 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "j3lmnsh7ooEW", "executionInfo": { + "elapsed": 2, "status": "ok", "timestamp": 1742191608487, - "user_tz": -480, - "elapsed": 2, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "j3lmnsh7ooEW", "outputId": "eb68daf5-5558-487a-91d2-4b4f9e476da0" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "WARNING: google.colab.auth.authenticate_user() is not supported in Colab Enterprise.\n" ] @@ -342,17 +342,17 @@ "cell_type": "code", "execution_count": 33, "metadata": { - "id": "R7STCS8xB5d2", "executionInfo": { + "elapsed": 947, "status": "ok", "timestamp": 1742195413800, - "user_tz": -480, - "elapsed": 947, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": -480 + }, + "id": "R7STCS8xB5d2" }, "outputs": [], "source": [ @@ -385,33 +385,33 @@ }, { "cell_type": "markdown", - "source": [ - "Partial ordering mode allows BigQuery DataFrames to push down many more row and column filters. On large clustered and partitioned tables, this can greatly reduce the number of bytes scanned and computation slots used. This [blog post](https://medium.com/google-cloud/introducing-partial-ordering-mode-for-bigquery-dataframes-bigframes-ec35841d95c0) goes over it in more detail." - ], "metadata": { "id": "iOFF9hrvs5WE" - } + }, + "source": [ + "Partial ordering mode allows BigQuery DataFrames to push down many more row and column filters. On large clustered and partitioned tables, this can greatly reduce the number of bytes scanned and computation slots used. This [blog post](https://medium.com/google-cloud/introducing-partial-ordering-mode-for-bigquery-dataframes-bigframes-ec35841d95c0) goes over it in more detail." 
+ ] }, { "cell_type": "code", - "source": [ - "bf.options.bigquery.ordering_mode = \"partial\"" - ], + "execution_count": 4, "metadata": { - "id": "9Gil1Oaas7KA", "executionInfo": { + "elapsed": 2, "status": "ok", "timestamp": 1742191620533, - "user_tz": -480, - "elapsed": 2, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": -480 + }, + "id": "9Gil1Oaas7KA" }, - "execution_count": 4, - "outputs": [] + "outputs": [], + "source": [ + "bf.options.bigquery.ordering_mode = \"partial\"" + ] }, { "cell_type": "markdown", @@ -435,26 +435,26 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "id": "zDSwoBo1CU3G", "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { + "elapsed": 468, "status": "ok", "timestamp": 1742192516923, - "user_tz": -480, - "elapsed": 468, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "zDSwoBo1CU3G", "outputId": "83edbc2f-5a23-407b-8890-f968eb31be44" }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:3553: UserWarning: \u001b[93mReading cached table from 2025-03-17 06:07:09.526507+00:00 to avoid\n", "incompatibilies with previous reads of this table. To read the latest\n", @@ -472,35 +472,35 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "id": "tYDoaKgJChiq", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "executionInfo": { + "elapsed": 6697, "status": "ok", "timestamp": 1742192524632, - "user_tz": -480, - "elapsed": 6697, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "tYDoaKgJChiq", "outputId": "9174da29-a051-4a99-e38f-6a2b09cfe4e9" }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 6f15ad71-cc7b-49c1-90e9-274bea7afbb9 is DONE. 477.4 GB processed. 
Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -514,105 +514,33 @@ }, { "cell_type": "code", - "source": [ - "## take a look at the sample dataset\n", - "\n", - "publications.head(5)" - ], + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 556 }, - "id": "XmqdJInztzPl", "executionInfo": { + "elapsed": 6, "status": "ok", "timestamp": 1742191801044, - "user_tz": -480, - "elapsed": 6, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "XmqdJInztzPl", "outputId": "ae05f3a6-edeb-423a-c061-c416717e1ec5" }, - "execution_count": 11, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " publication_number title \\\n", - "0 AU-338190-S Compressor wheel \n", - "1 CN-100525651-C Method for processing egg products \n", - "2 TW-I725505-B Improved carbon molecular sieve adsorbent \n", - "3 EP-0248026-B1 A system for supplying strip to a processing line \n", - "4 MY-135762-A Method for producing acrylic acid \n", - "\n", - " title_translated abstract \\\n", - "0 False Newness and distinctiveness is claimed in the ... \n", - "1 False The invention discloses a processing method of... \n", - "2 False Disclosed herein are rapid cycle pressure swin... \n", - "3 False A system (10) for supplying strip material (S)... \n", - "4 False A PROCESS FOR THE FRACTIONAL CONDENSATION OF A... \n", - "\n", - " abstract_translated cpc \\\n", - "0 False [] \n", - "1 False [] \n", - "2 False [{'code': 'B01D2253/116', 'inventive': False, ... \n", - "3 False [{'code': 'B65H2701/37', 'inventive': False, '... \n", - "4 False [{'code': 'C07C51/50', 'inventive': True, 'fir... \n", - "\n", - " cpc_low \\\n", - "0 [] \n", - "1 [] \n", - "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", - "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", - "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... 
\n", - "\n", - " cpc_inventive_low \\\n", - "0 [] \n", - "1 [] \n", - "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", - "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", - "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", - "\n", - " top_terms \\\n", - "0 ['compressor wheel' 'newness' 'distinctiveness... \n", - "1 ['egg' 'processing method' 'egg body' 'pack' '... \n", - "2 ['swing adsorption' 'pressure swing' 'molecula... \n", - "3 ['strip material' 'assembly' 'coil' 'take' 'pr... \n", - "4 ['acrylic acid' 'producing acrylic' 'stabilize... \n", - "\n", - " similar \\\n", - "0 [{'publication_number': 'AU-338190-S', 'applic... \n", - "1 [{'publication_number': 'CN-101396133-B', 'app... \n", - "2 [{'publication_number': 'EP-1867379-B1', 'appl... \n", - "3 [{'publication_number': 'EP-0248026-B1', 'appl... \n", - "4 [{'publication_number': 'SG-157371-A1', 'appli... \n", - "\n", - " url country \\\n", - "0 https://patents.google.com/patent/AU338190S Australia \n", - "1 https://patents.google.com/patent/CN100525651C China \n", - "2 https://patents.google.com/patent/TWI725505B Taiwan \n", - "3 https://patents.google.com/patent/EP0248026B1 European Patent Office \n", - "4 https://patents.google.com/patent/MY135762A Malaysia \n", - "\n", - " publication_description cited_by \\\n", - "0 Design [] \n", - "1 Granted Patent [] \n", - "2 Granted Patent or patent of addition [] \n", - "3 Granted patent [] \n", - "4 Granted patent / Utility model [] \n", - "\n", - " embedding_v1 \n", - "0 [ 5.2067090e-02 -1.5462303e-01 -1.3415462e-01 ... \n", - "1 [-0.05154578 -0.00437102 0.01365495 -0.168424... \n", - "2 [ 0.0163008 -0.20972364 0.02052403 -0.003073... \n", - "3 [-0.04377723 0.04111805 -0.0929429 0.043924... \n", - "4 [ 0.10407669 0.01262973 -0.22623734 -0.171453... 
" - ], + "application/vnd.google.colaboratory.intrinsic+json": { + "repr_error": "Function 'unique' has no kernel matching input types (list not null>>)", + "type": "dataframe", + "variable_name": "publications" + }, "text/html": [ "\n", "
\n", @@ -955,15 +883,87 @@ "
\n", " \n" ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "publications", - "repr_error": "Function 'unique' has no kernel matching input types (list not null>>)" - } + "text/plain": [ + " publication_number title \\\n", + "0 AU-338190-S Compressor wheel \n", + "1 CN-100525651-C Method for processing egg products \n", + "2 TW-I725505-B Improved carbon molecular sieve adsorbent \n", + "3 EP-0248026-B1 A system for supplying strip to a processing line \n", + "4 MY-135762-A Method for producing acrylic acid \n", + "\n", + " title_translated abstract \\\n", + "0 False Newness and distinctiveness is claimed in the ... \n", + "1 False The invention discloses a processing method of... \n", + "2 False Disclosed herein are rapid cycle pressure swin... \n", + "3 False A system (10) for supplying strip material (S)... \n", + "4 False A PROCESS FOR THE FRACTIONAL CONDENSATION OF A... \n", + "\n", + " abstract_translated cpc \\\n", + "0 False [] \n", + "1 False [] \n", + "2 False [{'code': 'B01D2253/116', 'inventive': False, ... \n", + "3 False [{'code': 'B65H2701/37', 'inventive': False, '... \n", + "4 False [{'code': 'C07C51/50', 'inventive': True, 'fir... \n", + "\n", + " cpc_low \\\n", + "0 [] \n", + "1 [] \n", + "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", + "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", + "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", + "\n", + " cpc_inventive_low \\\n", + "0 [] \n", + "1 [] \n", + "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", + "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", + "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", + "\n", + " top_terms \\\n", + "0 ['compressor wheel' 'newness' 'distinctiveness... \n", + "1 ['egg' 'processing method' 'egg body' 'pack' '... \n", + "2 ['swing adsorption' 'pressure swing' 'molecula... \n", + "3 ['strip material' 'assembly' 'coil' 'take' 'pr... 
\n", + "4 ['acrylic acid' 'producing acrylic' 'stabilize... \n", + "\n", + " similar \\\n", + "0 [{'publication_number': 'AU-338190-S', 'applic... \n", + "1 [{'publication_number': 'CN-101396133-B', 'app... \n", + "2 [{'publication_number': 'EP-1867379-B1', 'appl... \n", + "3 [{'publication_number': 'EP-0248026-B1', 'appl... \n", + "4 [{'publication_number': 'SG-157371-A1', 'appli... \n", + "\n", + " url country \\\n", + "0 https://patents.google.com/patent/AU338190S Australia \n", + "1 https://patents.google.com/patent/CN100525651C China \n", + "2 https://patents.google.com/patent/TWI725505B Taiwan \n", + "3 https://patents.google.com/patent/EP0248026B1 European Patent Office \n", + "4 https://patents.google.com/patent/MY135762A Malaysia \n", + "\n", + " publication_description cited_by \\\n", + "0 Design [] \n", + "1 Granted Patent [] \n", + "2 Granted Patent or patent of addition [] \n", + "3 Granted patent [] \n", + "4 Granted patent / Utility model [] \n", + "\n", + " embedding_v1 \n", + "0 [ 5.2067090e-02 -1.5462303e-01 -1.3415462e-01 ... \n", + "1 [-0.05154578 -0.00437102 0.01365495 -0.168424... \n", + "2 [ 0.0163008 -0.20972364 0.02052403 -0.003073... \n", + "3 [-0.04377723 0.04111805 -0.0929429 0.043924... \n", + "4 [ 0.10407669 0.01262973 -0.22623734 -0.171453... 
" + ] }, + "execution_count": 11, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } + ], + "source": [ + "## take a look at the sample dataset\n", + "\n", + "publications.head(5)" ] }, { @@ -979,35 +979,35 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "id": "li38q8FzDDMu", "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "executionInfo": { + "elapsed": 4528, "status": "ok", "timestamp": 1742192047236, - "user_tz": -480, - "elapsed": 4528, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "li38q8FzDDMu", "outputId": "b8c1bd38-b484-4f71-bd38-927c8677d0c5" }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 127fb090-1c9e-4d7a-acdd-86f077a87b07 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -1018,64 +1018,53 @@ }, { "cell_type": "code", - "source": [ - "## rename abstract column to content as the desired column on which embedding will be generated\n", - "publications = publications[[\"publication_number\", \"title\", \"abstract\"]].rename(columns={'abstract': 'content'})\n", - "\n", - "## generate the embeddings\n", - "## takes ~2-3 mins to run\n", - "embedding = text_model.predict(publications)[[\"publication_number\", \"title\", \"content\", \"ml_generate_embedding_result\",\"ml_generate_embedding_status\"]]\n", - "\n", - "## filter out rows where the embedding generation failed. 
the embedding status value is empty if the embedding generation was successful\n", - "embedding = embedding[~embedding[\"ml_generate_embedding_status\"].isnull()]\n" - ], + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 139 }, - "id": "b5HHZob_u61B", "executionInfo": { + "elapsed": 126632, "status": "ok", "timestamp": 1742192656608, - "user_tz": -480, - "elapsed": 126632, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "b5HHZob_u61B", "outputId": "c9ecc5fd-5d11-4fd8-f59b-9dce4e12e371" }, - "execution_count": 19, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Load job b8079d70-7d99-4198-898f-2921915f305f is DONE. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 17338b11-420c-4d3d-bd55-0bba1247f705 is DONE. 8.9 MB processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mJSON column interpretation as a custom PyArrow extention in\n", "`db_dtypes` is a preview feature and subject to change.\u001b[0m\n", @@ -1083,59 +1072,67 @@ ] }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job ebf3eb36-3199-4551-ad07-5fa5abb200be is DONE. 20.0 kB processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 9e9c5aae-9045-4750-a34e-c98493369a90 is DONE. 20.0 kB processed. 
Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } + ], + "source": [ + "## rename abstract column to content as the desired column on which embedding will be generated\n", + "publications = publications[[\"publication_number\", \"title\", \"abstract\"]].rename(columns={'abstract': 'content'})\n", + "\n", + "## generate the embeddings\n", + "## takes ~2-3 mins to run\n", + "embedding = text_model.predict(publications)[[\"publication_number\", \"title\", \"content\", \"ml_generate_embedding_result\",\"ml_generate_embedding_status\"]]\n", + "\n", + "## filter out rows where the embedding generation failed. the embedding status value is empty if the embedding generation was successful\n", + "embedding = embedding[~embedding[\"ml_generate_embedding_status\"].isnull()]\n" ] }, { "cell_type": "code", - "source": [ - "embedding.head(5)" - ], + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 464 }, - "id": "OIT5FbqAwqG5", "executionInfo": { + "elapsed": 6715, "status": "ok", "timestamp": 1742192727525, - "user_tz": -480, - "elapsed": 6715, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": -480 }, + "id": "OIT5FbqAwqG5", "outputId": "d04c994a-a0c8-44b0-e897-d871036eeb1f" }, - "execution_count": 20, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:238: AmbiguousWindowWarning: \u001b[93mWindow ordering may be ambiguous, this can cause unstable results.\u001b[0m\n", " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n", @@ -1144,63 +1141,31 @@ ] }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job 1bc3517f-df67-456c-8d31-14a6432b8629 is DONE. 70.4 MB processed. 
Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "Query job ae92602b-0eab-437f-a02d-102a4defa99a is DONE. 31.3 kB processed. Open Job" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "execute_result", "data": { - "text/plain": [ - " publication_number title \\\n", - "5753 HN-1996000102-A NEW PESTICIDES \n", - "8115 AU-325874-S Baby sling \n", - "5415 AU-2016256863-A1 Microbial compositions and methods for denitri... \n", - "8886 FR-2368509-A1 NEW DEODORANTS OR FRESHENERS AND COMPOSITIONS ... \n", - "5661 US-2006051255-A1 Gas generator \n", - "\n", - " content \\\n", - "5753 THE PRESENT INVENTION REFERS TO \n", - "8115 Adjustable baby sling with velcro. \n", - "5415 The present invention provides compositions an... \n", - "8886 Polyanionic polyamide salts comprising a conca... \n", - "5661 A gas generator insulated by a vacuum-jacket v... \n", - "\n", - " ml_generate_embedding_result \\\n", - "5753 [-0.02709213 0.0366395 0.03931784 -0.003942... \n", - "8115 [ 6.44167811e-02 -2.01051459e-02 -3.39564607e-... \n", - "5415 [-5.90537786e-02 2.38401629e-03 7.22754598e-... \n", - "8886 [-3.44522446e-02 5.64815439e-02 -1.35829514e-... \n", - "5661 [-1.50892800e-02 6.56989636e-03 2.34969519e-... \n", - "\n", - " ml_generate_embedding_status \n", - "5753 \n", - "8115 \n", - "5415 \n", - "8886 \n", - "5661 \n", - "\n", - "[5 rows x 5 columns]" - ], "text/html": [ "
\n", "