diff --git a/notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb b/notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb deleted file mode 100644 index a1bb1e9d89..0000000000 --- a/notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb +++ /dev/null @@ -1,1019 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "9A9NkTRTfo2I" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8fK_rdvvx1iZ" - }, - "source": [ - "## Overview\n", - "\n", - "## Objective\n", - "\n", - "This notebook shows how to conecct BigQuery dataset to Claude models on Vertex AI using BigQuery DataFrames.\n", - "\n", - "### Claude on Vertex AI\n", - "\n", - "Anthropic Claude models on Vertex AI offer fully managed and serverless models. To use a Claude model on Vertex AI, send a request directly to the Vertex AI API endpoint.\n", - "\n", - "For more information, see the [Use Claude](https://cloud.devsite.corp.google.com/vertex-ai/generative-ai/docs/third-party-models/use-claude) documentation.\n", - "\n", - "### BigQuery DataFrames\n", - "BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine. 
BigQuery DataFrames is an open-source package.\n", - "\n", - "For more information, see this documentation\n", - "https://cloud.google.com/bigquery/docs/reference/bigquery-dataframes\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nwYvaaW25jYS" - }, - "source": [ - "### Getting Started\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hVi8v2mxBkeG" - }, - "source": [ - "#### Authenticate your notebook environment (Colab only)\n", - "If you are running this notebook on Google Colab, uncomment and run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench)." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "OHfMDNI76_Pz" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gI3KlxQQ_F_T" - }, - "source": [ - "## Using Anthropic's Vertex SDK + BQ for *Python*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E0x3GO6M_O3_" - }, - "source": [ - "### Getting Started\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_CJrqUvqAfR7" - }, - "source": [ - "#### Install the latest bigframes package if bigframes version < 1.15.0\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "collapsed": true, - "executionInfo": { - "elapsed": 11539, - "status": "ok", - "timestamp": 1724257409246, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "fi_HLdat_Pce", - "outputId": "020149f0-9fe8-45de-f160-abe488c0bed2" - }, - "outputs": [], - "source": [ - "# !pip install bigframes --upgrade" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hUiAYUFbBCpR" - }, - "source": [ - "#### Restart current 
runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "jcqgcj_DBFgt" - }, - "outputs": [], - "source": [ - "# # Restart kernel after installs so that your environment can access the new packages\n", - "# import sys\n", - "\n", - "# if \"google.colab\" in sys.modules:\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "shZgRl6qbZYP" - }, - "source": [ - "#### Define Google Cloud project and region information" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "JZLqMJ6va9fc" - }, - "outputs": [], - "source": [ - "# Input your project id\n", - "PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "czcmJpKPBMVC" - }, - "source": [ - "#### Select Claude Model and Region Availability:\n", - "https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#anthropic_claude_quotas_and_supported_context_length" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "wIBkGcFkK0Ci" - }, - "outputs": [], - "source": [ - "REGION = \"us-east5\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F3UmCLerH0t0" - }, - "source": [ - "### Load raw sample data to a bigquery dataset\n", - "\n", - "Create a BigQuery Dataset and table. You can use the sample museum data in CSV from [here](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks/generative_ai/museum_art.csv).\n", - "\n", - "The dataset should be in the **same region** as your chosen claude model. 
Let's say you selected us-east5 for claude 'haiku', then load the sample data to a dataset in us-east5." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gijJ2vr5B5nV" - }, - "source": [ - "### Text generation for BQ Tables using Python BigFrames\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "collapsed": true, - "executionInfo": { - "elapsed": 756, - "status": "ok", - "timestamp": 1724260427446, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "cU3Gq7TqHFdi", - "outputId": "aa5ec159-a91b-4349-e56a-400e90935edc" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "object_number string[pyarrow]\n", - "is_highlight boolean\n", - "is_public_domain boolean\n", - "object_id Int64\n", - "department string[pyarrow]\n", - "object_name string[pyarrow]\n", - "title string[pyarrow]\n", - "culture string[pyarrow]\n", - "period string[pyarrow]\n", - "dynasty string[pyarrow]\n", - "reign string[pyarrow]\n", - "portfolio string[pyarrow]\n", - "artist_role string[pyarrow]\n", - "artist_prefix string[pyarrow]\n", - "artist_display_name string[pyarrow]\n", - "artist_display_bio string[pyarrow]\n", - "artist_suffix string[pyarrow]\n", - "artist_alpha_sort string[pyarrow]\n", - "artist_nationality string[pyarrow]\n", - "artist_begin_date string[pyarrow]\n", - "artist_end_date string[pyarrow]\n", - "object_date string[pyarrow]\n", - "object_begin_date Int64\n", - "object_end_date Int64\n", - "medium string[pyarrow]\n", - "dimensions string[pyarrow]\n", - "credit_line string[pyarrow]\n", - "geography_type string[pyarrow]\n", - "city string[pyarrow]\n", - "state string[pyarrow]\n", - "county string[pyarrow]\n", - "country string[pyarrow]\n", - "region string[pyarrow]\n", - "subregion string[pyarrow]\n", - "locale string[pyarrow]\n", - "locus string[pyarrow]\n", - "excavation string[pyarrow]\n", - 
"river string[pyarrow]\n", - "classification string[pyarrow]\n", - "rights_and_reproduction string[pyarrow]\n", - "link_resource string[pyarrow]\n", - "metadata_date timestamp[us, tz=UTC][pyarrow]\n", - "repository string[pyarrow]\n", - "dtype: object" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import bigframes\n", - "import bigframes.pandas as bpd\n", - "bigframes.options._bigquery_options.project = PROJECT_ID # replace to user project\n", - "bigframes.options._bigquery_options.location = REGION #choice a region which the claude model you choice allows\n", - "df = bpd.read_gbq(\"bigframes-dev.garrettwu_us_east5.museum_art\") # replace with your table\n", - "df.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 461 - }, - "executionInfo": { - "elapsed": 4568, - "status": "ok", - "timestamp": 1724271168583, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "exWNXEzLHHaU", - "outputId": "1b33b64c-c8bd-42e6-ecc3-0ea0b5e492be" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 1998408a-4e29-4381-9229-cf8585a47dbe is DONE. 7.7 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 817a5321-9852-45da-8b14-004affc20c38 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 36aa1b30-acb5-4188-8377-b9f544443db8 is DONE. 955 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
object_idtitle
0285844Addie Card, 12 years. Spinner in North Pownal ...
1437141Portrait of a Man
2670650[Snow Crystal]
3268450Newhaven Fisherman
4646996전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾...
5287958Bridge of Augustus at Nani
6435869Antoine Dominique Sauveur Aubert (born 1817), ...
755834<NA>
845087<NA>
956883<NA>
\n", - "

10 rows × 2 columns

\n", - "
[10 rows x 2 columns in total]" - ], - "text/plain": [ - " object_id title\n", - "0 285844 Addie Card, 12 years. Spinner in North Pownal ...\n", - "1 437141 Portrait of a Man\n", - "2 670650 [Snow Crystal]\n", - "3 268450 Newhaven Fisherman\n", - "4 646996 전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾...\n", - "5 287958 Bridge of Augustus at Nani\n", - "6 435869 Antoine Dominique Sauveur Aubert (born 1817), ...\n", - "7 55834 \n", - "8 45087 \n", - "9 56883 \n", - "\n", - "[10 rows x 2 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# @title query: select top 10 records from table and put into dataframe\n", - "\n", - "df = df[[\"object_id\", \"title\"]].head(10)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_UZNsP_WDlyr" - }, - "source": [ - "### Enable Claude model on Vertex AI and Create a BQ External Model Connection\n", - "\n", - "\n", - "* Step 1: Visit the Vertex AI Model Garden console and select the model tile for Claude model of your choice. Following this doc [link](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude). Click on the **“Enable”** button and follow the instructions.\n", - "\n", - "* Step 2: Create a BQ External Connection\n", - "Follow the same process like this one: [link](https://cloud.google.com/bigquery/docs/generate-text#create_a_connection). 
Pay attention to the **supported region** of Claude models and make your conenction follow the same region for example us-east5 for Claude 3.5.\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8Q4aff5m9QOn" - }, - "source": [ - "### Use BigQuery DataFrames ML package with Claude LLM \n", - "\n", - "In this example, we are using the Claude3TextGenerator class from BigQuery DataFrames to translate title of art piece to english.\n", - "\n", - "Documentation for the Claude3TextGenerator Class: https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm.Claude3TextGenerator" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 513 - }, - "executionInfo": { - "elapsed": 25662, - "status": "ok", - "timestamp": 1724271197922, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "1pdyI5KBTyTD", - "outputId": "8f1e976b-1fd0-49ba-e068-f480eafb1765" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 514f5afe-15e0-4474-9e09-fbf94f0fe8ca is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5d4df544-e8a4-42f3-8a94-5f7e79b23562 is DONE. 635 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 25288d94-b10c-4b39-a272-3969ccb19af3 is DONE. 14 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d5693878-1037-4798-8aa0-f568ec0be9e3 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 54080328-ba8b-4715-bf2b-3e5b7affa90b is DONE. 4.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ml_generate_text_llm_resultml_generate_text_statusprompt
0This text is already in English. It appears to...translate this into English: Addie Card, 12 ye...
1The phrase \"Portrait of a Man\" is already in E...translate this into English: Portrait of a Man
2The phrase \"[Snow Crystal]\" is already in Engl...translate this into English: [Snow Crystal]
3The phrase \"Newhaven Fisherman\" is already in ...translate this into English: Newhaven Fisherman
4Here's the English translation:\n", - "\n", - "\"Attributed t...translate this into English: 전(傳) 오원 장승업 (1843...
5I apologize, but I'm not sure which language \"...translate this into English: Bridge of Augustu...
6This title is already in English. It describes...translate this into English: Antoine Dominique...
7<NA><NA><NA>
8<NA><NA><NA>
9<NA><NA><NA>
\n", - "

10 rows × 3 columns

\n", - "
[10 rows x 3 columns in total]" - ], - "text/plain": [ - " ml_generate_text_llm_result ml_generate_text_status \\\n", - "0 This text is already in English. It appears to... \n", - "1 The phrase \"Portrait of a Man\" is already in E... \n", - "2 The phrase \"[Snow Crystal]\" is already in Engl... \n", - "3 The phrase \"Newhaven Fisherman\" is already in ... \n", - "4 Here's the English translation:\n", - "\n", - "\"Attributed t... \n", - "5 I apologize, but I'm not sure which language \"... \n", - "6 This title is already in English. It describes... \n", - "7 \n", - "8 \n", - "9 \n", - "\n", - " prompt \n", - "0 translate this into English: Addie Card, 12 ye... \n", - "1 translate this into English: Portrait of a Man \n", - "2 translate this into English: [Snow Crystal] \n", - "3 translate this into English: Newhaven Fisherman \n", - "4 translate this into English: 전(傳) 오원 장승업 (1843... \n", - "5 translate this into English: Bridge of Augustu... \n", - "6 translate this into English: Antoine Dominique... 
\n", - "7 \n", - "8 \n", - "9 \n", - "\n", - "[10 rows x 3 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml import llm\n", - "model = llm.Claude3TextGenerator(model_name=\"claude-3-5-sonnet\",\n", - " connection_name=\"bigframes-dev.us-east5.bigframes-rf-conn\" ) # replace with your connection\n", - "df[\"input_prompt\"] = \"translate this into English: \" + df[\"title\"]\n", - "result = model.predict(df[\"input_prompt\"])\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 461 - }, - "executionInfo": { - "elapsed": 5249, - "status": "ok", - "timestamp": 1724274172557, - "user": { - "displayName": "Annie Xu", - "userId": "11935526703047498014" - }, - "user_tz": 420 - }, - "id": "Ux1VI5qujHOB", - "outputId": "7b859943-5e7c-4cc0-d9c2-bb3d44682010" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 6b6eceaa-e713-493e-beac-481a3d777a5c is DONE. 4.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5c660da9-318c-424e-9412-43f09e44a8b3 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 82b61007-8370-4514-addb-258d7c48d66c is DONE. 4.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
object_idtitleml_generate_text_llm_resultprompt
0285844Addie Card, 12 years. Spinner in North Pownal ...This text is already in English. It appears to...translate this into English: Addie Card, 12 ye...
1437141Portrait of a ManThe phrase \"Portrait of a Man\" is already in E...translate this into English: Portrait of a Man
2670650[Snow Crystal]The phrase \"[Snow Crystal]\" is already in Engl...translate this into English: [Snow Crystal]
3268450Newhaven FishermanThe phrase \"Newhaven Fisherman\" is already in ...translate this into English: Newhaven Fisherman
4646996전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾...Here's the English translation:\n", - "\n", - "\"Attributed t...translate this into English: 전(傳) 오원 장승업 (1843...
5287958Bridge of Augustus at NaniI apologize, but I'm not sure which language \"...translate this into English: Bridge of Augustu...
6435869Antoine Dominique Sauveur Aubert (born 1817), ...This title is already in English. It describes...translate this into English: Antoine Dominique...
755834<NA><NA><NA>
845087<NA><NA><NA>
956883<NA><NA><NA>
\n", - "

10 rows × 4 columns

\n", - "
[10 rows x 4 columns in total]" - ], - "text/plain": [ - " object_id title \\\n", - "0 285844 Addie Card, 12 years. Spinner in North Pownal ... \n", - "1 437141 Portrait of a Man \n", - "2 670650 [Snow Crystal] \n", - "3 268450 Newhaven Fisherman \n", - "4 646996 전(傳) 오원 장승업 (1843–1897) 청동기와 화초가 있는 정물화 조선|傳 吾... \n", - "5 287958 Bridge of Augustus at Nani \n", - "6 435869 Antoine Dominique Sauveur Aubert (born 1817), ... \n", - "7 55834 \n", - "8 45087 \n", - "9 56883 \n", - "\n", - " ml_generate_text_llm_result \\\n", - "0 This text is already in English. It appears to... \n", - "1 The phrase \"Portrait of a Man\" is already in E... \n", - "2 The phrase \"[Snow Crystal]\" is already in Engl... \n", - "3 The phrase \"Newhaven Fisherman\" is already in ... \n", - "4 Here's the English translation:\n", - "\n", - "\"Attributed t... \n", - "5 I apologize, but I'm not sure which language \"... \n", - "6 This title is already in English. It describes... \n", - "7 \n", - "8 \n", - "9 \n", - "\n", - " prompt \n", - "0 translate this into English: Addie Card, 12 ye... \n", - "1 translate this into English: Portrait of a Man \n", - "2 translate this into English: [Snow Crystal] \n", - "3 translate this into English: Newhaven Fisherman \n", - "4 translate this into English: 전(傳) 오원 장승업 (1843... \n", - "5 translate this into English: Bridge of Augustu... \n", - "6 translate this into English: Antoine Dominique... \n", - "7 \n", - "8 \n", - "9 \n", - "\n", - "[10 rows x 4 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_df=df.drop(columns=[\"input_prompt\"]).join(result.drop(columns=\"ml_generate_text_status\"))\n", - "output_df" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "ej70vFMvelsg" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 8c3f1d21-9033-4224-b6f3-4f2414f4ed18 is DONE. 4.5 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'bigframes-dev.garrettwu_us_east5.museum_art_translate'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# prompt: load the dataframe output to another Bigquery table\n", - "\n", - "# @title Save results to BigQuery\n", - "\n", - "output_df.to_gbq(\"bigframes-dev.garrettwu_us_east5.museum_art_translate\", if_exists=\"replace\") # replace with your table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index 4f1329129e..b05a6f034f 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -1283,7 +1283,8 @@ "toc_visible": true }, "kernelspec": { - "display_name": "Python 3", + "display_name": "venv (3.10.14)", + "language": "python", "name": "python3" }, "language_info": { @@ -1296,7 +1297,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb b/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb deleted file mode 100644 index 1a9b568897..0000000000 --- 
a/notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb +++ /dev/null @@ -1,377 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BigFrames Gemini 2.0 Text Generation Simple Example" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: This feature is only available in bigframes >= 1.29.0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Import packages" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "from bigframes.ml import llm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create Gemini 2.0 experimental Model with model_name as \"gemini-2.0-flash-exp\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/ml/llm.py:803: PreviewWarning: Model gemini-2.0-flash-exp is subject to the \"Pre-GA Offerings Terms\" in the General Service Terms section of the\n", - " Service Specific 
Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available \"as is\"\n", - " and might have limited support. For more information, see the launch stage descriptions\n", - " (https://cloud.google.com/products#product-launch-stages).\n", - " warnings.warn(\n", - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/pandas/__init__.py:435: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " return global_session.get_global_session()\n" - ] - }, - { - "data": { - "text/html": [ - "Query job f673a2ea-023e-4771-84a2-fb81f808fa1b is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "model = llm.GeminiTextGenerator(model_name=\"gemini-2.0-flash-exp\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a simple DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 2276ea5b-2e08-4ed6-af34-49a7d165d145 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
prompt
0Tell me something about Gemini 2.0.
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " prompt\n", - "0 Tell me something about Gemini 2.0.\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\"prompt\": [\"Tell me something about Gemini 2.0.\"]})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Make predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 9ba21e96-6023-491e-8e83-f2e6fa7df0e7 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/__init__.py:109: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 933d45cc-4bc0-4bdf-b4b8-573da2d58be3 is DONE. 2 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 3dda9bc6-84b1-4f4a-8891-85d25d8848ce is DONE. 4.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ml_generate_text_llm_resultml_generate_text_rai_resultml_generate_text_statusprompt
0Alright, let's talk about Gemini 2.0! It's a b...<NA>Tell me something about Gemini 2.0.
\n", - "

1 rows × 4 columns

\n", - "
[1 rows x 4 columns in total]" - ], - "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 Alright, let's talk about Gemini 2.0! It's a b... \n", - "\n", - " ml_generate_text_rai_result ml_generate_text_status \\\n", - "0 \n", - "\n", - " prompt \n", - "0 Tell me something about Gemini 2.0. \n", - "\n", - "[1 rows x 4 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result = model.predict(df)\n", - "result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save the model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Copy job 8e68af62-e7ab-475b-99c9-b79e8ba3c40b is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/src/bigframes/bigframes/ml/llm.py:803: PreviewWarning: Model gemini-2.0-flash-exp is subject to the \"Pre-GA Offerings Terms\" in the General Service Terms section of the\n", - " Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available \"as is\"\n", - " and might have limited support. For more information, see the launch stage descriptions\n", - " (https://cloud.google.com/products#product-launch-stages).\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job cae7f929-d8cb-4819-a644-ac832cdc0912 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "GeminiTextGenerator(connection_name='bigframes-dev.us.bigframes-rf-connection',\n", - " model_name='gemini-2.0-flash-exp',\n", - " session=)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.to_gbq(\"bigframes-dev.garrettwu.gemini_2_flash\", replace=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/noxfile.py b/noxfile.py index 888f9fd765..671aae13d2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -707,11 +707,9 @@ def notebook(session: nox.Session): # bq_dataframes_llm_code_generation creates a bucket in the sample. "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. "notebooks/generative_ai/sentiment_analysis.ipynb", # Too slow - "notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb", # Gemini 2.0 backend hasn't ready in prod. "notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb", # Limited quota for vector index ddl statements on table. "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. # TODO(b/366290533): to protect BQML quota - "notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb", "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", # Needs BUCKET_URI. "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb", # Needs BUCKET_URI. "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", # Needs BUCKET_URI. 
diff --git a/tests/system/large/ml/test_llm.py b/tests/system/large/ml/test_llm.py index 1daaebb8cb..6e2695b1b5 100644 --- a/tests/system/large/ml/test_llm.py +++ b/tests/system/large/ml/test_llm.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Callable +from unittest import mock + import pandas as pd import pyarrow as pa import pytest -from bigframes.ml import llm +from bigframes.ml import core, llm import bigframes.pandas as bpd from bigframes.testing import utils @@ -24,7 +27,6 @@ @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -32,9 +34,7 @@ "gemini-2.5-flash-lite", ), ) -@pytest.mark.flaky( - retries=2 -) # usually create model shouldn't be flaky, but this one due to the limited quota of gemini-2.0-flash-exp. +@pytest.mark.flaky(retries=2) def test_create_load_gemini_text_generator_model( dataset_id, model_name, session, bq_connection ): @@ -56,7 +56,6 @@ def test_create_load_gemini_text_generator_model( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -80,7 +79,6 @@ def test_gemini_text_generator_predict_default_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -106,7 +104,6 @@ def test_gemini_text_generator_predict_with_params_success( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -134,7 +131,6 @@ def test_gemini_text_generator_multi_cols_predict_success( @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", "gemini-2.5-pro", @@ -231,3 +227,581 @@ def 
test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) "evaluation_status", ], ) + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), +) +def test_create_load_text_embedding_generator_model( + dataset_id, model_name, session, bq_connection +): + text_embedding_model = llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + assert text_embedding_model is not None + assert text_embedding_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = text_embedding_model.to_gbq( + f"{dataset_id}.temp_text_model", replace=True + ) + assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + assert reloaded_model.model_name == model_name + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), +) +@pytest.mark.flaky(retries=2) +def test_text_embedding_generator_predict_default_params_success( + llm_text_df, model_name, session, bq_connection +): + text_embedding_model = llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + df = text_embedding_model.predict(llm_text_df).to_pandas() + utils.check_pandas_df_schema_and_index( + df, columns=utils.ML_GENERATE_EMBEDDING_OUTPUT, index=3, col_exact=False + ) + assert len(df["ml_generate_embedding_result"][0]) == 768 + + +@pytest.mark.parametrize( + "model_name", + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), +) +@pytest.mark.flaky(retries=2) +def test_text_embedding_generator_multi_cols_predict_success( + llm_text_df: bpd.DataFrame, model_name, session, bq_connection +): + df = llm_text_df.assign(additional_col=1) + df = df.rename(columns={"prompt": "content"}) + text_embedding_model = 
llm.TextEmbeddingGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + pd_df = text_embedding_model.predict(df).to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=utils.ML_GENERATE_EMBEDDING_OUTPUT + ["additional_col"], + index=3, + col_exact=False, + ) + assert len(pd_df["ml_generate_embedding_result"][0]) == 768 + + +def test_create_load_multimodal_embedding_generator_model( + dataset_id, session, bq_connection +): + mm_embedding_model = llm.MultimodalEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + assert mm_embedding_model is not None + assert mm_embedding_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = mm_embedding_model.to_gbq( + f"{dataset_id}.temp_mm_model", replace=True + ) + assert f"{dataset_id}.temp_mm_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + + +# Overrides __eq__ function for comparing as mock.call parameter +class EqCmpAllDataFrame(bpd.DataFrame): + def __eq__(self, other): + return self.equals(other) + + +@pytest.mark.skip("b/436340035 test failed") +@pytest.mark.parametrize( + ( + "model_class", + "options", + ), + [ + ( + llm.GeminiTextGenerator, + { + "temperature": 0.9, + "max_output_tokens": 8192, + "top_p": 1.0, + "ground_with_google_search": False, + }, + ), + ( + llm.Claude3TextGenerator, + { + "max_output_tokens": 128, + "top_k": 40, + "top_p": 0.95, + }, + ), + ], +) +def test_text_generator_retry_success( + session, + model_class, + options, + bq_connection, +): + # Requests. 
+ df0 = EqCmpAllDataFrame( + { + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", "error"], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + df2 = EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error"], + "prompt": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ) + + mock_generate_text = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_text_tvf = core.BqmlModel.TvfDef( + mock_generate_text, "ml_generate_text_status" + ) + # Responses. Retry twice then all succeeded. + mock_generate_text.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["", "error", "error"], + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", ""], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_text_status": [""], + "prompt": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ), + ] + + text_generator_model = model_class(connection_name=bq_connection, session=session) + text_generator_model._bqml_model = mock_bqml_model + + with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): + # 3rd retry isn't triggered + result = text_generator_model.predict(df0, max_retries=3) + + mock_generate_text.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + mock.call(mock_bqml_model, df2, options), + ] + ) + 
pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_text_status": ["", "", ""], + "prompt": [ + "What is BigQuery?", + "What is BigQuery DataFrame?", + "What is BQML?", + ], + }, + index=[0, 2, 1], + ), + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.skip("b/436340035 test failed") +@pytest.mark.parametrize( + ( + "model_class", + "options", + ), + [ + ( + llm.GeminiTextGenerator, + { + "temperature": 0.9, + "max_output_tokens": 8192, + "top_p": 1.0, + "ground_with_google_search": False, + }, + ), + ( + llm.Claude3TextGenerator, + { + "max_output_tokens": 128, + "top_k": 40, + "top_p": 0.95, + }, + ), + ], +) +def test_text_generator_retry_no_progress(session, model_class, options, bq_connection): + # Requests. + df0 = EqCmpAllDataFrame( + { + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", "error"], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + + mock_generate_text = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_text_tvf = core.BqmlModel.TvfDef( + mock_generate_text, "ml_generate_text_status" + ) + # Responses. Retry once, no progress, just stop. 
+ mock_generate_text.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["", "error", "error"], + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_text_status": ["error", "error"], + "prompt": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + ] + + text_generator_model = model_class(connection_name=bq_connection, session=session) + text_generator_model._bqml_model = mock_bqml_model + + with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): + # No progress, only conduct retry once + result = text_generator_model.predict(df0, max_retries=3) + + mock_generate_text.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_text_status": ["", "error", "error"], + "prompt": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + ), + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.skip("b/436340035 test failed") +def test_text_embedding_generator_retry_success(session, bq_connection): + # Requests. 
+ df0 = EqCmpAllDataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", "error"], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + df2 = EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error"], + "content": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ) + + mock_generate_embedding = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_embedding_tvf = core.BqmlModel.TvfDef( + mock_generate_embedding, "ml_generate_embedding_status" + ) + + # Responses. Retry twice then all succeeded. + mock_generate_embedding.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["", "error", "error"], + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", ""], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": [""], + "content": [ + "What is BQML?", + ], + }, + index=[1], + session=session, + ), + ] + options: dict = {} + + text_embedding_model = llm.TextEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + text_embedding_model._bqml_model = mock_bqml_model + + with mock.patch.object( + core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf + ): + # 3rd retry isn't triggered + result = text_embedding_model.predict(df0, max_retries=3) + + mock_generate_embedding.assert_has_calls( + [ + 
mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + mock.call(mock_bqml_model, df2, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_embedding_status": ["", "", ""], + "content": [ + "What is BigQuery?", + "What is BigQuery DataFrame?", + "What is BQML?", + ], + }, + index=[0, 2, 1], + ), + check_dtype=False, + check_index_type=False, + ) + + +def test_text_embedding_generator_retry_no_progress(session, bq_connection): + # Requests. + df0 = EqCmpAllDataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ] + }, + index=[0, 1, 2], + session=session, + ) + df1 = EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", "error"], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ) + + mock_generate_embedding = mock.create_autospec( + Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] + ) + mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) + type(mock_bqml_model).session = mock.PropertyMock(return_value=session) + generate_embedding_tvf = core.BqmlModel.TvfDef( + mock_generate_embedding, "ml_generate_embedding_status" + ) + + # Responses. Retry once, no progress, just stop. 
+ mock_generate_embedding.side_effect = [ + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["", "error", "error"], + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + session=session, + ), + EqCmpAllDataFrame( + { + "ml_generate_embedding_status": ["error", "error"], + "content": [ + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[1, 2], + session=session, + ), + ] + options: dict = {} + + text_embedding_model = llm.TextEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + text_embedding_model._bqml_model = mock_bqml_model + + with mock.patch.object( + core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf + ): + # No progress, only conduct retry once + result = text_embedding_model.predict(df0, max_retries=3) + + mock_generate_embedding.assert_has_calls( + [ + mock.call(mock_bqml_model, df0, options), + mock.call(mock_bqml_model, df1, options), + ] + ) + pd.testing.assert_frame_equal( + result.to_pandas(), + pd.DataFrame( + { + "ml_generate_embedding_status": ["", "error", "error"], + "content": [ + "What is BigQuery?", + "What is BQML?", + "What is BigQuery DataFrame?", + ], + }, + index=[0, 1, 2], + ), + check_dtype=False, + check_index_type=False, + ) + + +# b/436340035 temp disable the test to unblock presubmit +@pytest.mark.parametrize( + "model_class", + [ + llm.TextEmbeddingGenerator, + llm.MultimodalEmbeddingGenerator, + llm.GeminiTextGenerator, + # llm.Claude3TextGenerator, + ], +) +def test_text_embedding_generator_no_default_model_warning(model_class): + message = "Since upgrading the default model can cause unintended breakages, the\ndefault model will be removed in BigFrames 3.0. Please supply an\nexplicit model to avoid this message."
+ with pytest.warns(FutureWarning, match=message): + model_class(model_name=None) diff --git a/tests/system/large/ml/test_multimodal_llm.py b/tests/system/large/ml/test_multimodal_llm.py index 03fdddf665..f94f0f1dee 100644 --- a/tests/system/large/ml/test_multimodal_llm.py +++ b/tests/system/large/ml/test_multimodal_llm.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd +import pyarrow as pa import pytest from bigframes.ml import llm @@ -22,7 +24,6 @@ @pytest.mark.parametrize( "model_name", ( - "gemini-2.0-flash-exp", "gemini-2.0-flash-001", "gemini-2.0-flash-lite-001", ), @@ -43,3 +44,63 @@ def test_gemini_text_generator_multimodal_input( index=2, col_exact=False, ) + + +@pytest.mark.flaky(retries=2) +def test_multimodal_embedding_generator_predict_default_params_success( + images_mm_df, session, bq_connection +): + text_embedding_model = llm.MultimodalEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + df = text_embedding_model.predict(images_mm_df).to_pandas() + utils.check_pandas_df_schema_and_index( + df, + columns=utils.ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT, + index=2, + col_exact=False, + ) + assert len(df["ml_generate_embedding_result"][0]) == 1408 + + +@pytest.mark.parametrize( + "model_name", + ("gemini-2.0-flash-001",), +) +@pytest.mark.flaky(retries=2) +def test_gemini_text_generator_multimodal_structured_output( + images_mm_df: bpd.DataFrame, model_name, session, bq_connection +): + gemini_text_generator_model = llm.GeminiTextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + output_schema = { + "bool_output": "bool", + "int_output": "int64", + "float_output": "float64", + "str_output": "string", + "array_output": "array", + "struct_output": "struct", + } + df = gemini_text_generator_model.predict( + images_mm_df, + prompt=["Describe", images_mm_df["blob_col"]], + output_schema=output_schema, + ) + 
assert df["bool_output"].dtype == pd.BooleanDtype() + assert df["int_output"].dtype == pd.Int64Dtype() + assert df["float_output"].dtype == pd.Float64Dtype() + assert df["str_output"].dtype == pd.StringDtype(storage="pyarrow") + assert df["array_output"].dtype == pd.ArrowDtype(pa.list_(pa.int64())) + assert df["struct_output"].dtype == pd.ArrowDtype( + pa.struct([("number", pa.int64())]) + ) + + pd_df = df.to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=list(output_schema.keys()) + + ["blob_col", "prompt", "full_response", "status"], + index=2, + col_exact=False, + ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py deleted file mode 100644 index d15c5d3160..0000000000 --- a/tests/system/small/ml/test_llm.py +++ /dev/null @@ -1,611 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Callable -from unittest import mock - -import pandas as pd -import pytest - -from bigframes import exceptions -from bigframes.ml import core, llm -import bigframes.pandas as bpd -from bigframes.testing import utils - - -@pytest.mark.parametrize( - "model_name", - ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), -) -def test_create_load_text_embedding_generator_model( - dataset_id, model_name, session, bq_connection -): - text_embedding_model = llm.TextEmbeddingGenerator( - model_name=model_name, connection_name=bq_connection, session=session - ) - assert text_embedding_model is not None - assert text_embedding_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = text_embedding_model.to_gbq( - f"{dataset_id}.temp_text_model", replace=True - ) - assert f"{dataset_id}.temp_text_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.connection_name == bq_connection - assert reloaded_model.model_name == model_name - - -@pytest.mark.parametrize( - "model_name", - ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), -) -@pytest.mark.flaky(retries=2) -def test_text_embedding_generator_predict_default_params_success( - llm_text_df, model_name, session, bq_connection -): - text_embedding_model = llm.TextEmbeddingGenerator( - model_name=model_name, connection_name=bq_connection, session=session - ) - df = text_embedding_model.predict(llm_text_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, columns=utils.ML_GENERATE_EMBEDDING_OUTPUT, index=3, col_exact=False - ) - assert len(df["ml_generate_embedding_result"][0]) == 768 - - -@pytest.mark.parametrize( - "model_name", - ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), -) -@pytest.mark.flaky(retries=2) -def test_text_embedding_generator_multi_cols_predict_success( - llm_text_df: bpd.DataFrame, model_name, session, bq_connection -): - df = 
llm_text_df.assign(additional_col=1) - df = df.rename(columns={"prompt": "content"}) - text_embedding_model = llm.TextEmbeddingGenerator( - model_name=model_name, connection_name=bq_connection, session=session - ) - pd_df = text_embedding_model.predict(df).to_pandas() - utils.check_pandas_df_schema_and_index( - pd_df, - columns=utils.ML_GENERATE_EMBEDDING_OUTPUT + ["additional_col"], - index=3, - col_exact=False, - ) - assert len(pd_df["ml_generate_embedding_result"][0]) == 768 - - -def test_create_load_multimodal_embedding_generator_model( - dataset_id, session, bq_connection -): - mm_embedding_model = llm.MultimodalEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - assert mm_embedding_model is not None - assert mm_embedding_model._bqml_model is not None - - # save, load to ensure configuration was kept - reloaded_model = mm_embedding_model.to_gbq( - f"{dataset_id}.temp_mm_model", replace=True - ) - assert f"{dataset_id}.temp_mm_model" == reloaded_model._bqml_model.model_name - assert reloaded_model.connection_name == bq_connection - - -# Overrides __eq__ function for comparing as mock.call parameter -class EqCmpAllDataFrame(bpd.DataFrame): - def __eq__(self, other): - return self.equals(other) - - -@pytest.mark.skip("b/436340035 test failed") -@pytest.mark.parametrize( - ( - "model_class", - "options", - ), - [ - ( - llm.GeminiTextGenerator, - { - "temperature": 0.9, - "max_output_tokens": 8192, - "top_p": 1.0, - "ground_with_google_search": False, - }, - ), - ( - llm.Claude3TextGenerator, - { - "max_output_tokens": 128, - "top_k": 40, - "top_p": 0.95, - }, - ), - ], -) -def test_text_generator_retry_success( - session, - model_class, - options, - bq_connection, -): - # Requests. 
- df0 = EqCmpAllDataFrame( - { - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", "error"], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - df2 = EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error"], - "prompt": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ) - - mock_generate_text = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_text_tvf = core.BqmlModel.TvfDef( - mock_generate_text, "ml_generate_text_status" - ) - # Responses. Retry twice then all succeeded. - mock_generate_text.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["", "error", "error"], - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", ""], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_text_status": [""], - "prompt": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ), - ] - - text_generator_model = model_class(connection_name=bq_connection, session=session) - text_generator_model._bqml_model = mock_bqml_model - - with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): - # 3rd retry isn't triggered - result = text_generator_model.predict(df0, max_retries=3) - - mock_generate_text.assert_has_calls( - [ - mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - mock.call(mock_bqml_model, df2, options), - ] - ) - 
pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_text_status": ["", "", ""], - "prompt": [ - "What is BigQuery?", - "What is BigQuery DataFrame?", - "What is BQML?", - ], - }, - index=[0, 2, 1], - ), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.skip("b/436340035 test failed") -@pytest.mark.parametrize( - ( - "model_class", - "options", - ), - [ - ( - llm.GeminiTextGenerator, - { - "temperature": 0.9, - "max_output_tokens": 8192, - "top_p": 1.0, - "ground_with_google_search": False, - }, - ), - ( - llm.Claude3TextGenerator, - { - "max_output_tokens": 128, - "top_k": 40, - "top_p": 0.95, - }, - ), - ], -) -def test_text_generator_retry_no_progress(session, model_class, options, bq_connection): - # Requests. - df0 = EqCmpAllDataFrame( - { - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", "error"], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - - mock_generate_text = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_text_tvf = core.BqmlModel.TvfDef( - mock_generate_text, "ml_generate_text_status" - ) - # Responses. Retry once, no progress, just stop. 
- mock_generate_text.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["", "error", "error"], - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_text_status": ["error", "error"], - "prompt": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - ] - - text_generator_model = model_class(connection_name=bq_connection, session=session) - text_generator_model._bqml_model = mock_bqml_model - - with mock.patch.object(core.BqmlModel, "generate_text_tvf", generate_text_tvf): - # No progress, only conduct retry once - result = text_generator_model.predict(df0, max_retries=3) - - mock_generate_text.assert_has_calls( - [ - mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_text_status": ["", "error", "error"], - "prompt": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - ), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.skip("b/436340035 test failed") -def test_text_embedding_generator_retry_success(session, bq_connection): - # Requests. 
- df0 = EqCmpAllDataFrame( - { - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", "error"], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - df2 = EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error"], - "content": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ) - - mock_generate_embedding = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_embedding_tvf = core.BqmlModel.TvfDef( - mock_generate_embedding, "ml_generate_embedding_status" - ) - - # Responses. Retry twice then all succeeded. - mock_generate_embedding.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["", "error", "error"], - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", ""], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": [""], - "content": [ - "What is BQML?", - ], - }, - index=[1], - session=session, - ), - ] - options: dict = {} - - text_embedding_model = llm.TextEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - text_embedding_model._bqml_model = mock_bqml_model - - with mock.patch.object( - core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf - ): - # 3rd retry isn't triggered - result = text_embedding_model.predict(df0, max_retries=3) - - mock_generate_embedding.assert_has_calls( - [ - 
mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - mock.call(mock_bqml_model, df2, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_embedding_status": ["", "", ""], - "content": [ - "What is BigQuery?", - "What is BigQuery DataFrame?", - "What is BQML?", - ], - }, - index=[0, 2, 1], - ), - check_dtype=False, - check_index_type=False, - ) - - -def test_text_embedding_generator_retry_no_progress(session, bq_connection): - # Requests. - df0 = EqCmpAllDataFrame( - { - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ] - }, - index=[0, 1, 2], - session=session, - ) - df1 = EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", "error"], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ) - - mock_generate_embedding = mock.create_autospec( - Callable[[core.BqmlModel, bpd.DataFrame, dict], bpd.DataFrame] - ) - mock_bqml_model = mock.create_autospec(spec=core.BqmlModel) - type(mock_bqml_model).session = mock.PropertyMock(return_value=session) - generate_embedding_tvf = core.BqmlModel.TvfDef( - mock_generate_embedding, "ml_generate_embedding_status" - ) - - # Responses. Retry once, no progress, just stop. 
- mock_generate_embedding.side_effect = [ - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["", "error", "error"], - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - session=session, - ), - EqCmpAllDataFrame( - { - "ml_generate_embedding_status": ["error", "error"], - "content": [ - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[1, 2], - session=session, - ), - ] - options: dict = {} - - text_embedding_model = llm.TextEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - text_embedding_model._bqml_model = mock_bqml_model - - with mock.patch.object( - core.BqmlModel, "generate_embedding_tvf", generate_embedding_tvf - ): - # No progress, only conduct retry once - result = text_embedding_model.predict(df0, max_retries=3) - - mock_generate_embedding.assert_has_calls( - [ - mock.call(mock_bqml_model, df0, options), - mock.call(mock_bqml_model, df1, options), - ] - ) - pd.testing.assert_frame_equal( - result.to_pandas(), - pd.DataFrame( - { - "ml_generate_embedding_status": ["", "error", "error"], - "content": [ - "What is BigQuery?", - "What is BQML?", - "What is BigQuery DataFrame?", - ], - }, - index=[0, 1, 2], - ), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "model_name", - ("gemini-2.0-flash-exp",), -) -def test_gemini_preview_model_warnings(model_name): - with pytest.warns(exceptions.PreviewWarning): - llm.GeminiTextGenerator(model_name=model_name) - - -# b/436340035 temp disable the test to unblock presumbit -@pytest.mark.parametrize( - "model_class", - [ - llm.TextEmbeddingGenerator, - llm.MultimodalEmbeddingGenerator, - llm.GeminiTextGenerator, - # llm.Claude3TextGenerator, - ], -) -def test_text_embedding_generator_no_default_model_warning(model_class): - message = "Since upgrading the default model can cause unintended breakages, the\ndefault model will be removed in BigFrames 3.0. 
Please supply an\nexplicit model to avoid this message." - with pytest.warns(FutureWarning, match=message): - model_class(model_name=None) diff --git a/tests/system/small/ml/test_multimodal_llm.py b/tests/system/small/ml/test_multimodal_llm.py deleted file mode 100644 index e29669afd3..0000000000 --- a/tests/system/small/ml/test_multimodal_llm.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pandas as pd -import pyarrow as pa -import pytest - -from bigframes.ml import llm -import bigframes.pandas as bpd -from bigframes.testing import utils - - -@pytest.mark.flaky(retries=2) -def test_multimodal_embedding_generator_predict_default_params_success( - images_mm_df, session, bq_connection -): - text_embedding_model = llm.MultimodalEmbeddingGenerator( - connection_name=bq_connection, session=session - ) - df = text_embedding_model.predict(images_mm_df).to_pandas() - utils.check_pandas_df_schema_and_index( - df, - columns=utils.ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT, - index=2, - col_exact=False, - ) - assert len(df["ml_generate_embedding_result"][0]) == 1408 - - -@pytest.mark.parametrize( - "model_name", - ( - "gemini-2.0-flash-exp", - "gemini-2.0-flash-001", - ), -) -@pytest.mark.flaky(retries=2) -def test_gemini_text_generator_multimodal_structured_output( - images_mm_df: bpd.DataFrame, model_name, session, bq_connection -): - gemini_text_generator_model = llm.GeminiTextGenerator( - 
model_name=model_name, connection_name=bq_connection, session=session - ) - output_schema = { - "bool_output": "bool", - "int_output": "int64", - "float_output": "float64", - "str_output": "string", - "array_output": "array", - "struct_output": "struct", - } - df = gemini_text_generator_model.predict( - images_mm_df, - prompt=["Describe", images_mm_df["blob_col"]], - output_schema=output_schema, - ) - assert df["bool_output"].dtype == pd.BooleanDtype() - assert df["int_output"].dtype == pd.Int64Dtype() - assert df["float_output"].dtype == pd.Float64Dtype() - assert df["str_output"].dtype == pd.StringDtype(storage="pyarrow") - assert df["array_output"].dtype == pd.ArrowDtype(pa.list_(pa.int64())) - assert df["struct_output"].dtype == pd.ArrowDtype( - pa.struct([("number", pa.int64())]) - ) - - pd_df = df.to_pandas() - utils.check_pandas_df_schema_and_index( - pd_df, - columns=list(output_schema.keys()) - + ["blob_col", "prompt", "full_response", "status"], - index=2, - col_exact=False, - ) diff --git a/tests/system/small/test_iceberg.py b/tests/system/small/test_iceberg.py deleted file mode 100644 index ea0acc6214..0000000000 --- a/tests/system/small/test_iceberg.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import pytest - -import bigframes -import bigframes.pandas as bpd - - -@pytest.fixture() -def fresh_global_session(): - bpd.reset_session() - yield None - bpd.close_session() - # Undoes side effect of using ths global session to read table - bpd.options.bigquery.location = None - - -def test_read_iceberg_table_w_location(): - session = bigframes.Session(bigframes.BigQueryOptions(location="us-central1")) - df = session.read_gbq( - "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab_2021" - ) - assert df.shape == (30904427, 20) - - -def test_read_iceberg_table_w_wrong_location(): - session = bigframes.Session(bigframes.BigQueryOptions(location="europe-west1")) - with pytest.raises(ValueError, match="Current session is in europe-west1"): - session.read_gbq( - "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab_2021" - ) - - -def test_read_iceberg_table_wo_location(fresh_global_session): - df = bpd.read_gbq( - "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab_2021" - ) - assert df.shape == (30904427, 20)