From 9e83480cdea94568207a5c80dc18dfc799fdfced Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 13:36:28 +0100 Subject: [PATCH 01/12] Make `numpy` and `pandas` optional dependencies --- openai/api_resources/embedding.py | 4 +- openai/datalib.py | 40 ++++++++++++++++++++ openai/embeddings_utils.py | 4 +- openai/tests/test_long_examples_validator.py | 9 ++++- openai/validators.py | 3 +- openai/wandb_logger.py | 5 +-- setup.py | 24 +++++++++--- 7 files changed, 74 insertions(+), 15 deletions(-) create mode 100644 openai/datalib.py diff --git a/openai/api_resources/embedding.py b/openai/api_resources/embedding.py index 85ede2c088..e87bca86ea 100644 --- a/openai/api_resources/embedding.py +++ b/openai/api_resources/embedding.py @@ -1,10 +1,9 @@ import base64 import time -import numpy as np +from openai.datalib import numpy as np, assert_has_numpy from openai import util -from openai.api_resources.abstract import DeletableAPIResource, ListableAPIResource from openai.api_resources.abstract.engine_api_resource import EngineAPIResource from openai.error import TryAgain @@ -40,6 +39,7 @@ def create(cls, *args, **kwargs): # If an engine isn't using this optimization, don't do anything if type(data["embedding"]) == str: + assert_has_numpy() data["embedding"] = np.frombuffer( base64.b64decode(data["embedding"]), dtype="float32" ).tolist() diff --git a/openai/datalib.py b/openai/datalib.py new file mode 100644 index 0000000000..71923e9fb6 --- /dev/null +++ b/openai/datalib.py @@ -0,0 +1,40 @@ +""" +This module helps make data libraries like `numpy` and `pandas` optional dependencies. + +The libraries add up to 100M+, which makes it challenging to deploy applications +using this library in environments with code size constraints, like AWS Lambda. + +This module serves as an import proxy and provides a few utilities for dealing with the optionality. 
+ +Since the basic use case of this library (talking to the OpenAI API) doesn’t generally require data libraries, +it’s safe to make them optional. The rare case when data libraries are needed are handled through assertions +with instructive error messages. + +See also `setup.py`. + +""" +try: + import numpy +except ImportError: + numpy = None + +try: + import pandas +except ImportError: + pandas = None + +HAS_NUMPY = bool(numpy) +HAS_PANDAS = bool(pandas) + +NUMPY_INSTRUCTIONS = "numpy is not installed: pip install openai[datalib]" +PANDAS_INSTRUCTIONS = "pandas is not installed: pip install openai[datalib]" + + +def assert_has_numpy(): + if not HAS_NUMPY: + raise Exception(NUMPY_INSTRUCTIONS) + + +def assert_has_pandas(): + if not HAS_NUMPY: + raise Exception(PANDAS_INSTRUCTIONS) diff --git a/openai/embeddings_utils.py b/openai/embeddings_utils.py index 47a04e6582..07cc88c6c9 100644 --- a/openai/embeddings_utils.py +++ b/openai/embeddings_utils.py @@ -2,8 +2,6 @@ from typing import List, Optional import matplotlib.pyplot as plt -import numpy as np -import pandas as pd import plotly.express as px from scipy import spatial from sklearn.decomposition import PCA @@ -12,6 +10,8 @@ from tenacity import retry, stop_after_attempt, wait_random_exponential import openai +from openai.datalib import numpy as np +from openai.datalib import pandas as pd @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6)) diff --git a/openai/tests/test_long_examples_validator.py b/openai/tests/test_long_examples_validator.py index 7f3e4c8cf1..4d8901abe6 100644 --- a/openai/tests/test_long_examples_validator.py +++ b/openai/tests/test_long_examples_validator.py @@ -2,9 +2,14 @@ import subprocess from tempfile import NamedTemporaryFile +import pytest + +from openai.datalib import HAS_PANDAS, HAS_NUMPY, NUMPY_INSTRUCTIONS, PANDAS_INSTRUCTIONS -def test_long_examples_validator() -> None: +@pytest.mark.skipif(not HAS_PANDAS, reason=PANDAS_INSTRUCTIONS) 
+@pytest.mark.skipif(not HAS_NUMPY, reason=NUMPY_INSTRUCTIONS) +def test_long_examples_validator() -> None: """ Ensures that long_examples_validator() handles previously applied recommendations, namely dropped duplicates, without resulting in a KeyError. @@ -45,4 +50,4 @@ def test_long_examples_validator() -> None: # validate get_long_indexes() applied during optional_fn() call in long_examples_validator() assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout - return prepared_data_cmd_output.stdout \ No newline at end of file + return prepared_data_cmd_output.stdout diff --git a/openai/validators.py b/openai/validators.py index 23ff525495..0329ed5c7d 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -2,7 +2,7 @@ import sys from typing import Any, Callable, NamedTuple, Optional -import pandas as pd +from openai.datalib import pandas as pd, assert_has_pandas class Remediation(NamedTuple): @@ -474,6 +474,7 @@ def read_any_format(fname, fields=["prompt", "completion"]): - for .xlsx it will read the first sheet - for .txt it will assume completions and split on newline """ + assert_has_pandas() remediation = None necessary_msg = None immediate_msg = None diff --git a/openai/wandb_logger.py b/openai/wandb_logger.py index 6dd7614ca2..ba650d1fe4 100644 --- a/openai/wandb_logger.py +++ b/openai/wandb_logger.py @@ -13,10 +13,9 @@ import re from pathlib import Path - import numpy as np - import pandas as pd - from openai import File, FineTune + from openai.datalib import numpy as np + from openai.datalib import pandas as pd class WandbLogger: diff --git a/setup.py b/setup.py index 0b6956ef0e..4de265f774 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,14 @@ with open(version_path, "rt") as f: exec(f.read(), version_contents) + +# See `openai/datalib.py`. 
+DATA_LIBRARIES = [ + "numpy", + "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool + "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy +] + setup( name="openai", description="Python client library for the OpenAI API", @@ -16,21 +24,27 @@ install_requires=[ "requests>=2.20", # to get the patch for CVE-2018-18074 "tqdm", # Needed for progress bars - "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool - "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy "openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format - "numpy", "typing_extensions", # Needed for type hints for mypy ], extras_require={ - "dev": ["black~=21.6b0", "pytest==6.*"], - "wandb": ["wandb"], + "dev": [ + "black~=21.6b0", + "pytest==6.*", + "pytest_mock", + ], + "datalib": DATA_LIBRARIES, + "wandb": [ + "wandb", + *DATA_LIBRARIES, + ], "embeddings": [ "scikit-learn>=1.0.2", # Needed for embedding utils, versions >= 1.1 require python 3.8 "tenacity>=8.0.1", "matplotlib", "sklearn", "plotly", + *DATA_LIBRARIES, ], }, python_requires=">=3.7.1", From acd8b9345cdc664ac622c8a646dc390a39e13c1e Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 13:39:06 +0100 Subject: [PATCH 02/12] Cleanup --- openai/api_resources/embedding.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openai/api_resources/embedding.py b/openai/api_resources/embedding.py index e87bca86ea..7de36e6421 100644 --- a/openai/api_resources/embedding.py +++ b/openai/api_resources/embedding.py @@ -1,10 +1,10 @@ import base64 import time -from openai.datalib import numpy as np, assert_has_numpy from openai import util from openai.api_resources.abstract.engine_api_resource import EngineAPIResource +from openai.datalib import numpy as np, assert_has_numpy from openai.error import TryAgain diff --git a/setup.py b/setup.py index 4de265f774..66c95beb6d 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,8 @@ exec(f.read(), 
version_contents) -# See `openai/datalib.py`. DATA_LIBRARIES = [ + # These libraries are optional because of their size. See `openai/datalib.py`. "numpy", "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy From 658a4cadceeaebbc9f1a7bcbda1e156442a5e72c Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 13:42:19 +0100 Subject: [PATCH 03/12] Cleanup --- openai/datalib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openai/datalib.py b/openai/datalib.py index 71923e9fb6..a8daa66345 100644 --- a/openai/datalib.py +++ b/openai/datalib.py @@ -6,8 +6,8 @@ This module serves as an import proxy and provides a few utilities for dealing with the optionality. -Since the basic use case of this library (talking to the OpenAI API) doesn’t generally require data libraries, -it’s safe to make them optional. The rare case when data libraries are needed are handled through assertions +Since the primary use case of this library (talking to the OpenAI API) doesn’t generally require data libraries, +it’s safe to make them optional. The rare case when data libraries are needed is handled through assertions with instructive error messages. See also `setup.py`. From 41fb5d29fc2c28b44af58c92e20caf21edb3c3db Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 13:50:12 +0100 Subject: [PATCH 04/12] Cleanup --- openai/datalib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openai/datalib.py b/openai/datalib.py index a8daa66345..643ff5090b 100644 --- a/openai/datalib.py +++ b/openai/datalib.py @@ -1,7 +1,7 @@ """ This module helps make data libraries like `numpy` and `pandas` optional dependencies. -The libraries add up to 100M+, which makes it challenging to deploy applications +The libraries add up to 130M+, which makes it challenging to deploy applications using this library in environments with code size constraints, like AWS Lambda. 
This module serves as an import proxy and provides a few utilities for dealing with the optionality. From 49941e4f55d011eefdc99d9fb213e33a596ec79a Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 14:04:46 +0100 Subject: [PATCH 05/12] Cleanup --- openai/datalib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openai/datalib.py b/openai/datalib.py index 643ff5090b..398d7401e5 100644 --- a/openai/datalib.py +++ b/openai/datalib.py @@ -1,7 +1,7 @@ """ This module helps make data libraries like `numpy` and `pandas` optional dependencies. -The libraries add up to 130M+, which makes it challenging to deploy applications +The libraries add up to 130MB+, which makes it challenging to deploy applications using this library in environments with code size constraints, like AWS Lambda. This module serves as an import proxy and provides a few utilities for dealing with the optionality. From 69a42c69644bbda15c678cb6eed84815f5e27265 Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 14:09:46 +0100 Subject: [PATCH 06/12] Cleanup --- openai/datalib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openai/datalib.py b/openai/datalib.py index 398d7401e5..d99e5ca02d 100644 --- a/openai/datalib.py +++ b/openai/datalib.py @@ -7,8 +7,8 @@ This module serves as an import proxy and provides a few utilities for dealing with the optionality. Since the primary use case of this library (talking to the OpenAI API) doesn’t generally require data libraries, -it’s safe to make them optional. The rare case when data libraries are needed is handled through assertions -with instructive error messages. +it’s safe to make them optional. The rare case when data libraries are needed in the client is handled through +assertions with instructive error messages. See also `setup.py`. 
From 8bd45b2f18bb1ac0d5383211363b9712c78b7ba9 Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 14:11:03 +0100 Subject: [PATCH 07/12] Cleanup --- openai/datalib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openai/datalib.py b/openai/datalib.py index d99e5ca02d..b1dd223a30 100644 --- a/openai/datalib.py +++ b/openai/datalib.py @@ -36,5 +36,5 @@ def assert_has_numpy(): def assert_has_pandas(): - if not HAS_NUMPY: + if not HAS_PANDAS: raise Exception(PANDAS_INSTRUCTIONS) From 184248c6b2cd966e6b8954d4feef5a1923b2c7b6 Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Sun, 18 Dec 2022 14:19:16 +0100 Subject: [PATCH 08/12] Move `openpyxl` to `datalib` extras --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 66c95beb6d..fa0f154faa 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ "numpy", "pandas>=1.2.3", # Needed for CLI fine-tuning data preparation tool "pandas-stubs>=1.1.0.11", # Needed for type hints for mypy + "openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format ] setup( @@ -24,7 +25,6 @@ install_requires=[ "requests>=2.20", # to get the patch for CVE-2018-18074 "tqdm", # Needed for progress bars - "openpyxl>=3.0.7", # Needed for CLI fine-tuning data preparation tool xlsx format "typing_extensions", # Needed for type hints for mypy ], extras_require={ From cbe944610349ef5726595189fea90bbe0224a2b8 Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Wed, 21 Dec 2022 23:21:01 +0100 Subject: [PATCH 09/12] Improve errors and instructions --- openai/datalib.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/openai/datalib.py b/openai/datalib.py index b1dd223a30..2781cfc4db 100644 --- a/openai/datalib.py +++ b/openai/datalib.py @@ -26,15 +26,31 @@ HAS_NUMPY = bool(numpy) HAS_PANDAS = bool(pandas) -NUMPY_INSTRUCTIONS = "numpy is not installed: pip install openai[datalib]" -PANDAS_INSTRUCTIONS = 
"pandas is not installed: pip install openai[datalib]" +INSTRUCTIONS = """ + +OpenAI error: + + missing `{library}` + +This feature requires additional dependencies: + + $ pip install openai[datalib] + +""" + +NUMPY_INSTRUCTIONS = INSTRUCTIONS.format(library="numpy") +PANDAS_INSTRUCTIONS = INSTRUCTIONS.format(library="pandas") + + +class MissingDependencyError(Exception): + pass def assert_has_numpy(): if not HAS_NUMPY: - raise Exception(NUMPY_INSTRUCTIONS) + raise MissingDependencyError(NUMPY_INSTRUCTIONS) def assert_has_pandas(): if not HAS_PANDAS: - raise Exception(PANDAS_INSTRUCTIONS) + raise MissingDependencyError(PANDAS_INSTRUCTIONS) From 054f9b4818ff4c93562a7a77c661c13cbb7d2470 Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Wed, 21 Dec 2022 23:21:36 +0100 Subject: [PATCH 10/12] =?UTF-8?q?Add=20=E2=80=9COptional=20dependencies?= =?UTF-8?q?=E2=80=9D=20to=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 1dad8c0a96..af5396cec4 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,27 @@ Install from source with: python setup.py install ``` +### Optional dependencies + +Data libraries including `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with: + +```sh +pip install openai[datalib] +```` + +Dependencies for [`openapi.embeddings_utils`](openai/embeddings_utils.py): + +```sh +pip install openai[embeddings] +``` + +Support for [Weights & Biases](https://wandb.me/openai-docs): + +``` +pip install openai[wandb] +``` + + ## Usage The library needs to be configured with your account's secret key which is available on the [website](https://beta.openai.com/account/api-keys). 
Either set it as the `OPENAI_API_KEY` environment variable before using the library: From 1ffae5dd4eb27e25467a9b0acb47408ddd2731ea Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Wed, 21 Dec 2022 23:34:16 +0100 Subject: [PATCH 11/12] Polish README.md --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index af5396cec4..88f6a262c3 100644 --- a/README.md +++ b/README.md @@ -27,24 +27,23 @@ python setup.py install ### Optional dependencies -Data libraries including `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with: - -```sh -pip install openai[datalib] -```` - -Dependencies for [`openapi.embeddings_utils`](openai/embeddings_utils.py): +Install dependencies for [`openai.embeddings_utils`](openai/embeddings_utils.py): ```sh pip install openai[embeddings] ``` -Support for [Weights & Biases](https://wandb.me/openai-docs): +Install support for [Weights & Biases](https://wandb.me/openai-docs): ``` pip install openai[wandb] ``` +Data libraries including `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API.
If you encounter a `MissingDependencyError`, install them with: + +```sh +pip install openai[datalib] +``` ## Usage From be992104381693bd378a99d44121f59897bee0b3 Mon Sep 17 00:00:00 2001 From: Jakub Roztocil Date: Wed, 21 Dec 2022 23:34:59 +0100 Subject: [PATCH 12/12] Polish README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 88f6a262c3..53656928c3 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Install support for [Weights & Biases](https://wandb.me/openai-docs): pip install openai[wandb] ``` -Data libraries including `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with: +Data libraries like `numpy` and `pandas` are not installed by default due to their size. They’re needed for some functionality of this library, but generally not for talking to the API. If you encounter a `MissingDependencyError`, install them with: ```sh pip install openai[datalib]