From 3bd6ba93ca633d0202783b3dd5b0c8dc4460083b Mon Sep 17 00:00:00 2001 From: sahusiddharth <112792547+sahusiddharth@users.noreply.github.com> Date: Mon, 5 Feb 2024 20:50:46 +0530 Subject: [PATCH] =?UTF-8?q?feat:Add=20dimensions=20parameter=20to=20OpenAI?= =?UTF-8?q?=20Embedders=20to=20fully=20support=20th=E2=80=A6=20(#6841)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat:Add dimensions parameter to OpenAI Embedders to fully support the new models * fixed linting * changed != None to is not None --- .../embedders/openai_document_embedder.py | 9 ++++++++- .../components/embedders/openai_text_embedder.py | 14 +++++++++++--- ...ly-support-the-new-models-1393cc235e457733.yaml | 4 ++++ .../embedders/test_openai_document_embedder.py | 2 ++ .../embedders/test_openai_text_embedder.py | 4 +++- 5 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/add-dimensions-parameter-to-OpenAI-Embedders-to-fully-support-the-new-models-1393cc235e457733.yaml diff --git a/haystack/components/embedders/openai_document_embedder.py b/haystack/components/embedders/openai_document_embedder.py index e4d3ad13b..7f409178e 100644 --- a/haystack/components/embedders/openai_document_embedder.py +++ b/haystack/components/embedders/openai_document_embedder.py @@ -33,6 +33,7 @@ class OpenAIDocumentEmbedder: self, api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), model: str = "text-embedding-ada-002", + dimensions: Optional[int] = None, api_base_url: Optional[str] = None, organization: Optional[str] = None, prefix: str = "", @@ -46,6 +47,7 @@ class OpenAIDocumentEmbedder: Create a OpenAIDocumentEmbedder component. :param api_key: The OpenAI API key. :param model: The name of the model to use. + :param dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models. :param api_base_url: The OpenAI API Base url, defaults to None. For more details, see OpenAI [docs](https://platform.openai.com/docs/api-reference/audio). :param organization: The Organization ID, defaults to `None`. See [production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization). @@ -59,6 +61,7 @@ class OpenAIDocumentEmbedder: """ self.api_key = api_key self.model = model + self.dimensions = dimensions self.api_base_url = api_base_url self.organization = organization self.prefix = prefix @@ -84,6 +87,7 @@ class OpenAIDocumentEmbedder: return default_to_dict( self, model=self.model, + dimensions=self.dimensions, organization=self.organization, api_base_url=self.api_base_url, prefix=self.prefix, @@ -131,7 +135,10 @@ class OpenAIDocumentEmbedder: range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" ): batch = texts_to_embed[i : i + batch_size] - response = self.client.embeddings.create(model=self.model, input=batch) + if self.dimensions is not None: + response = self.client.embeddings.create(model=self.model, dimensions=self.dimensions, input=batch) + else: + response = self.client.embeddings.create(model=self.model, input=batch) embeddings = [el.embedding for el in response.data] all_embeddings.extend(embeddings) diff --git a/haystack/components/embedders/openai_text_embedder.py b/haystack/components/embedders/openai_text_embedder.py index 02b7d3ed4..5d30bccf4 100644 --- a/haystack/components/embedders/openai_text_embedder.py +++ b/haystack/components/embedders/openai_text_embedder.py @@ -1,8 +1,8 @@ -from typing import List, Optional, Dict, Any +from typing import Any, Dict, List, Optional from openai import OpenAI -from haystack import component, default_to_dict, default_from_dict +from haystack import component, default_from_dict, default_to_dict from haystack.utils import Secret, deserialize_secrets_inplace @@ -31,6 +31,7 @@ class OpenAITextEmbedder: self, api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), model: str = "text-embedding-ada-002", + dimensions: Optional[int] = None, api_base_url: Optional[str] = None, organization: Optional[str] = None, prefix: str = "", @@ -42,6 +43,7 @@ class OpenAITextEmbedder: :param api_key: The OpenAI API key. :param model: The name of the OpenAI model to use. For more details on the available models, see [OpenAI documentation](https://platform.openai.com/docs/guides/embeddings/embedding-models). + :param dimensions: The number of dimensions the resulting output embeddings should have. Only supported in text-embedding-3 and later models. :param organization: The Organization ID, defaults to `None`. See [production best practices](https://platform.openai.com/docs/guides/production-best-practices/setting-up-your-organization). :param api_base_url: The OpenAI API Base url, defaults to None. For more details, see OpenAI [docs](https://platform.openai.com/docs/api-reference/audio). @@ -49,6 +51,7 @@ class OpenAITextEmbedder: :param suffix: A string to add to the end of each text. """ self.model = model + self.dimensions = dimensions self.organization = organization self.prefix = prefix self.suffix = suffix @@ -69,6 +72,7 @@ class OpenAITextEmbedder: organization=self.organization, prefix=self.prefix, suffix=self.suffix, + dimensions=self.dimensions, api_key=self.api_key.to_dict(), ) @@ -92,7 +96,11 @@ class OpenAITextEmbedder: # replace newlines, which can negatively affect performance. text_to_embed = text_to_embed.replace("\n", " ") - response = self.client.embeddings.create(model=self.model, input=text_to_embed) + if self.dimensions is not None: + response = self.client.embeddings.create(model=self.model, dimensions=self.dimensions, input=text_to_embed) + else: + response = self.client.embeddings.create(model=self.model, input=text_to_embed) + meta = {"model": response.model, "usage": dict(response.usage)} return {"embedding": response.data[0].embedding, "meta": meta} diff --git a/releasenotes/notes/add-dimensions-parameter-to-OpenAI-Embedders-to-fully-support-the-new-models-1393cc235e457733.yaml b/releasenotes/notes/add-dimensions-parameter-to-OpenAI-Embedders-to-fully-support-the-new-models-1393cc235e457733.yaml new file mode 100644 index 000000000..777047af0 --- /dev/null +++ b/releasenotes/notes/add-dimensions-parameter-to-OpenAI-Embedders-to-fully-support-the-new-models-1393cc235e457733.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + add dimensions parameter to OpenAI Embedders to fully support new embedding models like text-embedding-3-small, text-embedding-3-large and upcoming ones diff --git a/test/components/embedders/test_openai_document_embedder.py b/test/components/embedders/test_openai_document_embedder.py index 833a485b1..3e0a55f44 100644 --- a/test/components/embedders/test_openai_document_embedder.py +++ b/test/components/embedders/test_openai_document_embedder.py @@ -71,6 +71,7 @@ class TestOpenAIDocumentEmbedder: "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"}, "api_base_url": None, "model": "text-embedding-ada-002", + "dimensions": None, "organization": None, "prefix": "", "suffix": "", @@ -101,6 +102,7 @@ class TestOpenAIDocumentEmbedder: "api_key": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, "api_base_url": None, "model": "model", + "dimensions": None, "organization": "my-org", "prefix": "prefix", "suffix": "suffix", diff --git a/test/components/embedders/test_openai_text_embedder.py b/test/components/embedders/test_openai_text_embedder.py index 5cd10e94a..3a1e747f6 100644 --- a/test/components/embedders/test_openai_text_embedder.py +++ b/test/components/embedders/test_openai_text_embedder.py @@ -1,9 +1,9 @@ import os -from haystack.utils.auth import Secret import pytest from haystack.components.embedders.openai_text_embedder import OpenAITextEmbedder +from haystack.utils.auth import Secret class TestOpenAITextEmbedder: @@ -44,6 +44,7 @@ class TestOpenAITextEmbedder: "type": "haystack.components.embedders.openai_text_embedder.OpenAITextEmbedder", "init_parameters": { "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"}, + "dimensions": None, "model": "text-embedding-ada-002", "organization": None, "prefix": "", @@ -66,6 +67,7 @@ class TestOpenAITextEmbedder: "init_parameters": { "api_key": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, "model": "model", + "dimensions": None, "organization": "fake-organization", "prefix": "prefix", "suffix": "suffix",