From 1a706771facd0adef754e2f87ec58479e42251e6 Mon Sep 17 00:00:00 2001 From: David Potter Date: Sat, 10 Feb 2024 07:27:06 -0800 Subject: [PATCH] feature: add octoai for embeddings (#2538) Thanks to Pedro at OctoAI we have a new embedding option. The following PR adds support for the use of OctoAI embeddings. Forked from the original OpenAI embeddings class. We removed the use of the LangChain adaptor, and use OpenAI's SDK directly instead. Also updated out-of-date example script. Including new test file for OctoAI. # Testing Get a token from our platform at: https://www.octoai.cloud/ For testing one can do the following: ``` export OCTOAI_TOKEN= python3 examples/embed/example_octoai.py ``` ## Testing done Validated running the above script from within a locally built container via `make docker-start-dev` --------- Co-authored-by: potter-potter --- CHANGELOG.md | 4 +- docs/source/core/embedding.rst | 48 ++++++++++- .../embed/{example.py => example_octoai.py} | 6 +- examples/embed/example_openai.py | 18 +++++ test_unstructured/embed/test_octoai.py | 19 +++++ unstructured/__version__.py | 2 +- unstructured/embed/__init__.py | 2 + unstructured/embed/octoai.py | 79 +++++++++++++++++++ unstructured/ingest/interfaces.py | 4 + 9 files changed, 176 insertions(+), 6 deletions(-) rename examples/embed/{example.py => example_octoai.py} (67%) create mode 100644 examples/embed/example_openai.py create mode 100644 test_unstructured/embed/test_octoai.py create mode 100644 unstructured/embed/octoai.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7647f3efb..6e352c0a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ -## 0.12.5-dev1 +## 0.12.5-dev2 ### Enhancements ### Features +* **Add OctoAI embedder** Adds support for embeddings via OctoAI. + ### Fixes * **Fix `check_connection` in opensearch, databricks, postgres, azure connectors ** diff --git a/docs/source/core/embedding.rst b/docs/source/core/embedding.rst index 399fa1f2f..ec117bd71 100644 --- a/docs/source/core/embedding.rst +++ b/docs/source/core/embedding.rst @@ -43,10 +43,10 @@ To obtain an api key, visit: https://platform.openai.com/account/api-keys import os from unstructured.documents.elements import Text - from unstructured.embed.openai import OpenAIEmbeddingEncoder + from unstructured.embed.openai import OpenAiEmbeddingConfig, OpenAIEmbeddingEncoder # Initialize the encoder with OpenAI credentials - embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"]) + embedding_encoder = OpenAIEmbeddingEncoder(config=OpenAiEmbeddingConfig(api_key=os.environ["OPENAI_API_KEY"])) # Embed a list of Elements elements = embedding_encoder.embed_documents( @@ -130,3 +130,47 @@ To create an instance of the `BedrockEmbeddingEncoder`, AWS credentials and the Dependencies: This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized. + +``OctoAIEmbeddingEncoder`` +-------------------------- + +The ``OctoAIEmbeddingEncoder`` class connects to the OctoAI Text&Embedding API to obtain embeddings for pieces of text. + +``embed_documents`` will receive a list of Elements, and return an updated list which +includes the ``embeddings`` attribute for each Element. + +``embed_query`` will receive a query as a string, and return a list of floats which is the +embedding vector for the given query string. 
+ +``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any +embedding vector obtained via this class. + +``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via +this class are unit vectors. + +The following code block shows an example of how to use ``OctoAIEmbeddingEncoder``. You will +see the updated elements list (with the ``embeddings`` attribute included for each element), +the embedding vector for the query string, and some metadata properties about the embedding model. +You will need to set an environment variable named ``OCTOAI_API_KEY`` to be able to run this example. +To obtain an api key, visit: https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token + +.. code:: python + + import os + + from unstructured.documents.elements import Text + from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder + + embedding_encoder = OctoAIEmbeddingEncoder( + config=OctoAiEmbeddingConfig(api_key=os.environ["OCTOAI_API_KEY"]) + ) + elements = embedding_encoder.embed_documents( + elements=[Text("This is sentence 1"), Text("This is sentence 2")], + ) + + query = "This is the query" + query_embedding = embedding_encoder.embed_query(query=query) + + [print(e.embeddings, e) for e in elements] + print(query_embedding, query) + print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions()) \ No newline at end of file diff --git a/examples/embed/example.py b/examples/embed/example_octoai.py similarity index 67% rename from examples/embed/example.py rename to examples/embed/example_octoai.py index f86ed5659..d780ac27b 100644 --- a/examples/embed/example.py +++ b/examples/embed/example_octoai.py @@ -1,9 +1,11 @@ import os from unstructured.documents.elements import Text -from unstructured.embed.openai import OpenAIEmbeddingEncoder +from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder -embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"]) +embedding_encoder = OctoAIEmbeddingEncoder( + config=OctoAiEmbeddingConfig(api_key=os.environ["OCTOAI_API_KEY"]) +) elements = embedding_encoder.embed_documents( elements=[Text("This is sentence 1"), Text("This is sentence 2")], ) diff --git a/examples/embed/example_openai.py b/examples/embed/example_openai.py new file mode 100644 index 000000000..374613a0e --- /dev/null +++ b/examples/embed/example_openai.py @@ -0,0 +1,18 @@ +import os + +from unstructured.documents.elements import Text +from unstructured.embed.openai import OpenAiEmbeddingConfig, OpenAIEmbeddingEncoder + +embedding_encoder = OpenAIEmbeddingEncoder( + config=OpenAiEmbeddingConfig(api_key=os.environ["OPENAI_API_KEY"]) +) +elements = embedding_encoder.embed_documents( + elements=[Text("This is sentence 1"), Text("This is sentence 2")], +) + +query = "This is the query" +query_embedding = embedding_encoder.embed_query(query=query) + +[print(e.embeddings, e) for e in elements] +print(query_embedding, query) +print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions()) diff --git a/test_unstructured/embed/test_octoai.py b/test_unstructured/embed/test_octoai.py new file mode 100644 index 000000000..df9b302e4 --- /dev/null +++ b/test_unstructured/embed/test_octoai.py @@ -0,0 +1,19 @@ +from unstructured.documents.elements import Text +from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder + + +def test_embed_documents_does_not_break_element_to_dict(mocker): + # Mocked client with 
the desired behavior for embed_documents + mock_client = mocker.MagicMock() + mock_client.embed_documents.return_value = [1, 2] + + # Mock create_client to return our mock_client + mocker.patch.object(OctoAIEmbeddingEncoder, "create_client", return_value=mock_client) + + encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="api_key")) + elements = encoder.embed_documents( + elements=[Text("This is sentence 1"), Text("This is sentence 2")], + ) + assert len(elements) == 2 + assert elements[0].to_dict()["text"] == "This is sentence 1" + assert elements[1].to_dict()["text"] == "This is sentence 2" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3e1af5b02..bafbd15c5 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.5-dev1" # pragma: no cover +__version__ = "0.12.5-dev2" # pragma: no cover diff --git a/unstructured/embed/__init__.py b/unstructured/embed/__init__.py index fc5160ffd..97adb74d3 100644 --- a/unstructured/embed/__init__.py +++ b/unstructured/embed/__init__.py @@ -1,9 +1,11 @@ from unstructured.embed.bedrock import BedrockEmbeddingEncoder from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder +from unstructured.embed.octoai import OctoAIEmbeddingEncoder from unstructured.embed.openai import OpenAIEmbeddingEncoder EMBEDDING_PROVIDER_TO_CLASS_MAP = { "langchain-openai": OpenAIEmbeddingEncoder, "langchain-huggingface": HuggingFaceEmbeddingEncoder, "langchain-aws-bedrock": BedrockEmbeddingEncoder, + "octoai": OctoAIEmbeddingEncoder, } diff --git a/unstructured/embed/octoai.py b/unstructured/embed/octoai.py new file mode 100644 index 000000000..119fa154d --- /dev/null +++ b/unstructured/embed/octoai.py @@ -0,0 +1,79 @@ +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, List, Optional + +import numpy as np + +from unstructured.documents.elements import ( + Element, +) +from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig +from unstructured.ingest.error import EmbeddingEncoderConnectionError +from unstructured.utils import requires_dependencies + +if TYPE_CHECKING: + from openai import OpenAI + +OCTOAI_BASE_URL = "https://text.octoai.run/v1" + + +@dataclass +class OctoAiEmbeddingConfig(EmbeddingConfig): + api_key: str + model_name: str = "thenlper/gte-large" + + +@dataclass +class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder): + config: OctoAiEmbeddingConfig + # Uses the OpenAI SDK + _client: Optional["OpenAI"] = field(init=False, default=None) + _exemplary_embedding: Optional[List[float]] = field(init=False, default=None) + + @property + def client(self) -> "OpenAI": + if self._client is None: + self._client = self.create_client() + return self._client + + @property + def exemplary_embedding(self) -> List[float]: + if self._exemplary_embedding is None: + self._exemplary_embedding = self.embed_query("Q") + return self._exemplary_embedding + + def initialize(self): + pass + + def num_of_dimensions(self): + return np.shape(self.exemplary_embedding) + + def is_unit_vector(self): + return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + + def embed_query(self, query): + response = self.client.embeddings.create(input=str(query), model=self.config.model_name) + return response.data[0].embedding + + def embed_documents(self, elements: List[Element]) -> List[Element]: + embeddings = [self.embed_query(e) for e in elements] + elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) + return 
elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element.embeddings = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements_w_embedding
+
+    @EmbeddingEncoderConnectionError.wrap
+    @requires_dependencies(
+        ["openai", "tiktoken"],
+        extras="embed-openai",
+    )
+    def create_client(self) -> "OpenAI":
+        """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
+        from openai import OpenAI
+
+        return OpenAI(api_key=self.config.api_key, base_url=OCTOAI_BASE_URL)
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
index eef8d5a55..bcc250de8 100644
--- a/unstructured/ingest/interfaces.py
+++ b/unstructured/ingest/interfaces.py
@@ -206,6 +206,10 @@ class EmbeddingConfig(BaseConfig):
             )
             return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
+        elif self.provider == "octoai":
+            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+
+            return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
         else:
             raise ValueError(f"{self.provider} not a recognized encoder")
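
For reference, the new encoder is the official `openai` Python SDK pointed at OctoAI's OpenAI-compatible embeddings endpoint rather than a LangChain adaptor. The snippet below is a minimal sketch of the same calls `create_client()` and `embed_query()` make in `unstructured/embed/octoai.py` above, reusing the base URL and default model from that file; it assumes `OCTOAI_API_KEY` is set, as in `examples/embed/example_octoai.py`.

```python
import os

# OctoAI's text embeddings endpoint is OpenAI-compatible, so the stock OpenAI
# SDK works once base_url points at OctoAI instead of api.openai.com.
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["OCTOAI_API_KEY"],
    base_url="https://text.octoai.run/v1",  # OCTOAI_BASE_URL in octoai.py
)

# "thenlper/gte-large" is the default model_name on OctoAiEmbeddingConfig.
response = client.embeddings.create(input="This is sentence 1", model="thenlper/gte-large")
print(response.data[0].embedding[:5])
```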