feature: add octoai for embeddings (#2538)

Thanks to Pedro at OctoAI we have a new embedding option.

The following PR adds support for the use of OctoAI embeddings.

Forked from the original OpenAI embeddings class. We removed the use of
the LangChain adaptor, and use OpenAI's SDK directly instead.

Also updated out-of-date example script.

Includes a new test file for OctoAI.

# Testing
Get a token from our platform at: https://www.octoai.cloud/
For testing one can do the following:
```
export OCTOAI_API_KEY=<your octo token>
python3 examples/embed/example_octoai.py
```

## Testing done
Validated running the above script from within a locally built container
via `make docker-start-dev`

---------

Co-authored-by: potter-potter <david.potter@gmail.com>
This commit is contained in:
David Potter 2024-02-10 07:27:06 -08:00 committed by GitHub
parent d11c70cf83
commit 1a706771fa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 176 additions and 6 deletions

View File

@ -1,9 +1,11 @@
## 0.12.5-dev1 ## 0.12.5-dev2
### Enhancements ### Enhancements
### Features ### Features
* **Add OctoAI embedder** Adds support for embeddings via OctoAI.
### Fixes ### Fixes
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors ** * **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **

View File

@ -43,10 +43,10 @@ To obtain an api key, visit: https://platform.openai.com/account/api-keys
import os import os
from unstructured.documents.elements import Text from unstructured.documents.elements import Text
from unstructured.embed.openai import OpenAIEmbeddingEncoder from unstructured.embed.openai import OpenAiEmbeddingConfig, OpenAIEmbeddingEncoder
# Initialize the encoder with OpenAI credentials # Initialize the encoder with OpenAI credentials
embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"]) embedding_encoder = OpenAIEmbeddingEncoder(config=OpenAiEmbeddingConfig(api_key=os.environ["OPENAI_API_KEY"]))
# Embed a list of Elements # Embed a list of Elements
elements = embedding_encoder.embed_documents( elements = embedding_encoder.embed_documents(
@ -130,3 +130,47 @@ To create an instance of the `BedrockEmbeddingEncoder`, AWS credentials and the
Dependencies: Dependencies:
This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized. This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized.
``OctoAIEmbeddingEncoder``
--------------------------
The ``OctoAIEmbeddingEncoder`` class connects to the OctoAI Text&Embedding API to obtain embeddings for pieces of text.
``embed_documents`` will receive a list of Elements, and return an updated list which
includes the ``embeddings`` attribute for each Element.
``embed_query`` will receive a query as a string, and return a list of floats which is the
embedding vector for the given query string.
``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any
embedding vector obtained via this class.
``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via
this class are unit vectors.
The following code block shows an example of how to use ``OctoAIEmbeddingEncoder``. You will
see the updated elements list (with the ``embeddings`` attribute included for each element),
the embedding vector for the query string, and some metadata properties about the embedding model.
You will need to set an environment variable named ``OCTOAI_API_KEY`` to be able to run this example.
To obtain an API key (an OctoAI access token), visit: https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token
.. code:: python
import os
from unstructured.documents.elements import Text
from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
# Build the encoder from the key stored in the OCTOAI_API_KEY environment variable.
embedding_encoder = OctoAIEmbeddingEncoder(
    config=OctoAiEmbeddingConfig(api_key=os.environ["OCTOAI_API_KEY"])
)

# Embed a list of Elements; each returned element carries an ``embeddings`` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a single query string.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Print each element's embedding with a plain loop rather than a comprehension,
# which would build a throwaway list just for the print side effect.
for element in elements:
    print(element.embeddings, element)
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())

View File

@ -1,9 +1,11 @@
import os import os
from unstructured.documents.elements import Text from unstructured.documents.elements import Text
from unstructured.embed.openai import OpenAIEmbeddingEncoder from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"]) embedding_encoder = OctoAIEmbeddingEncoder(
config=OctoAiEmbeddingConfig(api_key=os.environ["OCTOAI_API_KEY"])
)
elements = embedding_encoder.embed_documents( elements = embedding_encoder.embed_documents(
elements=[Text("This is sentence 1"), Text("This is sentence 2")], elements=[Text("This is sentence 1"), Text("This is sentence 2")],
) )

View File

@ -0,0 +1,18 @@
import os
from unstructured.documents.elements import Text
from unstructured.embed.openai import OpenAiEmbeddingConfig, OpenAIEmbeddingEncoder
# Build the encoder from the key stored in the OPENAI_API_KEY environment variable.
embedding_encoder = OpenAIEmbeddingEncoder(
    config=OpenAiEmbeddingConfig(api_key=os.environ["OPENAI_API_KEY"])
)

# Embed a list of Elements; each returned element carries an ``embeddings`` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a single query string.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Print each element's embedding with a plain loop rather than a comprehension,
# which would build a throwaway list just for the print side effect.
for element in elements:
    print(element.embeddings, element)
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())

View File

@ -0,0 +1,19 @@
from unstructured.documents.elements import Text
from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding elements attaches vectors and leaves ``to_dict`` usable."""
    # The encoder calls ``client.embeddings.create(...)`` and reads
    # ``response.data[0].embedding``, so that is the call path to stub.
    mock_client = mocker.MagicMock()
    mock_client.embeddings.create.return_value.data = [mocker.MagicMock(embedding=[1, 2])]

    # Mock create_client to return our mock_client
    mocker.patch.object(OctoAIEmbeddingEncoder, "create_client", return_value=mock_client)

    encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="api_key"))
    elements = encoder.embed_documents(
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
    )

    assert len(elements) == 2
    assert elements[0].to_dict()["text"] == "This is sentence 1"
    assert elements[1].to_dict()["text"] == "This is sentence 2"
    # One request per element, and the stubbed vector ends up on every element.
    assert mock_client.embeddings.create.call_count == 2
    assert [element.embeddings for element in elements] == [[1, 2], [1, 2]]

View File

@ -1 +1 @@
__version__ = "0.12.5-dev1" # pragma: no cover __version__ = "0.12.5-dev2" # pragma: no cover

View File

@ -1,9 +1,11 @@
from unstructured.embed.bedrock import BedrockEmbeddingEncoder from unstructured.embed.bedrock import BedrockEmbeddingEncoder
from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
from unstructured.embed.openai import OpenAIEmbeddingEncoder from unstructured.embed.openai import OpenAIEmbeddingEncoder
EMBEDDING_PROVIDER_TO_CLASS_MAP = { EMBEDDING_PROVIDER_TO_CLASS_MAP = {
"langchain-openai": OpenAIEmbeddingEncoder, "langchain-openai": OpenAIEmbeddingEncoder,
"langchain-huggingface": HuggingFaceEmbeddingEncoder, "langchain-huggingface": HuggingFaceEmbeddingEncoder,
"langchain-aws-bedrock": BedrockEmbeddingEncoder, "langchain-aws-bedrock": BedrockEmbeddingEncoder,
"octoai": OctoAIEmbeddingEncoder,
} }

View File

@ -0,0 +1,79 @@
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional
import numpy as np
from unstructured.documents.elements import (
Element,
)
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
from unstructured.ingest.error import EmbeddingEncoderConnectionError
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from openai import OpenAI
# Base URL handed to the OpenAI SDK client as ``base_url`` in ``create_client``.
OCTOAI_BASE_URL = "https://text.octoai.run/v1"
@dataclass
class OctoAiEmbeddingConfig(EmbeddingConfig):
    """Settings consumed by ``OctoAIEmbeddingEncoder``."""

    # Credential forwarded to the OpenAI SDK client as ``api_key``.
    api_key: str
    # Model identifier sent with every embedding request unless overridden.
    model_name: str = "thenlper/gte-large"
@dataclass
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that sends requests to OctoAI through the OpenAI SDK.

    The SDK client is created on first use. One sample embedding (for the
    query ``"Q"``) is computed on demand and cached; ``num_of_dimensions`` and
    ``is_unit_vector`` are both derived from it.
    """

    config: OctoAiEmbeddingConfig
    # Uses the OpenAI SDK
    _client: Optional["OpenAI"] = field(init=False, default=None)
    # Cached sample embedding; stays ``None`` until ``exemplary_embedding`` is read.
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "OpenAI":
        """Return the SDK client, creating it via ``create_client`` on first access."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        """Return the cached embedding of the sample query ``"Q"``.

        The first access issues one embedding request; later accesses reuse
        the stored vector.
        """
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.embed_query("Q")
        return self._exemplary_embedding

    def initialize(self):
        """Do nothing; the client is built lazily by the ``client`` property."""
        pass

    def num_of_dimensions(self):
        """Return ``np.shape`` of the sample embedding (a tuple)."""
        return np.shape(self.exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the sample embedding's Euclidean norm is close to 1.0."""
        return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0)

    def embed_query(self, query):
        """Return the embedding vector for ``query``.

        ``query`` is converted with ``str()`` before being sent, so callers may
        pass either a string or an object whose ``str()`` is the text to embed.
        """
        response = self.client.embeddings.create(input=str(query), model=self.config.model_name)
        return response.data[0].embedding

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed every element and return the same elements with vectors attached.

        One embedding request is issued per element.
        """
        embeddings = [self.embed_query(e) for e in elements]
        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
        return elements_with_embeddings

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
        """Set ``embeddings`` on each element in place and return ``elements``."""
        # Internal invariant: embed_documents produces one vector per element.
        assert len(elements) == len(embeddings)
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["openai", "tiktoken"],
        extras="embed-openai",
    )
    def create_client(self) -> "OpenAI":
        """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
        # Imported here so the module can be imported without ``openai`` installed.
        from openai import OpenAI

        return OpenAI(api_key=self.config.api_key, base_url=OCTOAI_BASE_URL)

View File

@ -206,6 +206,10 @@ class EmbeddingConfig(BaseConfig):
) )
return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs)) return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
elif self.provider == "octoai":
from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
else: else:
raise ValueError(f"{self.provider} not a recognized encoder") raise ValueError(f"{self.provider} not a recognized encoder")