mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	feature: add octoai for embeddings (#2538)
Thanks to Pedro at OctoAI we have a new embedding option. The following PR adds support for the use of OctoAI embeddings. Forked from the original OpenAI embeddings class. We removed the use of the LangChain adaptor, and use OpenAI's SDK directly instead. Also updated the out-of-date example script and added a new test file for OctoAI. # Testing Get a token from our platform at: https://www.octoai.cloud/ For testing one can do the following: ``` export OCTOAI_API_KEY=<your octo token> python3 examples/embed/example_octoai.py ``` ## Testing done Validated running the above script from within a locally built container via `make docker-start-dev` --------- Co-authored-by: potter-potter <david.potter@gmail.com>
This commit is contained in:
		
							parent
							
								
									d11c70cf83
								
							
						
					
					
						commit
						1a706771fa
					
				@ -1,9 +1,11 @@
 | 
			
		||||
## 0.12.5-dev1
 | 
			
		||||
## 0.12.5-dev2
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
### Features
 | 
			
		||||
 | 
			
		||||
* **Add OctoAI embedder** Adds support for embeddings via OctoAI.
 | 
			
		||||
 | 
			
		||||
### Fixes
 | 
			
		||||
 | 
			
		||||
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors**
 | 
			
		||||
 | 
			
		||||
@ -43,10 +43,10 @@ To obtain an api key, visit: https://platform.openai.com/account/api-keys
 | 
			
		||||
    import os
 | 
			
		||||
 | 
			
		||||
    from unstructured.documents.elements import Text
 | 
			
		||||
    from unstructured.embed.openai import OpenAIEmbeddingEncoder
 | 
			
		||||
    from unstructured.embed.openai import OpenAiEmbeddingConfig, OpenAIEmbeddingEncoder
 | 
			
		||||
 | 
			
		||||
    # Initialize the encoder with OpenAI credentials
 | 
			
		||||
    embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"])
 | 
			
		||||
    embedding_encoder = OpenAIEmbeddingEncoder(config=OpenAiEmbeddingConfig(api_key=os.environ["OPENAI_API_KEY"]))
 | 
			
		||||
 | 
			
		||||
    # Embed a list of Elements
 | 
			
		||||
    elements = embedding_encoder.embed_documents(
 | 
			
		||||
@ -130,3 +130,47 @@ To create an instance of the `BedrockEmbeddingEncoder`, AWS credentials and the
 | 
			
		||||
 | 
			
		||||
Dependencies:
 | 
			
		||||
This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized.
 | 
			
		||||
 | 
			
		||||
``OctoAIEmbeddingEncoder``
 | 
			
		||||
--------------------------
 | 
			
		||||
 | 
			
		||||
The ``OctoAIEmbeddingEncoder`` class connects to the OctoAI Text&Embedding API to obtain embeddings for pieces of text.
 | 
			
		||||
 | 
			
		||||
``embed_documents`` will receive a list of Elements, and return an updated list which
 | 
			
		||||
includes the ``embeddings`` attribute for each Element.
 | 
			
		||||
 | 
			
		||||
``embed_query`` will receive a query as a string, and return a list of floats which is the
 | 
			
		||||
embedding vector for the given query string.
 | 
			
		||||
 | 
			
		||||
``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any
 | 
			
		||||
embedding vector obtained via this class.
 | 
			
		||||
 | 
			
		||||
``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via
 | 
			
		||||
this class are unit vectors.
 | 
			
		||||
 | 
			
		||||
The following code block shows an example of how to use ``OctoAIEmbeddingEncoder``. You will
 | 
			
		||||
see the updated elements list (with the ``embeddings`` attribute included for each element),
 | 
			
		||||
the embedding vector for the query string, and some metadata properties about the embedding model.
 | 
			
		||||
You will need to set an environment variable named ``OCTOAI_API_KEY`` to be able to run this example.
 | 
			
		||||
To obtain an API key, visit: https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token
 | 
			
		||||
 | 
			
		||||
.. code:: python
 | 
			
		||||
 | 
			
		||||
    import os
 | 
			
		||||
 | 
			
		||||
    from unstructured.documents.elements import Text
 | 
			
		||||
    from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
 | 
			
		||||
 | 
			
		||||
    embedding_encoder = OctoAIEmbeddingEncoder(
 | 
			
		||||
        config=OctoAiEmbeddingConfig(api_key=os.environ["OCTOAI_API_KEY"])
 | 
			
		||||
    )
 | 
			
		||||
    elements = embedding_encoder.embed_documents(
 | 
			
		||||
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    query = "This is the query"
 | 
			
		||||
    query_embedding = embedding_encoder.embed_query(query=query)
 | 
			
		||||
 | 
			
		||||
    [print(e.embeddings, e) for e in elements]
 | 
			
		||||
    print(query_embedding, query)
 | 
			
		||||
    print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
 | 
			
		||||
@ -1,9 +1,11 @@
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
from unstructured.documents.elements import Text
 | 
			
		||||
from unstructured.embed.openai import OpenAIEmbeddingEncoder
 | 
			
		||||
from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
 | 
			
		||||
 | 
			
		||||
embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"])
 | 
			
		||||
embedding_encoder = OctoAIEmbeddingEncoder(
 | 
			
		||||
    config=OctoAiEmbeddingConfig(api_key=os.environ["OCTOAI_API_KEY"])
 | 
			
		||||
)
 | 
			
		||||
elements = embedding_encoder.embed_documents(
 | 
			
		||||
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
 | 
			
		||||
)
 | 
			
		||||
							
								
								
									
										18
									
								
								examples/embed/example_openai.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								examples/embed/example_openai.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,18 @@
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
from unstructured.documents.elements import Text
 | 
			
		||||
from unstructured.embed.openai import OpenAiEmbeddingConfig, OpenAIEmbeddingEncoder
 | 
			
		||||
 | 
			
		||||
# Build the encoder from the API key stored in the OPENAI_API_KEY environment variable.
# A missing variable raises KeyError here, before any request is made.
embedding_encoder = OpenAIEmbeddingEncoder(
    config=OpenAiEmbeddingConfig(api_key=os.environ["OPENAI_API_KEY"])
)

# Embed a list of Elements; the returned elements carry an `embeddings` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a plain query string.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Plain loop rather than a list comprehension: we only want the printing side effect,
# not a throwaway list of None values.
for element in elements:
    print(element.embeddings, element)
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
 | 
			
		||||
							
								
								
									
										19
									
								
								test_unstructured/embed/test_octoai.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								test_unstructured/embed/test_octoai.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,19 @@
 | 
			
		||||
from unstructured.documents.elements import Text
 | 
			
		||||
from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding elements must leave them serializable through ``to_dict``."""
    # The encoder obtains each vector via `client.embeddings.create(...)` and then reads
    # `response.data[0].embedding`, so that is the call chain that has to be stubbed.
    # Stubbing any other attribute leaves the embeddings as bare MagicMock objects.
    mock_response = mocker.MagicMock()
    mock_response.data = [mocker.MagicMock(embedding=[1, 2])]
    mock_client = mocker.MagicMock()
    mock_client.embeddings.create.return_value = mock_response

    # Mock create_client to return our mock_client, so no real SDK client is built.
    mocker.patch.object(OctoAIEmbeddingEncoder, "create_client", return_value=mock_client)

    encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="api_key"))
    elements = encoder.embed_documents(
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
    )

    assert len(elements) == 2
    # Every element received the stubbed vector...
    assert [element.embeddings for element in elements] == [[1, 2], [1, 2]]
    # ...and still round-trips through to_dict with its text intact.
    assert elements[0].to_dict()["text"] == "This is sentence 1"
    assert elements[1].to_dict()["text"] == "This is sentence 2"
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.12.5-dev1"  # pragma: no cover
 | 
			
		||||
__version__ = "0.12.5-dev2"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -1,9 +1,11 @@
 | 
			
		||||
from unstructured.embed.bedrock import BedrockEmbeddingEncoder
 | 
			
		||||
from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
 | 
			
		||||
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
 | 
			
		||||
from unstructured.embed.openai import OpenAIEmbeddingEncoder
 | 
			
		||||
 | 
			
		||||
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
 | 
			
		||||
    "langchain-openai": OpenAIEmbeddingEncoder,
 | 
			
		||||
    "langchain-huggingface": HuggingFaceEmbeddingEncoder,
 | 
			
		||||
    "langchain-aws-bedrock": BedrockEmbeddingEncoder,
 | 
			
		||||
    "octoai": OctoAIEmbeddingEncoder,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										79
									
								
								unstructured/embed/octoai.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								unstructured/embed/octoai.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,79 @@
 | 
			
		||||
from dataclasses import dataclass, field
 | 
			
		||||
from typing import TYPE_CHECKING, List, Optional
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
 | 
			
		||||
from unstructured.documents.elements import (
 | 
			
		||||
    Element,
 | 
			
		||||
)
 | 
			
		||||
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
 | 
			
		||||
from unstructured.ingest.error import EmbeddingEncoderConnectionError
 | 
			
		||||
from unstructured.utils import requires_dependencies
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
    from openai import OpenAI
 | 
			
		||||
 | 
			
		||||
OCTOAI_BASE_URL = "https://text.octoai.run/v1"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class OctoAiEmbeddingConfig(EmbeddingConfig):
    """Settings consumed by ``OctoAIEmbeddingEncoder``."""

    # Credential handed to the OpenAI SDK client as its `api_key`.
    api_key: str
    # Identifier sent as the `model` argument of each embeddings request.
    model_name: str = "thenlper/gte-large"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that talks to OctoAI through the OpenAI SDK.

    The SDK client is pointed at ``OCTOAI_BASE_URL`` and created lazily on first use.
    One probe embedding is cached and reused to answer the metadata queries
    (``num_of_dimensions`` and ``is_unit_vector``).
    """

    config: OctoAiEmbeddingConfig
    # Uses the OpenAI SDK. Both fields are caches populated on first access and are
    # excluded from the generated __init__.
    _client: Optional["OpenAI"] = field(init=False, default=None)
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "OpenAI":
        """Return the SDK client, creating it on first access."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        """Return a cached sample embedding (of the string ``"Q"``).

        The first access issues one embedding request; later accesses reuse the result.
        """
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.embed_query("Q")
        return self._exemplary_embedding

    def initialize(self):
        """No-op; the client is created lazily by the ``client`` property."""
        pass

    def num_of_dimensions(self):
        """Return the NumPy shape tuple of the sample embedding."""
        return np.shape(self.exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the sample embedding's Euclidean norm is close to 1.0."""
        return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0)

    def embed_query(self, query):
        """Request an embedding for ``query`` and return the first vector in the response.

        ``query`` is converted with ``str()`` before being sent, so non-string objects
        may be passed as well.
        """
        response = self.client.embeddings.create(input=str(query), model=self.config.model_name)
        return response.data[0].embedding

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed every element and attach the vector as its ``embeddings`` attribute.

        One request is issued per element. The elements are mutated in place and the
        same list object is returned.
        """
        # Elements are passed straight to embed_query, which stringifies them.
        # NOTE(review): presumably str(element) yields the element's text; confirm.
        embeddings = [self.embed_query(e) for e in elements]
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
        """Assign ``embeddings[i]`` to ``elements[i].embeddings`` and return ``elements``."""
        # Internal invariant: the caller builds exactly one embedding per element.
        assert len(elements) == len(embeddings)
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    # NOTE(review): the decorators below come from elsewhere in the project. They
    # presumably translate connection failures and check that the listed packages
    # are installed; confirm against their definitions.
    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["openai", "tiktoken"],
        extras="embed-openai",
    )
    def create_client(self) -> "OpenAI":
        """Create an OpenAI SDK client configured with the API key and the OctoAI base URL."""
        # Imported here so the module can be imported without the openai package.
        from openai import OpenAI

        return OpenAI(api_key=self.config.api_key, base_url=OCTOAI_BASE_URL)
 | 
			
		||||
@ -206,6 +206,10 @@ class EmbeddingConfig(BaseConfig):
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
 | 
			
		||||
        elif self.provider == "octoai":
 | 
			
		||||
            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
 | 
			
		||||
 | 
			
		||||
            return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
 | 
			
		||||
        else:
 | 
			
		||||
            raise ValueError(f"{self.provider} not a recognized encoder")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user