mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-04 15:42:16 +00:00

### Description Leverage a pattern similar to the one used for connectors, where there is a nested config dataclass as a field, along with cached content for things like the client and the sample embedding for each. This required an update to the embeddings config in ingest, and I left a TODO there: the current approach breaks on other encoders such as Bedrock, because the parameters in that config don't map to all encoders. However, this keeps the existing functionality working. This update makes sure all variables associated with the dataclass exist when it is instantiated, rather than being added in the `__post_init__()` or `initialize()` methods, which allows other libraries such as pydantic to generate schemas from it correctly. It also now follows the connector pattern in that each class has a nested config class used to instantiate the client itself, as well as a field/property approach used to cache the client.
20 lines
868 B
Python
20 lines
868 B
Python
from unstructured.documents.elements import Text
|
|
from unstructured.embed.openai import OpenAiEmbeddingConfig, OpenAIEmbeddingEncoder
|
|
|
|
|
|
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Check that elements returned by ``embed_documents`` still serialize via ``to_dict()``.

    ``OpenAIEmbeddingEncoder.create_client`` is patched to return a mock whose
    ``embed_documents`` yields two placeholder values, and the encoder is then
    asked to embed two ``Text`` elements. The test passes when two elements come
    back and each one's ``to_dict()["text"]`` matches the sentence it was built
    from.
    """
    sentences = ["This is sentence 1", "This is sentence 2"]

    # Stand-in client: its embed_documents hands back one placeholder per sentence.
    fake_client = mocker.MagicMock()
    fake_client.embed_documents.return_value = [1, 2]

    # Patch create_client on the encoder class so it returns the stand-in client.
    mocker.patch.object(OpenAIEmbeddingEncoder, "create_client", return_value=fake_client)

    config = OpenAiEmbeddingConfig(api_key="api_key")
    encoder = OpenAIEmbeddingEncoder(config=config)
    embedded = encoder.embed_documents(elements=[Text(sentence) for sentence in sentences])

    assert len(embedded) == 2
    # Each returned element must round-trip its original text through to_dict().
    for element, expected_text in zip(embedded, sentences):
        assert element.to_dict()["text"] == expected_text
|