mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-04 23:52:23 +00:00

Closes https://github.com/Unstructured-IO/unstructured/issues/1319, closes https://github.com/Unstructured-IO/unstructured/issues/1372. This module: - implements EmbeddingEncoder classes, which track embedding-related data; - implements an embed_documents method, which receives a list of Elements, obtains embeddings for the text within the Elements, updates the Elements with an attribute named `embeddings`, and returns the updated Elements; - uses langchain to obtain the embeddings. ----- The PR additionally fixes a JSON de-serialization issue on the metadata fields. To test the changes, run `examples/embed/example.py`.
17 lines
567 B
Python
17 lines
567 B
Python
import os
|
|
|
|
from unstructured.documents.elements import Text
|
|
from unstructured.embed.openai import OpenAIEmbeddingEncoder
|
|
|
|
# Example: embed two short Text elements and a query string with OpenAI,
# then print the results.

# The API key is read from the environment; if OPENAI_API_KEY is unset this
# raises KeyError here, before any embedding request is attempted.
embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"])

# Per the module description, embed_documents returns the elements it was
# given, each updated with an `embeddings` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# A plain loop rather than a list comprehension: the prints are side effects,
# and a comprehension would build and discard a list of None values.
for element in elements:
    print(element.embeddings, element)
print(query_embedding, query)
# NOTE(review): presumably reports whether the embeddings are unit-normalized
# and their vector length -- confirm against the encoder implementation.
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
|