mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-05 08:02:48 +00:00

This PR: - Adds VertexAI embeddings as an embedding provider Testing - Tested with pinecone destination connector on [this](https://github.com/Unstructured-IO/unstructured/actions/runs/8429035114/job/23082700074?pr=2693) job run. --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
31 lines
1.4 KiB
Python
31 lines
1.4 KiB
Python
import os
|
|
|
|
from unstructured.documents.elements import Text
|
|
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
|
|
|
# To use Vertex AI PaLM you will need to:
|
|
# - either, pass the full json content of your GCP VertexAI application credentials to the
|
|
# VertexAIEmbeddingConfig as the api_key parameter. (This will create a file in the ``/tmp``
|
|
# directory with the content of the json, and set the GOOGLE_APPLICATION_CREDENTIALS environment
|
|
# variable to the **path** of the created file.)
|
|
# - or, you'll need to store the path to a manually created service account JSON file as the
|
|
# GOOGLE_APPLICATION_CREDENTIALS environment variable. (For more information:
|
|
# https://python.langchain.com/docs/integrations/text_embedding/google_vertex_ai_palm)
|
|
# - or, you'll need to have the credentials configured for your environment (gcloud,
|
|
# workload identity, etc…)
|
|
|
|
# Build the encoder, passing the value of the VERTEXAI_GCP_APP_CREDS_JSON_CONTENT
# environment variable as the api_key. Indexing os.environ directly means a missing
# variable raises KeyError here rather than failing later.
embedding_encoder = VertexAIEmbeddingEncoder(
    config=VertexAIEmbeddingConfig(api_key=os.environ["VERTEXAI_GCP_APP_CREDS_JSON_CONTENT"])
)

# Embed two sample Text elements; the returned elements are printed below together
# with their `embeddings` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a free-standing query string with the same encoder.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Plain loop rather than a list comprehension: printing is a side effect, so there
# is no reason to build and discard a list of None values.
for element in elements:
    print(element.embeddings, element)
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
|