feat: add VoyageAI embeddings (#3069) (#3099)

Original PR was #3069. Merged in to a feature branch to fix dependency
and linting issues. Application code changes from the original PR were
already reviewed and approved.

------------
Original PR description:
Adding VoyageAI embeddings 
Voyage AI’s embedding models and rerankers are state-of-the-art in
retrieval accuracy.

---------

Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com>
Co-authored-by: Liuhong99 <39693953+Liuhong99@users.noreply.github.com>
This commit is contained in:
Matt Robinson 2024-05-24 17:48:35 -04:00 committed by GitHub
parent 32df4ee1c6
commit 6b400b46fe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
41 changed files with 20601 additions and 56 deletions

View File

@ -1,9 +1,10 @@
## 0.14.3-dev4
## 0.14.3-dev5
### Enhancements
* **Move `category` field from Text class to Element class.**
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A subpartitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization for the image.
* **Add VoyageAI embedder.** Adds VoyageAI embeddings to support embedding via Voyage AI.
### Features

View File

@ -0,0 +1,25 @@
import os

from unstructured.documents.elements import Text
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder

# To use Voyage AI you will need to pass
# Voyage AI API Key (obtained from https://dash.voyageai.com/)
# as the ``api_key`` parameter.
#
# The ``model_name`` parameter is mandatory, please check the available models
# at https://docs.voyageai.com/docs/embeddings
embedding_encoder = VoyageAIEmbeddingEncoder(
    config=VoyageAIEmbeddingConfig(api_key=os.environ["VOYAGE_API_KEY"], model_name="voyage-law-2")
)

# Embed a small batch of elements; each returned element carries its vector
# in the ``embeddings`` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Use a plain loop for the side-effecting prints -- the original list
# comprehension built a throwaway list of Nones.
for element in elements:
    print(element, element.embeddings)
print(query, query_embedding)
print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)

View File

@ -86,7 +86,7 @@ tabulate==0.9.0
# via -r ./base.in
tqdm==4.66.4
# via nltk
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -r ./base.in
# emoji

View File

@ -57,7 +57,10 @@ unstructured-client<=0.18.0
fsspec==2024.5.0
# python 3.12 support
# python 3.12 support
numpy>=1.26.0
wrapt>=1.14.0
# NOTE(robinson): for compatibility with voyage embeddings
langsmith==0.1.62

View File

@ -151,7 +151,7 @@ jsonschema-specifications==2023.12.1
# jsonschema
jupyter==1.0.0
# via -r ./dev.in
jupyter-client==8.6.1
jupyter-client==8.6.2
# via
# ipykernel
# jupyter-console
@ -185,7 +185,7 @@ jupyter-server==2.14.0
# notebook-shim
jupyter-server-terminals==0.5.3
# via jupyter-server
jupyterlab==4.2.0
jupyterlab==4.2.1
# via notebook
jupyterlab-pygments==0.3.0
# via nbconvert
@ -392,7 +392,7 @@ traitlets==5.14.3
# qtconsole
types-python-dateutil==2.9.0.20240316
# via arrow
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# -c ./test.txt

View File

@ -12,7 +12,7 @@ python-docx==1.1.2
# via
# -c ././deps/constraints.txt
# -r ./extra-docx.in
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# python-docx

View File

@ -14,7 +14,7 @@ python-docx==1.1.2
# via
# -c ././deps/constraints.txt
# -r ./extra-odt.in
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# python-docx

View File

@ -8,7 +8,7 @@ attrdict==2.0.1
# via unstructured-paddleocr
babel==2.15.0
# via flask-babel
bce-python-sdk==0.9.10
bce-python-sdk==0.9.11
# via visualdl
blinker==1.8.2
# via flask
@ -45,7 +45,7 @@ flask==3.0.3
# visualdl
flask-babel==4.0.0
# via visualdl
fonttools==4.51.0
fonttools==4.52.1
# via matplotlib
future==1.0.0
# via bce-python-sdk
@ -200,7 +200,7 @@ six==1.16.0
# imgaug
# python-dateutil
# visualdl
tifffile==2024.5.10
tifffile==2024.5.22
# via scikit-image
tqdm==4.66.4
# via

View File

@ -39,7 +39,7 @@ filelock==3.14.0
# transformers
flatbuffers==24.3.25
# via onnxruntime
fonttools==4.51.0
fonttools==4.52.1
# via matplotlib
fsspec==2024.5.0
# via
@ -118,7 +118,7 @@ numpy==1.26.4
# transformers
omegaconf==2.3.0
# via effdet
onnx==1.16.0
onnx==1.16.1
# via
# -r ./extra-pdf-image.in
# unstructured-inference
@ -278,7 +278,7 @@ tqdm==4.66.4
# transformers
transformers==4.41.1
# via unstructured-inference
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# huggingface-hub

View File

@ -102,7 +102,7 @@ tqdm==4.66.4
# transformers
transformers==4.41.1
# via -r ./huggingface.in
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# huggingface-hub

View File

@ -31,7 +31,7 @@ requests==2.32.2
# via
# -c ./ingest/../base.txt
# pyairtable
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pyairtable

View File

@ -34,7 +34,7 @@ six==1.16.0
# -c ./ingest/../base.txt
# azure-core
# isodate
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# azure-core

View File

@ -93,7 +93,7 @@ six==1.16.0
# -c ./ingest/../base.txt
# azure-core
# isodate
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# azure-core

View File

@ -198,7 +198,7 @@ typer==0.9.0
# via
# -r ./ingest/chroma.in
# chromadb
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# chromadb

View File

@ -15,7 +15,7 @@ charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
databricks-sdk==0.27.1
databricks-sdk==0.28.0
# via -r ./ingest/databricks-volumes.in
google-auth==2.29.0
# via databricks-sdk

View File

@ -11,7 +11,7 @@ certifi==2024.2.2
# elastic-transport
elastic-transport==8.13.0
# via elasticsearch
elasticsearch==8.13.1
elasticsearch==8.13.2
# via -r ./ingest/elasticsearch.in
urllib3==1.26.18
# via

View File

@ -37,7 +37,6 @@ charset-normalizer==3.3.2
dataclasses-json==0.6.6
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
frozenlist==1.4.1
# via
@ -56,9 +55,9 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.0
langchain==0.2.1
# via langchain-community
langchain-community==0.2.0
langchain-community==0.2.1
# via -r ./ingest/embed-aws-bedrock.in
langchain-core==0.2.1
# via
@ -67,8 +66,9 @@ langchain-core==0.2.1
# langchain-text-splitters
langchain-text-splitters==0.2.0
# via langchain
langsmith==0.1.61
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
@ -135,7 +135,7 @@ tenacity==8.3.0
# langchain
# langchain-community
# langchain-core
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pydantic

View File

@ -30,7 +30,6 @@ charset-normalizer==3.3.2
dataclasses-json==0.6.6
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
filelock==3.14.0
# via
@ -68,9 +67,9 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.0
langchain==0.2.1
# via langchain-community
langchain-community==0.2.0
langchain-community==0.2.1
# via -r ./ingest/embed-huggingface.in
langchain-core==0.2.1
# via
@ -79,8 +78,9 @@ langchain-core==0.2.1
# langchain-text-splitters
langchain-text-splitters==0.2.0
# via langchain
langsmith==0.1.61
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
@ -188,7 +188,7 @@ tqdm==4.66.4
# transformers
transformers==4.41.1
# via sentence-transformers
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# huggingface-hub

View File

@ -38,7 +38,7 @@ idna==3.7
# anyio
# httpx
# requests
openai==1.30.1
openai==1.30.3
# via -r ./ingest/embed-octoai.in
pydantic==2.7.1
# via openai
@ -63,7 +63,7 @@ tqdm==4.66.4
# via
# -c ./ingest/../base.txt
# openai
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# openai

View File

@ -37,7 +37,6 @@ charset-normalizer==3.3.2
dataclasses-json==0.6.6
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
distro==1.9.0
# via openai
@ -64,9 +63,9 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.0
langchain==0.2.1
# via langchain-community
langchain-community==0.2.0
langchain-community==0.2.1
# via -r ./ingest/embed-openai.in
langchain-core==0.2.1
# via
@ -75,8 +74,9 @@ langchain-core==0.2.1
# langchain-text-splitters
langchain-text-splitters==0.2.0
# via langchain
langsmith==0.1.61
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
@ -98,7 +98,7 @@ numpy==1.26.4
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
openai==1.30.1
openai==1.30.3
# via -r ./ingest/embed-openai.in
orjson==3.10.3
# via langsmith
@ -152,7 +152,7 @@ tqdm==4.66.4
# via
# -c ./ingest/../base.txt
# openai
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# openai

View File

@ -32,7 +32,6 @@ charset-normalizer==3.3.2
dataclasses-json==0.6.6
# via
# -c ./ingest/../base.txt
# langchain
# langchain-community
docstring-parser==0.16
# via google-cloud-aiplatform
@ -101,11 +100,11 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.0
langchain==0.2.1
# via
# -r ./ingest/embed-vertexai.in
# langchain-community
langchain-community==0.2.0
langchain-community==0.2.1
# via -r ./ingest/embed-vertexai.in
langchain-core==0.2.1
# via
@ -117,8 +116,9 @@ langchain-google-vertexai==1.0.4
# via -r ./ingest/embed-vertexai.in
langchain-text-splitters==0.2.0
# via langchain
langsmith==0.1.61
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-community
# langchain-core
@ -215,7 +215,7 @@ tenacity==8.3.0
# langchain
# langchain-community
# langchain-core
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pydantic

View File

@ -0,0 +1,4 @@
-c ../deps/constraints.txt
-c ../base.txt
langchain
langchain-voyageai

View File

@ -0,0 +1,116 @@
#
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile ./ingest/embed-voyageai.in
#
aiohttp==3.9.5
# via
# langchain
# voyageai
aiolimiter==1.1.0
# via voyageai
aiosignal==1.3.1
# via aiohttp
annotated-types==0.7.0
# via pydantic
async-timeout==4.0.3
# via
# aiohttp
# langchain
attrs==23.2.0
# via aiohttp
certifi==2024.2.2
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
frozenlist==1.4.1
# via
# aiohttp
# aiosignal
idna==3.7
# via
# -c ./ingest/../base.txt
# requests
# yarl
jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.1
# via -r ./ingest/embed-voyageai.in
langchain-core==0.2.1
# via
# langchain
# langchain-text-splitters
# langchain-voyageai
langchain-text-splitters==0.2.0
# via langchain
langchain-voyageai==0.1.1
# via -r ./ingest/embed-voyageai.in
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-core
multidict==6.0.5
# via
# aiohttp
# yarl
numpy==1.26.4
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# langchain
# voyageai
orjson==3.10.3
# via langsmith
packaging==23.2
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# langchain-core
pydantic==2.7.1
# via
# langchain
# langchain-core
# langsmith
pydantic-core==2.18.2
# via pydantic
pyyaml==6.0.1
# via
# langchain
# langchain-core
requests==2.32.2
# via
# -c ./ingest/../base.txt
# langchain
# langsmith
# voyageai
sqlalchemy==2.0.30
# via langchain
tenacity==8.3.0
# via
# langchain
# langchain-core
# voyageai
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pydantic
# pydantic-core
# sqlalchemy
urllib3==1.26.18
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
voyageai==0.2.2
# via langchain-voyageai
yarl==1.9.4
# via aiohttp

View File

@ -37,7 +37,7 @@ requests==2.32.2
# via
# -c ./ingest/../base.txt
# pygithub
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pygithub

View File

@ -17,7 +17,7 @@ charset-normalizer==3.3.2
# requests
google-api-core==2.19.0
# via google-api-python-client
google-api-python-client==2.129.0
google-api-python-client==2.130.0
# via -r ./ingest/google-drive.in
google-auth==2.29.0
# via

View File

@ -15,7 +15,7 @@ tqdm==4.66.4
# via
# -c ./ingest/../base.txt
# pinecone-client
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pinecone-client

View File

@ -62,7 +62,7 @@ sniffio==1.3.1
# via
# anyio
# httpx
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pydantic

View File

@ -51,7 +51,7 @@ six==1.16.0
# via
# -c ./ingest/../base.txt
# python-dateutil
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# aioitertools

View File

@ -58,7 +58,7 @@ six==1.16.0
# via
# -c ./ingest/../base.txt
# isodate
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# simple-salesforce

View File

@ -69,7 +69,7 @@ sniffio==1.3.1
# via
# anyio
# httpx
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pydantic

View File

@ -130,7 +130,7 @@ rpds-py==0.18.1
# via
# jsonschema
# referencing
ruff==0.4.4
ruff==0.4.5
# via -r ./test.in
six==1.16.0
# via
@ -153,7 +153,7 @@ types-tabulate==0.9.0.20240106
# via -r ./test.in
types-urllib3==1.26.25.14
# via types-requests
typing-extensions==4.11.0
typing-extensions==4.12.0
# via
# -c ./base.txt
# black

View File

@ -12,5 +12,3 @@ pushd ./requirements || exit
make clean
make all
popd || exit
cp requirements/build.txt docs/requirements.txt

View File

@ -171,6 +171,7 @@ setup(
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
"embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),
"openai": load_requirements("requirements/ingest/embed-openai.in"),
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),

View File

@ -0,0 +1,21 @@
from unstructured.documents.elements import Text
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding elements must leave them serializable via ``.to_dict()``."""
    # Stub the Voyage AI client so no network call is made; the fake client
    # returns one dummy embedding per input element.
    fake_client = mocker.MagicMock()
    fake_client.embed_documents.return_value = [1, 2]
    mocker.patch.object(VoyageAIEmbeddingEncoder, "create_client", return_value=fake_client)

    encoder = VoyageAIEmbeddingEncoder(
        config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
    )
    embedded = encoder.embed_documents(
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
    )

    assert len(embedded) == 2
    assert embedded[0].to_dict()["text"] == "This is sentence 1"
    assert embedded[1].to_dict()["text"] == "This is sentence 2"

View File

@ -0,0 +1,41 @@
#!/usr/bin/env bash

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=embed-voyageai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# Fail fast with a clear message when the API key is absent, instead of
# letting the ingest run fail mid-way with an auth error.
# (The previous `VOYAGE_API_KEY=${VOYAGE_API_KEY:-$VOYAGE_API_KEY}` was a no-op.)
if [ -z "$VOYAGE_API_KEY" ]; then
    echo "Skipping $OUTPUT_FOLDER_NAME test because the VOYAGE_API_KEY env var is not set."
    exit 8
fi

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
    cleanup_dir "$OUTPUT_DIR"
    cleanup_dir "$WORK_DIR"
}
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
    local \
    --num-processes "$max_processes" \
    --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
    --output-dir "$OUTPUT_DIR" \
    --verbose \
    --reprocess \
    --input-path example-docs/book-war-and-peace-1p.txt \
    --work-dir "$WORK_DIR" \
    --embedding-provider "langchain-voyageai" \
    --embedding-api-key "$VOYAGE_API_KEY" \
    --embedding-model-name "voyage-large-2"

set +e

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -59,6 +59,7 @@ all_tests=(
'local-embed-bedrock.sh'
'local-embed-octoai.sh'
'local-embed-vertexai.sh'
'local-embed-voyageai.sh'
'sftp.sh'
'opensearch.sh'
# NOTE(robinson) - mongo conflicts with astra because it ships with its

View File

@ -1 +1 @@
__version__ = "0.14.3-dev4" # pragma: no cover
__version__ = "0.14.3-dev5" # pragma: no cover

View File

@ -3,11 +3,13 @@ from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
from unstructured.embed.openai import OpenAIEmbeddingEncoder
from unstructured.embed.vertexai import VertexAIEmbeddingEncoder
from unstructured.embed.voyageai import VoyageAIEmbeddingEncoder
# Maps an embedding-provider identifier (the string passed via the ingest
# CLI's --embedding-provider / EmbeddingConfig.provider) to the encoder
# class that implements it.
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
    "langchain-openai": OpenAIEmbeddingEncoder,
    "langchain-huggingface": HuggingFaceEmbeddingEncoder,
    "langchain-aws-bedrock": BedrockEmbeddingEncoder,
    "langchain-vertexai": VertexAIEmbeddingEncoder,
    "langchain-voyageai": VoyageAIEmbeddingEncoder,
    "octoai": OctoAIEmbeddingEncoder,
}

View File

@ -0,0 +1,82 @@
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional
import numpy as np
from unstructured.documents.elements import Element
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
from unstructured.ingest.error import EmbeddingEncoderConnectionError
from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
from langchain_voyageai import VoyageAIEmbeddings
@dataclass
class VoyageAIEmbeddingConfig(EmbeddingConfig):
    """Configuration for the Voyage AI embedding encoder."""

    # API key obtained from https://dash.voyageai.com/
    api_key: str
    # Voyage model to use, e.g. "voyage-law-2"; see
    # https://docs.voyageai.com/docs/embeddings for available models.
    model_name: str
    # Number of texts per embedding request; None defers to the
    # langchain-voyageai client default -- TODO confirm default value.
    batch_size: Optional[int] = None
    # Whether over-length inputs are truncated; None defers to the
    # client/service default -- TODO confirm behavior.
    truncation: Optional[bool] = None
@dataclass
class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embeds elements and query strings with Voyage AI via langchain-voyageai."""

    config: VoyageAIEmbeddingConfig
    # Lazily-created langchain client; built on first access of `client`.
    _client: Optional["VoyageAIEmbeddings"] = field(init=False, default=None)
    # Cached embedding of a fixed sample query, used to probe the model's
    # output dimensionality and norm without repeated API calls.
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "VoyageAIEmbeddings":
        """Return the Voyage AI client, creating it on first use."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        """Return (and cache) the embedding of a sample query."""
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.client.embed_query("A sample query.")
        return self._exemplary_embedding

    def initialize(self):
        """No-op: the client is created lazily by the `client` property."""
        pass

    @property
    def num_of_dimensions(self) -> tuple[int, ...]:
        """Shape of one embedding vector, e.g. ``(1024,)``."""
        return np.shape(self.exemplary_embedding)

    @property
    def is_unit_vector(self) -> bool:
        """Whether the model's embeddings are L2-normalized."""
        return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0)

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed each element's text and attach the vector to the element.

        Returns the same element objects with ``embeddings`` populated.
        """
        embeddings = self.client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> List[float]:
        """Embed a free-text query string."""
        return self.client.embed_query(query)

    @staticmethod
    def _add_embeddings_to_elements(elements, embeddings) -> List[Element]:
        """Attach embeddings to elements positionally, mutating in place.

        Returns the same ``elements`` list. (The original built a second
        list, ``elements_w_embedding``, that was never used.)
        """
        assert len(elements) == len(embeddings)
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["langchain", "langchain_voyageai"],
        extras="embed-voyageai",
    )
    def create_client(self) -> "VoyageAIEmbeddings":
        """Creates a Langchain VoyageAI python client to embed elements."""
        from langchain_voyageai import VoyageAIEmbeddings

        return VoyageAIEmbeddings(
            voyage_api_key=self.config.api_key,
            model=self.config.model_name,
            batch_size=self.config.batch_size,
            truncation=self.config.truncation,
        )

View File

@ -234,6 +234,13 @@ class EmbeddingConfig(BaseConfig):
)
return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
elif self.provider == "langchain-voyageai":
from unstructured.embed.voyageai import (
VoyageAIEmbeddingConfig,
VoyageAIEmbeddingEncoder,
)
return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs))
else:
raise ValueError(f"{self.provider} not a recognized encoder")