feat: add VoyageAI embeddings (#3069) (#3099)

Original PR was #3069. Merged into a feature branch to fix dependency
and linting issues. Application code changes from the original PR were
already reviewed and approved.

------------
Original PR description:
Adds VoyageAI embeddings. Voyage AI’s embedding models and rerankers are
state-of-the-art in retrieval accuracy.

---------

Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com>
Co-authored-by: Liuhong99 <39693953+Liuhong99@users.noreply.github.com>
Matt Robinson 2024-05-24 17:48:35 -04:00 committed by GitHub
parent 32df4ee1c6
commit 6b400b46fe
41 changed files with 20601 additions and 56 deletions

View File

@@ -1,9 +1,10 @@
-## 0.14.3-dev4
+## 0.14.3-dev5
 ### Enhancements
 * **Move `category` field from Text class to Element class.**
 * **`partition_docx()` now supports pluggable picture sub-partitioners.** A subpartitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization for the image.
+* **Add VoyageAI embedder** Adds VoyageAI embeddings to support embedding via Voyage AI.
 ### Features

View File

@@ -0,0 +1,25 @@
import os

from unstructured.documents.elements import Text
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder

# To use Voyage AI you will need to pass the
# Voyage AI API key (obtained from https://dash.voyageai.com/)
# as the ``api_key`` parameter.
#
# The ``model_name`` parameter is mandatory; please check the available models
# at https://docs.voyageai.com/docs/embeddings
embedding_encoder = VoyageAIEmbeddingEncoder(
    config=VoyageAIEmbeddingConfig(api_key=os.environ["VOYAGE_API_KEY"], model_name="voyage-law-2")
)
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

[print(e, e.embeddings) for e in elements]
print(query, query_embedding)
print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
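
A brief note on what the example shows: each element returned by embed_documents carries its vector on the .embeddings attribute, and embed_query returns a plain list of floats. The final print line reports is_unit_vector and num_of_dimensions, which the encoder derives lazily from an exemplary embedding of a sample query (see the encoder implementation later in this commit).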

View File

@@ -86,7 +86,7 @@ tabulate==0.9.0
 # via -r ./base.in
 tqdm==4.66.4
 # via nltk
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -r ./base.in
 # emoji

View File

@@ -61,3 +61,6 @@ fsspec==2024.5.0
 numpy>=1.26.0
 wrapt>=1.14.0
+# NOTE(robinson): for compatibility with voyage embeddings
+langsmith==0.1.62

View File

@@ -151,7 +151,7 @@ jsonschema-specifications==2023.12.1
 # jsonschema
 jupyter==1.0.0
 # via -r ./dev.in
-jupyter-client==8.6.1
+jupyter-client==8.6.2
 # via
 # ipykernel
 # jupyter-console
@@ -185,7 +185,7 @@ jupyter-server==2.14.0
 # notebook-shim
 jupyter-server-terminals==0.5.3
 # via jupyter-server
-jupyterlab==4.2.0
+jupyterlab==4.2.1
 # via notebook
 jupyterlab-pygments==0.3.0
 # via nbconvert
@@ -392,7 +392,7 @@ traitlets==5.14.3
 # qtconsole
 types-python-dateutil==2.9.0.20240316
 # via arrow
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./base.txt
 # -c ./test.txt

View File

@@ -12,7 +12,7 @@ python-docx==1.1.2
 # via
 # -c ././deps/constraints.txt
 # -r ./extra-docx.in
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./base.txt
 # python-docx

View File

@@ -14,7 +14,7 @@ python-docx==1.1.2
 # via
 # -c ././deps/constraints.txt
 # -r ./extra-odt.in
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./base.txt
 # python-docx

View File

@@ -8,7 +8,7 @@ attrdict==2.0.1
 # via unstructured-paddleocr
 babel==2.15.0
 # via flask-babel
-bce-python-sdk==0.9.10
+bce-python-sdk==0.9.11
 # via visualdl
 blinker==1.8.2
 # via flask
@@ -45,7 +45,7 @@ flask==3.0.3
 # visualdl
 flask-babel==4.0.0
 # via visualdl
-fonttools==4.51.0
+fonttools==4.52.1
 # via matplotlib
 future==1.0.0
 # via bce-python-sdk
@@ -200,7 +200,7 @@ six==1.16.0
 # imgaug
 # python-dateutil
 # visualdl
-tifffile==2024.5.10
+tifffile==2024.5.22
 # via scikit-image
 tqdm==4.66.4
 # via

View File

@@ -39,7 +39,7 @@ filelock==3.14.0
 # transformers
 flatbuffers==24.3.25
 # via onnxruntime
-fonttools==4.51.0
+fonttools==4.52.1
 # via matplotlib
 fsspec==2024.5.0
 # via
@@ -118,7 +118,7 @@ numpy==1.26.4
 # transformers
 omegaconf==2.3.0
 # via effdet
-onnx==1.16.0
+onnx==1.16.1
 # via
 # -r ./extra-pdf-image.in
 # unstructured-inference
@@ -278,7 +278,7 @@ tqdm==4.66.4
 # transformers
 transformers==4.41.1
 # via unstructured-inference
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./base.txt
 # huggingface-hub

View File

@@ -102,7 +102,7 @@ tqdm==4.66.4
 # transformers
 transformers==4.41.1
 # via -r ./huggingface.in
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./base.txt
 # huggingface-hub

View File

@@ -31,7 +31,7 @@ requests==2.32.2
 # via
 # -c ./ingest/../base.txt
 # pyairtable
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # pyairtable

View File

@@ -34,7 +34,7 @@ six==1.16.0
 # -c ./ingest/../base.txt
 # azure-core
 # isodate
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # azure-core

View File

@@ -93,7 +93,7 @@ six==1.16.0
 # -c ./ingest/../base.txt
 # azure-core
 # isodate
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # azure-core

View File

@@ -198,7 +198,7 @@ typer==0.9.0
 # via
 # -r ./ingest/chroma.in
 # chromadb
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # chromadb

View File

@@ -15,7 +15,7 @@ charset-normalizer==3.3.2
 # via
 # -c ./ingest/../base.txt
 # requests
-databricks-sdk==0.27.1
+databricks-sdk==0.28.0
 # via -r ./ingest/databricks-volumes.in
 google-auth==2.29.0
 # via databricks-sdk

View File

@@ -11,7 +11,7 @@ certifi==2024.2.2
 # elastic-transport
 elastic-transport==8.13.0
 # via elasticsearch
-elasticsearch==8.13.1
+elasticsearch==8.13.2
 # via -r ./ingest/elasticsearch.in
 urllib3==1.26.18
 # via

View File

@@ -37,7 +37,6 @@ charset-normalizer==3.3.2
 dataclasses-json==0.6.6
 # via
 # -c ./ingest/../base.txt
-# langchain
 # langchain-community
 frozenlist==1.4.1
 # via
@@ -56,9 +55,9 @@ jsonpatch==1.33
 # via langchain-core
 jsonpointer==2.4
 # via jsonpatch
-langchain==0.2.0
+langchain==0.2.1
 # via langchain-community
-langchain-community==0.2.0
+langchain-community==0.2.1
 # via -r ./ingest/embed-aws-bedrock.in
 langchain-core==0.2.1
 # via
@@ -67,8 +66,9 @@ langchain-core==0.2.1
 # langchain-text-splitters
 langchain-text-splitters==0.2.0
 # via langchain
-langsmith==0.1.61
+langsmith==0.1.62
 # via
+# -c ./ingest/../deps/constraints.txt
 # langchain
 # langchain-community
 # langchain-core
@@ -135,7 +135,7 @@ tenacity==8.3.0
 # langchain
 # langchain-community
 # langchain-core
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # pydantic

View File

@@ -30,7 +30,6 @@ charset-normalizer==3.3.2
 dataclasses-json==0.6.6
 # via
 # -c ./ingest/../base.txt
-# langchain
 # langchain-community
 filelock==3.14.0
 # via
@@ -68,9 +67,9 @@ jsonpatch==1.33
 # via langchain-core
 jsonpointer==2.4
 # via jsonpatch
-langchain==0.2.0
+langchain==0.2.1
 # via langchain-community
-langchain-community==0.2.0
+langchain-community==0.2.1
 # via -r ./ingest/embed-huggingface.in
 langchain-core==0.2.1
 # via
@@ -79,8 +78,9 @@ langchain-core==0.2.1
 # langchain-text-splitters
 langchain-text-splitters==0.2.0
 # via langchain
-langsmith==0.1.61
+langsmith==0.1.62
 # via
+# -c ./ingest/../deps/constraints.txt
 # langchain
 # langchain-community
 # langchain-core
@@ -188,7 +188,7 @@ tqdm==4.66.4
 # transformers
 transformers==4.41.1
 # via sentence-transformers
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # huggingface-hub

View File

@@ -38,7 +38,7 @@ idna==3.7
 # anyio
 # httpx
 # requests
-openai==1.30.1
+openai==1.30.3
 # via -r ./ingest/embed-octoai.in
 pydantic==2.7.1
 # via openai
@@ -63,7 +63,7 @@ tqdm==4.66.4
 # via
 # -c ./ingest/../base.txt
 # openai
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # openai

View File

@@ -37,7 +37,6 @@ charset-normalizer==3.3.2
 dataclasses-json==0.6.6
 # via
 # -c ./ingest/../base.txt
-# langchain
 # langchain-community
 distro==1.9.0
 # via openai
@@ -64,9 +63,9 @@ jsonpatch==1.33
 # via langchain-core
 jsonpointer==2.4
 # via jsonpatch
-langchain==0.2.0
+langchain==0.2.1
 # via langchain-community
-langchain-community==0.2.0
+langchain-community==0.2.1
 # via -r ./ingest/embed-openai.in
 langchain-core==0.2.1
 # via
@@ -75,8 +74,9 @@ langchain-core==0.2.1
 # langchain-text-splitters
 langchain-text-splitters==0.2.0
 # via langchain
-langsmith==0.1.61
+langsmith==0.1.62
 # via
+# -c ./ingest/../deps/constraints.txt
 # langchain
 # langchain-community
 # langchain-core
@@ -98,7 +98,7 @@ numpy==1.26.4
 # -c ./ingest/../deps/constraints.txt
 # langchain
 # langchain-community
-openai==1.30.1
+openai==1.30.3
 # via -r ./ingest/embed-openai.in
 orjson==3.10.3
 # via langsmith
@@ -152,7 +152,7 @@ tqdm==4.66.4
 # via
 # -c ./ingest/../base.txt
 # openai
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # openai

View File

@@ -32,7 +32,6 @@ charset-normalizer==3.3.2
 dataclasses-json==0.6.6
 # via
 # -c ./ingest/../base.txt
-# langchain
 # langchain-community
 docstring-parser==0.16
 # via google-cloud-aiplatform
@@ -101,11 +100,11 @@ jsonpatch==1.33
 # via langchain-core
 jsonpointer==2.4
 # via jsonpatch
-langchain==0.2.0
+langchain==0.2.1
 # via
 # -r ./ingest/embed-vertexai.in
 # langchain-community
-langchain-community==0.2.0
+langchain-community==0.2.1
 # via -r ./ingest/embed-vertexai.in
 langchain-core==0.2.1
 # via
@@ -117,8 +116,9 @@ langchain-google-vertexai==1.0.4
 # via -r ./ingest/embed-vertexai.in
 langchain-text-splitters==0.2.0
 # via langchain
-langsmith==0.1.61
+langsmith==0.1.62
 # via
+# -c ./ingest/../deps/constraints.txt
 # langchain
 # langchain-community
 # langchain-core
@@ -215,7 +215,7 @@ tenacity==8.3.0
 # langchain
 # langchain-community
 # langchain-core
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # pydantic

View File

@@ -0,0 +1,4 @@
-c ../deps/constraints.txt
-c ../base.txt
langchain
langchain-voyageai
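
This .in file declares only the direct dependencies (langchain and langchain-voyageai) against the shared constraints; the lockfile below is compiled from it with pip-compile, as its generated header notes.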

View File

@@ -0,0 +1,116 @@
#
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile ./ingest/embed-voyageai.in
#
aiohttp==3.9.5
# via
# langchain
# voyageai
aiolimiter==1.1.0
# via voyageai
aiosignal==1.3.1
# via aiohttp
annotated-types==0.7.0
# via pydantic
async-timeout==4.0.3
# via
# aiohttp
# langchain
attrs==23.2.0
# via aiohttp
certifi==2024.2.2
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
frozenlist==1.4.1
# via
# aiohttp
# aiosignal
idna==3.7
# via
# -c ./ingest/../base.txt
# requests
# yarl
jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.2.1
# via -r ./ingest/embed-voyageai.in
langchain-core==0.2.1
# via
# langchain
# langchain-text-splitters
# langchain-voyageai
langchain-text-splitters==0.2.0
# via langchain
langchain-voyageai==0.1.1
# via -r ./ingest/embed-voyageai.in
langsmith==0.1.62
# via
# -c ./ingest/../deps/constraints.txt
# langchain
# langchain-core
multidict==6.0.5
# via
# aiohttp
# yarl
numpy==1.26.4
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# langchain
# voyageai
orjson==3.10.3
# via langsmith
packaging==23.2
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# langchain-core
pydantic==2.7.1
# via
# langchain
# langchain-core
# langsmith
pydantic-core==2.18.2
# via pydantic
pyyaml==6.0.1
# via
# langchain
# langchain-core
requests==2.32.2
# via
# -c ./ingest/../base.txt
# langchain
# langsmith
# voyageai
sqlalchemy==2.0.30
# via langchain
tenacity==8.3.0
# via
# langchain
# langchain-core
# voyageai
typing-extensions==4.12.0
# via
# -c ./ingest/../base.txt
# pydantic
# pydantic-core
# sqlalchemy
urllib3==1.26.18
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# requests
voyageai==0.2.2
# via langchain-voyageai
yarl==1.9.4
# via aiohttp

View File

@@ -37,7 +37,7 @@ requests==2.32.2
 # via
 # -c ./ingest/../base.txt
 # pygithub
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # pygithub

View File

@@ -17,7 +17,7 @@ charset-normalizer==3.3.2
 # requests
 google-api-core==2.19.0
 # via google-api-python-client
-google-api-python-client==2.129.0
+google-api-python-client==2.130.0
 # via -r ./ingest/google-drive.in
 google-auth==2.29.0
 # via

View File

@@ -15,7 +15,7 @@ tqdm==4.66.4
 # via
 # -c ./ingest/../base.txt
 # pinecone-client
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # pinecone-client

View File

@@ -62,7 +62,7 @@ sniffio==1.3.1
 # via
 # anyio
 # httpx
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # pydantic

View File

@@ -51,7 +51,7 @@ six==1.16.0
 # via
 # -c ./ingest/../base.txt
 # python-dateutil
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # aioitertools

View File

@@ -58,7 +58,7 @@ six==1.16.0
 # via
 # -c ./ingest/../base.txt
 # isodate
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # simple-salesforce

View File

@@ -69,7 +69,7 @@ sniffio==1.3.1
 # via
 # anyio
 # httpx
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./ingest/../base.txt
 # pydantic

View File

@@ -130,7 +130,7 @@ rpds-py==0.18.1
 # via
 # jsonschema
 # referencing
-ruff==0.4.4
+ruff==0.4.5
 # via -r ./test.in
 six==1.16.0
 # via
@@ -153,7 +153,7 @@ types-tabulate==0.9.0.20240106
 # via -r ./test.in
 types-urllib3==1.26.25.14
 # via types-requests
-typing-extensions==4.11.0
+typing-extensions==4.12.0
 # via
 # -c ./base.txt
 # black

View File

@@ -12,5 +12,3 @@ pushd ./requirements || exit
 make clean
 make all
 popd || exit
-cp requirements/build.txt docs/requirements.txt

View File

@@ -171,6 +171,7 @@ setup(
         "embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
         "embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
         "embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
+        "embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),
         "openai": load_requirements("requirements/ingest/embed-openai.in"),
         "bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
         "databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),

View File

@@ -0,0 +1,21 @@
from unstructured.documents.elements import Text
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder


def test_embed_documents_does_not_break_element_to_dict(mocker):
    # Mocked client with the desired behavior for embed_documents
    mock_client = mocker.MagicMock()
    mock_client.embed_documents.return_value = [1, 2]
    # Mock create_client to return our mock_client
    mocker.patch.object(VoyageAIEmbeddingEncoder, "create_client", return_value=mock_client)

    encoder = VoyageAIEmbeddingEncoder(
        config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
    )
    elements = encoder.embed_documents(
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
    )
    assert len(elements) == 2
    assert elements[0].to_dict()["text"] == "This is sentence 1"
    assert elements[1].to_dict()["text"] == "This is sentence 2"

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env bash

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1

OUTPUT_FOLDER_NAME=embed-voyageai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
VOYAGE_API_KEY=${VOYAGE_API_KEY:-$VOYAGE_API_KEY}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1p.txt \
  --work-dir "$WORK_DIR" \
  --embedding-provider "langchain-voyageai" \
  --embedding-api-key "$VOYAGE_API_KEY" \
  --embedding-model-name "voyage-large-2"

set +e

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@@ -59,6 +59,7 @@ all_tests=(
 'local-embed-bedrock.sh'
 'local-embed-octoai.sh'
 'local-embed-vertexai.sh'
+'local-embed-voyageai.sh'
 'sftp.sh'
 'opensearch.sh'
 # NOTE(robinson) - mongo conflicts with astra because it ships with its

View File

@@ -1 +1 @@
-__version__ = "0.14.3-dev4"  # pragma: no cover
+__version__ = "0.14.3-dev5"  # pragma: no cover

View File

@@ -3,11 +3,13 @@ from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
 from unstructured.embed.octoai import OctoAIEmbeddingEncoder
 from unstructured.embed.openai import OpenAIEmbeddingEncoder
 from unstructured.embed.vertexai import VertexAIEmbeddingEncoder
+from unstructured.embed.voyageai import VoyageAIEmbeddingEncoder

 EMBEDDING_PROVIDER_TO_CLASS_MAP = {
     "langchain-openai": OpenAIEmbeddingEncoder,
     "langchain-huggingface": HuggingFaceEmbeddingEncoder,
     "langchain-aws-bedrock": BedrockEmbeddingEncoder,
     "langchain-vertexai": VertexAIEmbeddingEncoder,
+    "langchain-voyageai": VoyageAIEmbeddingEncoder,
     "octoai": OctoAIEmbeddingEncoder,
 }
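
As a rough sketch of how this registry can be used to resolve a provider string to an encoder (the module that defines the map is not named in this diff, so the import path below is an assumption):

import os

# Assumed import path for illustration only; the diff does not show which
# module exposes EMBEDDING_PROVIDER_TO_CLASS_MAP.
from unstructured.embed import EMBEDDING_PROVIDER_TO_CLASS_MAP
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig

# Look up the encoder class registered for the provider string used by the ingest CLI.
encoder_cls = EMBEDDING_PROVIDER_TO_CLASS_MAP["langchain-voyageai"]  # VoyageAIEmbeddingEncoder
encoder = encoder_cls(
    config=VoyageAIEmbeddingConfig(api_key=os.environ["VOYAGE_API_KEY"], model_name="voyage-law-2")
)
query_embedding = encoder.embed_query("This is the query")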

View File

@@ -0,0 +1,82 @@
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional

import numpy as np

from unstructured.documents.elements import Element
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
from unstructured.ingest.error import EmbeddingEncoderConnectionError
from unstructured.utils import requires_dependencies

if TYPE_CHECKING:
    from langchain_voyageai import VoyageAIEmbeddings


@dataclass
class VoyageAIEmbeddingConfig(EmbeddingConfig):
    api_key: str
    model_name: str
    batch_size: Optional[int] = None
    truncation: Optional[bool] = None


@dataclass
class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
    config: VoyageAIEmbeddingConfig

    _client: Optional["VoyageAIEmbeddings"] = field(init=False, default=None)
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "VoyageAIEmbeddings":
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.client.embed_query("A sample query.")
        return self._exemplary_embedding

    def initialize(self):
        pass

    @property
    def num_of_dimensions(self) -> tuple[int, ...]:
        return np.shape(self.exemplary_embedding)

    @property
    def is_unit_vector(self) -> bool:
        return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0)

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        embeddings = self.client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> List[float]:
        return self.client.embed_query(query)

    @staticmethod
    def _add_embeddings_to_elements(elements, embeddings) -> List[Element]:
        assert len(elements) == len(embeddings)
        elements_w_embedding = []
        for i, element in enumerate(elements):
            element.embeddings = embeddings[i]
            elements_w_embedding.append(element)
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["langchain", "langchain_voyageai"],
        extras="embed-voyageai",
    )
    def create_client(self) -> "VoyageAIEmbeddings":
        """Creates a Langchain VoyageAI python client to embed elements."""
        from langchain_voyageai import VoyageAIEmbeddings

        return VoyageAIEmbeddings(
            voyage_api_key=self.config.api_key,
            model=self.config.model_name,
            batch_size=self.config.batch_size,
            truncation=self.config.truncation,
        )
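
A note on the design: the langchain_voyageai client is created lazily through create_client, which is wrapped in requires_dependencies so that a missing embed-voyageai extra surfaces as a clear error, and is_unit_vector / num_of_dimensions are computed from a cached embedding of "A sample query." rather than from hard-coded per-model dimensions.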

View File

@@ -234,6 +234,13 @@ class EmbeddingConfig(BaseConfig):
             )
             return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
+        elif self.provider == "langchain-voyageai":
+            from unstructured.embed.voyageai import (
+                VoyageAIEmbeddingConfig,
+                VoyageAIEmbeddingEncoder,
+            )
+
+            return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs))
         else:
             raise ValueError(f"{self.provider} not a recognized encoder")