mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-14 19:57:26 +00:00
Original PR was #3069. Merged into a feature branch to fix dependency and linting issues. Application code changes from the original PR were already reviewed and approved. ------------ Original PR description: Adding VoyageAI embeddings Voyage AI’s embedding models and rerankers are state-of-the-art in retrieval accuracy. --------- Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com> Co-authored-by: Liuhong99 <39693953+Liuhong99@users.noreply.github.com>
This commit is contained in:
parent
32df4ee1c6
commit
6b400b46fe
@ -1,9 +1,10 @@
|
||||
## 0.14.3-dev4
|
||||
## 0.14.3-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Move `category` field from Text class to Element class.**
|
||||
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A subpartitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization for the image.
|
||||
* **Add VoyageAI embedder** Adds VoyageAI embeddings to support embedding via Voyage AI.
|
||||
|
||||
### Features
|
||||
|
||||
|
25
examples/embed/example_voyageai.py
Normal file
25
examples/embed/example_voyageai.py
Normal file
@ -0,0 +1,25 @@
|
||||
import os

from unstructured.documents.elements import Text
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder

# To use Voyage AI you will need to pass
# Voyage AI API Key (obtained from https://dash.voyageai.com/)
# as the ``api_key`` parameter.
#
# The ``model_name`` parameter is mandatory, please check the available models
# at https://docs.voyageai.com/docs/embeddings

embedding_encoder = VoyageAIEmbeddingEncoder(
    config=VoyageAIEmbeddingConfig(api_key=os.environ["VOYAGE_API_KEY"], model_name="voyage-law-2")
)

# Attach an embedding vector to each element (stored on ``element.embeddings``).
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a standalone query string.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Use a plain loop for side effects rather than a throwaway list comprehension.
for element in elements:
    print(element, element.embeddings)
print(query, query_embedding)
print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
|
@ -86,7 +86,7 @@ tabulate==0.9.0
|
||||
# via -r ./base.in
|
||||
tqdm==4.66.4
|
||||
# via nltk
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -r ./base.in
|
||||
# emoji
|
||||
|
@ -57,7 +57,10 @@ unstructured-client<=0.18.0
|
||||
|
||||
fsspec==2024.5.0
|
||||
|
||||
# python 3.12 support
|
||||
# python 3.12 support
|
||||
numpy>=1.26.0
|
||||
wrapt>=1.14.0
|
||||
|
||||
|
||||
# NOTE(robinson): for compatibility with voyage embeddings
|
||||
langsmith==0.1.62
|
||||
|
@ -151,7 +151,7 @@ jsonschema-specifications==2023.12.1
|
||||
# jsonschema
|
||||
jupyter==1.0.0
|
||||
# via -r ./dev.in
|
||||
jupyter-client==8.6.1
|
||||
jupyter-client==8.6.2
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-console
|
||||
@ -185,7 +185,7 @@ jupyter-server==2.14.0
|
||||
# notebook-shim
|
||||
jupyter-server-terminals==0.5.3
|
||||
# via jupyter-server
|
||||
jupyterlab==4.2.0
|
||||
jupyterlab==4.2.1
|
||||
# via notebook
|
||||
jupyterlab-pygments==0.3.0
|
||||
# via nbconvert
|
||||
@ -392,7 +392,7 @@ traitlets==5.14.3
|
||||
# qtconsole
|
||||
types-python-dateutil==2.9.0.20240316
|
||||
# via arrow
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -c ./test.txt
|
||||
|
@ -12,7 +12,7 @@ python-docx==1.1.2
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./extra-docx.in
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
|
@ -14,7 +14,7 @@ python-docx==1.1.2
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./extra-odt.in
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
|
@ -8,7 +8,7 @@ attrdict==2.0.1
|
||||
# via unstructured-paddleocr
|
||||
babel==2.15.0
|
||||
# via flask-babel
|
||||
bce-python-sdk==0.9.10
|
||||
bce-python-sdk==0.9.11
|
||||
# via visualdl
|
||||
blinker==1.8.2
|
||||
# via flask
|
||||
@ -45,7 +45,7 @@ flask==3.0.3
|
||||
# visualdl
|
||||
flask-babel==4.0.0
|
||||
# via visualdl
|
||||
fonttools==4.51.0
|
||||
fonttools==4.52.1
|
||||
# via matplotlib
|
||||
future==1.0.0
|
||||
# via bce-python-sdk
|
||||
@ -200,7 +200,7 @@ six==1.16.0
|
||||
# imgaug
|
||||
# python-dateutil
|
||||
# visualdl
|
||||
tifffile==2024.5.10
|
||||
tifffile==2024.5.22
|
||||
# via scikit-image
|
||||
tqdm==4.66.4
|
||||
# via
|
||||
|
@ -39,7 +39,7 @@ filelock==3.14.0
|
||||
# transformers
|
||||
flatbuffers==24.3.25
|
||||
# via onnxruntime
|
||||
fonttools==4.51.0
|
||||
fonttools==4.52.1
|
||||
# via matplotlib
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
@ -118,7 +118,7 @@ numpy==1.26.4
|
||||
# transformers
|
||||
omegaconf==2.3.0
|
||||
# via effdet
|
||||
onnx==1.16.0
|
||||
onnx==1.16.1
|
||||
# via
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
@ -278,7 +278,7 @@ tqdm==4.66.4
|
||||
# transformers
|
||||
transformers==4.41.1
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
|
@ -102,7 +102,7 @@ tqdm==4.66.4
|
||||
# transformers
|
||||
transformers==4.41.1
|
||||
# via -r ./huggingface.in
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
|
@ -31,7 +31,7 @@ requests==2.32.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pyairtable
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pyairtable
|
||||
|
@ -34,7 +34,7 @@ six==1.16.0
|
||||
# -c ./ingest/../base.txt
|
||||
# azure-core
|
||||
# isodate
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# azure-core
|
||||
|
@ -93,7 +93,7 @@ six==1.16.0
|
||||
# -c ./ingest/../base.txt
|
||||
# azure-core
|
||||
# isodate
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# azure-core
|
||||
|
@ -198,7 +198,7 @@ typer==0.9.0
|
||||
# via
|
||||
# -r ./ingest/chroma.in
|
||||
# chromadb
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# chromadb
|
||||
|
@ -15,7 +15,7 @@ charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
databricks-sdk==0.27.1
|
||||
databricks-sdk==0.28.0
|
||||
# via -r ./ingest/databricks-volumes.in
|
||||
google-auth==2.29.0
|
||||
# via databricks-sdk
|
||||
|
@ -11,7 +11,7 @@ certifi==2024.2.2
|
||||
# elastic-transport
|
||||
elastic-transport==8.13.0
|
||||
# via elasticsearch
|
||||
elasticsearch==8.13.1
|
||||
elasticsearch==8.13.2
|
||||
# via -r ./ingest/elasticsearch.in
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
|
@ -37,7 +37,6 @@ charset-normalizer==3.3.2
|
||||
dataclasses-json==0.6.6
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
@ -56,9 +55,9 @@ jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.2.0
|
||||
langchain==0.2.1
|
||||
# via langchain-community
|
||||
langchain-community==0.2.0
|
||||
langchain-community==0.2.1
|
||||
# via -r ./ingest/embed-aws-bedrock.in
|
||||
langchain-core==0.2.1
|
||||
# via
|
||||
@ -67,8 +66,9 @@ langchain-core==0.2.1
|
||||
# langchain-text-splitters
|
||||
langchain-text-splitters==0.2.0
|
||||
# via langchain
|
||||
langsmith==0.1.61
|
||||
langsmith==0.1.62
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
@ -135,7 +135,7 @@ tenacity==8.3.0
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pydantic
|
||||
|
@ -30,7 +30,6 @@ charset-normalizer==3.3.2
|
||||
dataclasses-json==0.6.6
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
filelock==3.14.0
|
||||
# via
|
||||
@ -68,9 +67,9 @@ jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.2.0
|
||||
langchain==0.2.1
|
||||
# via langchain-community
|
||||
langchain-community==0.2.0
|
||||
langchain-community==0.2.1
|
||||
# via -r ./ingest/embed-huggingface.in
|
||||
langchain-core==0.2.1
|
||||
# via
|
||||
@ -79,8 +78,9 @@ langchain-core==0.2.1
|
||||
# langchain-text-splitters
|
||||
langchain-text-splitters==0.2.0
|
||||
# via langchain
|
||||
langsmith==0.1.61
|
||||
langsmith==0.1.62
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
@ -188,7 +188,7 @@ tqdm==4.66.4
|
||||
# transformers
|
||||
transformers==4.41.1
|
||||
# via sentence-transformers
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# huggingface-hub
|
||||
|
@ -38,7 +38,7 @@ idna==3.7
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
openai==1.30.1
|
||||
openai==1.30.3
|
||||
# via -r ./ingest/embed-octoai.in
|
||||
pydantic==2.7.1
|
||||
# via openai
|
||||
@ -63,7 +63,7 @@ tqdm==4.66.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# openai
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# openai
|
||||
|
@ -37,7 +37,6 @@ charset-normalizer==3.3.2
|
||||
dataclasses-json==0.6.6
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
distro==1.9.0
|
||||
# via openai
|
||||
@ -64,9 +63,9 @@ jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.2.0
|
||||
langchain==0.2.1
|
||||
# via langchain-community
|
||||
langchain-community==0.2.0
|
||||
langchain-community==0.2.1
|
||||
# via -r ./ingest/embed-openai.in
|
||||
langchain-core==0.2.1
|
||||
# via
|
||||
@ -75,8 +74,9 @@ langchain-core==0.2.1
|
||||
# langchain-text-splitters
|
||||
langchain-text-splitters==0.2.0
|
||||
# via langchain
|
||||
langsmith==0.1.61
|
||||
langsmith==0.1.62
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
@ -98,7 +98,7 @@ numpy==1.26.4
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
openai==1.30.1
|
||||
openai==1.30.3
|
||||
# via -r ./ingest/embed-openai.in
|
||||
orjson==3.10.3
|
||||
# via langsmith
|
||||
@ -152,7 +152,7 @@ tqdm==4.66.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# openai
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# openai
|
||||
|
@ -32,7 +32,6 @@ charset-normalizer==3.3.2
|
||||
dataclasses-json==0.6.6
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
docstring-parser==0.16
|
||||
# via google-cloud-aiplatform
|
||||
@ -101,11 +100,11 @@ jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.2.0
|
||||
langchain==0.2.1
|
||||
# via
|
||||
# -r ./ingest/embed-vertexai.in
|
||||
# langchain-community
|
||||
langchain-community==0.2.0
|
||||
langchain-community==0.2.1
|
||||
# via -r ./ingest/embed-vertexai.in
|
||||
langchain-core==0.2.1
|
||||
# via
|
||||
@ -117,8 +116,9 @@ langchain-google-vertexai==1.0.4
|
||||
# via -r ./ingest/embed-vertexai.in
|
||||
langchain-text-splitters==0.2.0
|
||||
# via langchain
|
||||
langsmith==0.1.61
|
||||
langsmith==0.1.62
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
@ -215,7 +215,7 @@ tenacity==8.3.0
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pydantic
|
||||
|
4
requirements/ingest/embed-voyageai.in
Normal file
4
requirements/ingest/embed-voyageai.in
Normal file
@ -0,0 +1,4 @@
|
||||
-c ../deps/constraints.txt
|
||||
-c ../base.txt
|
||||
langchain
|
||||
langchain-voyageai
|
116
requirements/ingest/embed-voyageai.txt
Normal file
116
requirements/ingest/embed-voyageai.txt
Normal file
@ -0,0 +1,116 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile ./ingest/embed-voyageai.in
|
||||
#
|
||||
aiohttp==3.9.5
|
||||
# via
|
||||
# langchain
|
||||
# voyageai
|
||||
aiolimiter==1.1.0
|
||||
# via voyageai
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
async-timeout==4.0.3
|
||||
# via
|
||||
# aiohttp
|
||||
# langchain
|
||||
attrs==23.2.0
|
||||
# via aiohttp
|
||||
certifi==2024.2.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
idna==3.7
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
# yarl
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.2.1
|
||||
# via -r ./ingest/embed-voyageai.in
|
||||
langchain-core==0.2.1
|
||||
# via
|
||||
# langchain
|
||||
# langchain-text-splitters
|
||||
# langchain-voyageai
|
||||
langchain-text-splitters==0.2.0
|
||||
# via langchain
|
||||
langchain-voyageai==0.1.1
|
||||
# via -r ./ingest/embed-voyageai.in
|
||||
langsmith==0.1.62
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-core
|
||||
multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# voyageai
|
||||
orjson==3.10.3
|
||||
# via langsmith
|
||||
packaging==23.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain-core
|
||||
pydantic==2.7.1
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
# langsmith
|
||||
pydantic-core==2.18.2
|
||||
# via pydantic
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
requests==2.32.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# langchain
|
||||
# langsmith
|
||||
# voyageai
|
||||
sqlalchemy==2.0.30
|
||||
# via langchain
|
||||
tenacity==8.3.0
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
# voyageai
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# sqlalchemy
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# requests
|
||||
voyageai==0.2.2
|
||||
# via langchain-voyageai
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
@ -37,7 +37,7 @@ requests==2.32.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pygithub
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pygithub
|
||||
|
@ -17,7 +17,7 @@ charset-normalizer==3.3.2
|
||||
# requests
|
||||
google-api-core==2.19.0
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.129.0
|
||||
google-api-python-client==2.130.0
|
||||
# via -r ./ingest/google-drive.in
|
||||
google-auth==2.29.0
|
||||
# via
|
||||
|
@ -15,7 +15,7 @@ tqdm==4.66.4
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pinecone-client
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pinecone-client
|
||||
|
@ -62,7 +62,7 @@ sniffio==1.3.1
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pydantic
|
||||
|
@ -51,7 +51,7 @@ six==1.16.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# python-dateutil
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# aioitertools
|
||||
|
@ -58,7 +58,7 @@ six==1.16.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# isodate
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# simple-salesforce
|
||||
|
@ -69,7 +69,7 @@ sniffio==1.3.1
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# pydantic
|
||||
|
@ -130,7 +130,7 @@ rpds-py==0.18.1
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
ruff==0.4.4
|
||||
ruff==0.4.5
|
||||
# via -r ./test.in
|
||||
six==1.16.0
|
||||
# via
|
||||
@ -153,7 +153,7 @@ types-tabulate==0.9.0.20240106
|
||||
# via -r ./test.in
|
||||
types-urllib3==1.26.25.14
|
||||
# via types-requests
|
||||
typing-extensions==4.11.0
|
||||
typing-extensions==4.12.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# black
|
||||
|
@ -12,5 +12,3 @@ pushd ./requirements || exit
|
||||
make clean
|
||||
make all
|
||||
popd || exit
|
||||
|
||||
cp requirements/build.txt docs/requirements.txt
|
||||
|
1
setup.py
1
setup.py
@ -171,6 +171,7 @@ setup(
|
||||
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
|
||||
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
|
||||
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
|
||||
"embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),
|
||||
"openai": load_requirements("requirements/ingest/embed-openai.in"),
|
||||
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
|
||||
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),
|
||||
|
21
test_unstructured/embed/test_voyageai.py
Normal file
21
test_unstructured/embed/test_voyageai.py
Normal file
@ -0,0 +1,21 @@
|
||||
from unstructured.documents.elements import Text
|
||||
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
|
||||
|
||||
|
||||
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Elements returned by embed_documents() must still serialize via to_dict()."""
    # Stub client that yields one fake embedding per input element.
    fake_client = mocker.MagicMock()
    fake_client.embed_documents.return_value = [1, 2]

    # Have the encoder hand back our stub instead of building a real client.
    mocker.patch.object(VoyageAIEmbeddingEncoder, "create_client", return_value=fake_client)

    encoder = VoyageAIEmbeddingEncoder(
        config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
    )

    embedded = encoder.embed_documents(
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
    )

    assert len(embedded) == 2
    texts = [element.to_dict()["text"] for element in embedded]
    assert texts == ["This is sentence 1", "This is sentence 2"]
|
File diff suppressed because it is too large
Load Diff
41
test_unstructured_ingest/src/local-embed-voyageai.sh
Executable file
41
test_unstructured_ingest/src/local-embed-voyageai.sh
Executable file
@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env bash

# Ingest smoke test: embed a local document with the VoyageAI provider and
# compare the structured output against the expected fixtures.

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=embed-voyageai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
# NOTE(review): removed the self-referential no-op
# `VOYAGE_API_KEY=${VOYAGE_API_KEY:-$VOYAGE_API_KEY}`; the variable is taken
# straight from the environment below.

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1p.txt \
  --work-dir "$WORK_DIR" \
  --embedding-provider "langchain-voyageai" \
  --embedding-api-key "$VOYAGE_API_KEY" \
  --embedding-model-name "voyage-large-2"

set +e

"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
@ -59,6 +59,7 @@ all_tests=(
|
||||
'local-embed-bedrock.sh'
|
||||
'local-embed-octoai.sh'
|
||||
'local-embed-vertexai.sh'
|
||||
'local-embed-voyageai.sh'
|
||||
'sftp.sh'
|
||||
'opensearch.sh'
|
||||
# NOTE(robinson) - mongo conflicts with astra because it ships with its
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.14.3-dev4" # pragma: no cover
|
||||
__version__ = "0.14.3-dev5" # pragma: no cover
|
||||
|
@ -3,11 +3,13 @@ from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
|
||||
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
|
||||
from unstructured.embed.openai import OpenAIEmbeddingEncoder
|
||||
from unstructured.embed.vertexai import VertexAIEmbeddingEncoder
|
||||
from unstructured.embed.voyageai import VoyageAIEmbeddingEncoder
|
||||
|
||||
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
|
||||
"langchain-openai": OpenAIEmbeddingEncoder,
|
||||
"langchain-huggingface": HuggingFaceEmbeddingEncoder,
|
||||
"langchain-aws-bedrock": BedrockEmbeddingEncoder,
|
||||
"langchain-vertexai": VertexAIEmbeddingEncoder,
|
||||
"langchain-voyageai": VoyageAIEmbeddingEncoder,
|
||||
"octoai": OctoAIEmbeddingEncoder,
|
||||
}
|
||||
|
82
unstructured/embed/voyageai.py
Normal file
82
unstructured/embed/voyageai.py
Normal file
@ -0,0 +1,82 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
||||
from unstructured.ingest.error import EmbeddingEncoderConnectionError
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_voyageai import VoyageAIEmbeddings
|
||||
|
||||
|
||||
@dataclass
class VoyageAIEmbeddingConfig(EmbeddingConfig):
    """Configuration for the Voyage AI embedding encoder.

    All values are forwarded verbatim to ``langchain_voyageai.VoyageAIEmbeddings``
    when the client is created.
    """

    # Voyage AI API key (obtained from https://dash.voyageai.com/).
    api_key: str
    # Voyage AI model identifier, e.g. "voyage-law-2".
    model_name: str
    # Optional request batch size; forwarded as-is (client default when None).
    batch_size: Optional[int] = None
    # Optional truncation flag; forwarded as-is (client default when None).
    truncation: Optional[bool] = None
|
||||
|
||||
|
||||
@dataclass
class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embeds elements and queries with Voyage AI via ``langchain_voyageai``."""

    config: VoyageAIEmbeddingConfig
    # Lazily-created langchain client; built on first access of ``client``.
    _client: Optional["VoyageAIEmbeddings"] = field(init=False, default=None)
    # Cached embedding of a fixed sample query, used to probe dimensionality/norm.
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "VoyageAIEmbeddings":
        """Return the langchain VoyageAI client, creating it on first use."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        """Return (and cache) the embedding of a fixed sample query."""
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.client.embed_query("A sample query.")
        return self._exemplary_embedding

    def initialize(self):
        """No eager setup required; the client is created lazily."""
        pass

    @property
    def num_of_dimensions(self) -> tuple[int, ...]:
        """Shape of an embedding vector produced by the configured model."""
        return np.shape(self.exemplary_embedding)

    @property
    def is_unit_vector(self) -> bool:
        """Whether embeddings come back L2-normalized (norm close to 1.0)."""
        return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0)

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed each element's text and attach the vector to the element."""
        embeddings = self.client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> List[float]:
        """Embed a free-text query string."""
        return self.client.embed_query(query)

    @staticmethod
    def _add_embeddings_to_elements(elements, embeddings) -> List[Element]:
        """Attach each embedding to its element in place and return the list.

        Fix: the original also accumulated an ``elements_w_embedding`` list that
        was never used; the mutate-and-return contract is preserved without the
        dead code.
        """
        assert len(elements) == len(embeddings)
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["langchain", "langchain_voyageai"],
        extras="embed-voyageai",
    )
    def create_client(self) -> "VoyageAIEmbeddings":
        """Creates a Langchain VoyageAI python client to embed elements."""
        from langchain_voyageai import VoyageAIEmbeddings

        return VoyageAIEmbeddings(
            voyage_api_key=self.config.api_key,
            model=self.config.model_name,
            batch_size=self.config.batch_size,
            truncation=self.config.truncation,
        )
|
@ -234,6 +234,13 @@ class EmbeddingConfig(BaseConfig):
|
||||
)
|
||||
|
||||
return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
|
||||
elif self.provider == "langchain-voyageai":
|
||||
from unstructured.embed.voyageai import (
|
||||
VoyageAIEmbeddingConfig,
|
||||
VoyageAIEmbeddingEncoder,
|
||||
)
|
||||
|
||||
return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs))
|
||||
else:
|
||||
raise ValueError(f"{self.provider} not a recognized encoder")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user