mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 04:08:49 +00:00
Original PR was #3069. Merged into a feature branch to fix dependency and linting issues. Application code changes from the original PR were already reviewed and approved. ------------ Original PR description: Adding VoyageAI embeddings Voyage AI’s embedding models and rerankers are state-of-the-art in retrieval accuracy. --------- Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com> Co-authored-by: Liuhong99 <39693953+Liuhong99@users.noreply.github.com>
This commit is contained in:
parent
32df4ee1c6
commit
6b400b46fe
@ -1,9 +1,10 @@
|
|||||||
## 0.14.3-dev4
|
## 0.14.3-dev5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Move `category` field from Text class to Element class.**
|
* **Move `category` field from Text class to Element class.**
|
||||||
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A subpartitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization for the image.
|
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A subpartitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization for the image.
|
||||||
|
* **Add VoyageAI embedder** Adds VoyageAI embeddings to support embedding via Voyage AI.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
25
examples/embed/example_voyageai.py
Normal file
25
examples/embed/example_voyageai.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Text
|
||||||
|
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
|
||||||
|
|
||||||
|
# To use Voyage AI you will need to pass
|
||||||
|
# Voyage AI API Key (obtained from https://dash.voyageai.com/)
|
||||||
|
# as the ``api_key`` parameter.
|
||||||
|
#
|
||||||
|
# The ``model_name`` parameter is mandatory, please check the available models
|
||||||
|
# at https://docs.voyageai.com/docs/embeddings
|
||||||
|
|
||||||
|
# Build the encoder. The API key is read from the VOYAGE_API_KEY environment
# variable; a KeyError is raised here if the variable is not set.
embedding_encoder = VoyageAIEmbeddingEncoder(
    config=VoyageAIEmbeddingConfig(api_key=os.environ["VOYAGE_API_KEY"], model_name="voyage-law-2")
)

# Embed a small batch of elements; the encoder returns the elements with their
# ``embeddings`` attribute populated.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a single free-text query.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Printing is a side effect, so use a plain loop instead of a list
# comprehension that would build and discard a list of ``None`` values.
for element in elements:
    print(element, element.embeddings)
print(query, query_embedding)
print(embedding_encoder.is_unit_vector, embedding_encoder.num_of_dimensions)
|
@ -86,7 +86,7 @@ tabulate==0.9.0
|
|||||||
# via -r ./base.in
|
# via -r ./base.in
|
||||||
tqdm==4.66.4
|
tqdm==4.66.4
|
||||||
# via nltk
|
# via nltk
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -r ./base.in
|
# -r ./base.in
|
||||||
# emoji
|
# emoji
|
||||||
|
@ -61,3 +61,6 @@ fsspec==2024.5.0
|
|||||||
numpy>=1.26.0
|
numpy>=1.26.0
|
||||||
wrapt>=1.14.0
|
wrapt>=1.14.0
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(robinson): for compatibility with voyage embeddings
|
||||||
|
langsmith==0.1.62
|
||||||
|
@ -151,7 +151,7 @@ jsonschema-specifications==2023.12.1
|
|||||||
# jsonschema
|
# jsonschema
|
||||||
jupyter==1.0.0
|
jupyter==1.0.0
|
||||||
# via -r ./dev.in
|
# via -r ./dev.in
|
||||||
jupyter-client==8.6.1
|
jupyter-client==8.6.2
|
||||||
# via
|
# via
|
||||||
# ipykernel
|
# ipykernel
|
||||||
# jupyter-console
|
# jupyter-console
|
||||||
@ -185,7 +185,7 @@ jupyter-server==2.14.0
|
|||||||
# notebook-shim
|
# notebook-shim
|
||||||
jupyter-server-terminals==0.5.3
|
jupyter-server-terminals==0.5.3
|
||||||
# via jupyter-server
|
# via jupyter-server
|
||||||
jupyterlab==4.2.0
|
jupyterlab==4.2.1
|
||||||
# via notebook
|
# via notebook
|
||||||
jupyterlab-pygments==0.3.0
|
jupyterlab-pygments==0.3.0
|
||||||
# via nbconvert
|
# via nbconvert
|
||||||
@ -392,7 +392,7 @@ traitlets==5.14.3
|
|||||||
# qtconsole
|
# qtconsole
|
||||||
types-python-dateutil==2.9.0.20240316
|
types-python-dateutil==2.9.0.20240316
|
||||||
# via arrow
|
# via arrow
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
# -c ./test.txt
|
# -c ./test.txt
|
||||||
|
@ -12,7 +12,7 @@ python-docx==1.1.2
|
|||||||
# via
|
# via
|
||||||
# -c ././deps/constraints.txt
|
# -c ././deps/constraints.txt
|
||||||
# -r ./extra-docx.in
|
# -r ./extra-docx.in
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
# python-docx
|
# python-docx
|
||||||
|
@ -14,7 +14,7 @@ python-docx==1.1.2
|
|||||||
# via
|
# via
|
||||||
# -c ././deps/constraints.txt
|
# -c ././deps/constraints.txt
|
||||||
# -r ./extra-odt.in
|
# -r ./extra-odt.in
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
# python-docx
|
# python-docx
|
||||||
|
@ -8,7 +8,7 @@ attrdict==2.0.1
|
|||||||
# via unstructured-paddleocr
|
# via unstructured-paddleocr
|
||||||
babel==2.15.0
|
babel==2.15.0
|
||||||
# via flask-babel
|
# via flask-babel
|
||||||
bce-python-sdk==0.9.10
|
bce-python-sdk==0.9.11
|
||||||
# via visualdl
|
# via visualdl
|
||||||
blinker==1.8.2
|
blinker==1.8.2
|
||||||
# via flask
|
# via flask
|
||||||
@ -45,7 +45,7 @@ flask==3.0.3
|
|||||||
# visualdl
|
# visualdl
|
||||||
flask-babel==4.0.0
|
flask-babel==4.0.0
|
||||||
# via visualdl
|
# via visualdl
|
||||||
fonttools==4.51.0
|
fonttools==4.52.1
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
future==1.0.0
|
future==1.0.0
|
||||||
# via bce-python-sdk
|
# via bce-python-sdk
|
||||||
@ -200,7 +200,7 @@ six==1.16.0
|
|||||||
# imgaug
|
# imgaug
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
# visualdl
|
# visualdl
|
||||||
tifffile==2024.5.10
|
tifffile==2024.5.22
|
||||||
# via scikit-image
|
# via scikit-image
|
||||||
tqdm==4.66.4
|
tqdm==4.66.4
|
||||||
# via
|
# via
|
||||||
|
@ -39,7 +39,7 @@ filelock==3.14.0
|
|||||||
# transformers
|
# transformers
|
||||||
flatbuffers==24.3.25
|
flatbuffers==24.3.25
|
||||||
# via onnxruntime
|
# via onnxruntime
|
||||||
fonttools==4.51.0
|
fonttools==4.52.1
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
fsspec==2024.5.0
|
fsspec==2024.5.0
|
||||||
# via
|
# via
|
||||||
@ -118,7 +118,7 @@ numpy==1.26.4
|
|||||||
# transformers
|
# transformers
|
||||||
omegaconf==2.3.0
|
omegaconf==2.3.0
|
||||||
# via effdet
|
# via effdet
|
||||||
onnx==1.16.0
|
onnx==1.16.1
|
||||||
# via
|
# via
|
||||||
# -r ./extra-pdf-image.in
|
# -r ./extra-pdf-image.in
|
||||||
# unstructured-inference
|
# unstructured-inference
|
||||||
@ -278,7 +278,7 @@ tqdm==4.66.4
|
|||||||
# transformers
|
# transformers
|
||||||
transformers==4.41.1
|
transformers==4.41.1
|
||||||
# via unstructured-inference
|
# via unstructured-inference
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
|
@ -102,7 +102,7 @@ tqdm==4.66.4
|
|||||||
# transformers
|
# transformers
|
||||||
transformers==4.41.1
|
transformers==4.41.1
|
||||||
# via -r ./huggingface.in
|
# via -r ./huggingface.in
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
|
@ -31,7 +31,7 @@ requests==2.32.2
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pyairtable
|
# pyairtable
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pyairtable
|
# pyairtable
|
||||||
|
@ -34,7 +34,7 @@ six==1.16.0
|
|||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# azure-core
|
# azure-core
|
||||||
# isodate
|
# isodate
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# azure-core
|
# azure-core
|
||||||
|
@ -93,7 +93,7 @@ six==1.16.0
|
|||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# azure-core
|
# azure-core
|
||||||
# isodate
|
# isodate
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# azure-core
|
# azure-core
|
||||||
|
@ -198,7 +198,7 @@ typer==0.9.0
|
|||||||
# via
|
# via
|
||||||
# -r ./ingest/chroma.in
|
# -r ./ingest/chroma.in
|
||||||
# chromadb
|
# chromadb
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# chromadb
|
# chromadb
|
||||||
|
@ -15,7 +15,7 @@ charset-normalizer==3.3.2
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# requests
|
# requests
|
||||||
databricks-sdk==0.27.1
|
databricks-sdk==0.28.0
|
||||||
# via -r ./ingest/databricks-volumes.in
|
# via -r ./ingest/databricks-volumes.in
|
||||||
google-auth==2.29.0
|
google-auth==2.29.0
|
||||||
# via databricks-sdk
|
# via databricks-sdk
|
||||||
|
@ -11,7 +11,7 @@ certifi==2024.2.2
|
|||||||
# elastic-transport
|
# elastic-transport
|
||||||
elastic-transport==8.13.0
|
elastic-transport==8.13.0
|
||||||
# via elasticsearch
|
# via elasticsearch
|
||||||
elasticsearch==8.13.1
|
elasticsearch==8.13.2
|
||||||
# via -r ./ingest/elasticsearch.in
|
# via -r ./ingest/elasticsearch.in
|
||||||
urllib3==1.26.18
|
urllib3==1.26.18
|
||||||
# via
|
# via
|
||||||
|
@ -37,7 +37,6 @@ charset-normalizer==3.3.2
|
|||||||
dataclasses-json==0.6.6
|
dataclasses-json==0.6.6
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# langchain
|
|
||||||
# langchain-community
|
# langchain-community
|
||||||
frozenlist==1.4.1
|
frozenlist==1.4.1
|
||||||
# via
|
# via
|
||||||
@ -56,9 +55,9 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.0
|
langchain==0.2.1
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
langchain-community==0.2.0
|
langchain-community==0.2.1
|
||||||
# via -r ./ingest/embed-aws-bedrock.in
|
# via -r ./ingest/embed-aws-bedrock.in
|
||||||
langchain-core==0.2.1
|
langchain-core==0.2.1
|
||||||
# via
|
# via
|
||||||
@ -67,8 +66,9 @@ langchain-core==0.2.1
|
|||||||
# langchain-text-splitters
|
# langchain-text-splitters
|
||||||
langchain-text-splitters==0.2.0
|
langchain-text-splitters==0.2.0
|
||||||
# via langchain
|
# via langchain
|
||||||
langsmith==0.1.61
|
langsmith==0.1.62
|
||||||
# via
|
# via
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
@ -135,7 +135,7 @@ tenacity==8.3.0
|
|||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pydantic
|
# pydantic
|
||||||
|
@ -30,7 +30,6 @@ charset-normalizer==3.3.2
|
|||||||
dataclasses-json==0.6.6
|
dataclasses-json==0.6.6
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# langchain
|
|
||||||
# langchain-community
|
# langchain-community
|
||||||
filelock==3.14.0
|
filelock==3.14.0
|
||||||
# via
|
# via
|
||||||
@ -68,9 +67,9 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.0
|
langchain==0.2.1
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
langchain-community==0.2.0
|
langchain-community==0.2.1
|
||||||
# via -r ./ingest/embed-huggingface.in
|
# via -r ./ingest/embed-huggingface.in
|
||||||
langchain-core==0.2.1
|
langchain-core==0.2.1
|
||||||
# via
|
# via
|
||||||
@ -79,8 +78,9 @@ langchain-core==0.2.1
|
|||||||
# langchain-text-splitters
|
# langchain-text-splitters
|
||||||
langchain-text-splitters==0.2.0
|
langchain-text-splitters==0.2.0
|
||||||
# via langchain
|
# via langchain
|
||||||
langsmith==0.1.61
|
langsmith==0.1.62
|
||||||
# via
|
# via
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
@ -188,7 +188,7 @@ tqdm==4.66.4
|
|||||||
# transformers
|
# transformers
|
||||||
transformers==4.41.1
|
transformers==4.41.1
|
||||||
# via sentence-transformers
|
# via sentence-transformers
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
|
@ -38,7 +38,7 @@ idna==3.7
|
|||||||
# anyio
|
# anyio
|
||||||
# httpx
|
# httpx
|
||||||
# requests
|
# requests
|
||||||
openai==1.30.1
|
openai==1.30.3
|
||||||
# via -r ./ingest/embed-octoai.in
|
# via -r ./ingest/embed-octoai.in
|
||||||
pydantic==2.7.1
|
pydantic==2.7.1
|
||||||
# via openai
|
# via openai
|
||||||
@ -63,7 +63,7 @@ tqdm==4.66.4
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# openai
|
# openai
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# openai
|
# openai
|
||||||
|
@ -37,7 +37,6 @@ charset-normalizer==3.3.2
|
|||||||
dataclasses-json==0.6.6
|
dataclasses-json==0.6.6
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# langchain
|
|
||||||
# langchain-community
|
# langchain-community
|
||||||
distro==1.9.0
|
distro==1.9.0
|
||||||
# via openai
|
# via openai
|
||||||
@ -64,9 +63,9 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.0
|
langchain==0.2.1
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
langchain-community==0.2.0
|
langchain-community==0.2.1
|
||||||
# via -r ./ingest/embed-openai.in
|
# via -r ./ingest/embed-openai.in
|
||||||
langchain-core==0.2.1
|
langchain-core==0.2.1
|
||||||
# via
|
# via
|
||||||
@ -75,8 +74,9 @@ langchain-core==0.2.1
|
|||||||
# langchain-text-splitters
|
# langchain-text-splitters
|
||||||
langchain-text-splitters==0.2.0
|
langchain-text-splitters==0.2.0
|
||||||
# via langchain
|
# via langchain
|
||||||
langsmith==0.1.61
|
langsmith==0.1.62
|
||||||
# via
|
# via
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
@ -98,7 +98,7 @@ numpy==1.26.4
|
|||||||
# -c ./ingest/../deps/constraints.txt
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
openai==1.30.1
|
openai==1.30.3
|
||||||
# via -r ./ingest/embed-openai.in
|
# via -r ./ingest/embed-openai.in
|
||||||
orjson==3.10.3
|
orjson==3.10.3
|
||||||
# via langsmith
|
# via langsmith
|
||||||
@ -152,7 +152,7 @@ tqdm==4.66.4
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# openai
|
# openai
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# openai
|
# openai
|
||||||
|
@ -32,7 +32,6 @@ charset-normalizer==3.3.2
|
|||||||
dataclasses-json==0.6.6
|
dataclasses-json==0.6.6
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# langchain
|
|
||||||
# langchain-community
|
# langchain-community
|
||||||
docstring-parser==0.16
|
docstring-parser==0.16
|
||||||
# via google-cloud-aiplatform
|
# via google-cloud-aiplatform
|
||||||
@ -101,11 +100,11 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.0
|
langchain==0.2.1
|
||||||
# via
|
# via
|
||||||
# -r ./ingest/embed-vertexai.in
|
# -r ./ingest/embed-vertexai.in
|
||||||
# langchain-community
|
# langchain-community
|
||||||
langchain-community==0.2.0
|
langchain-community==0.2.1
|
||||||
# via -r ./ingest/embed-vertexai.in
|
# via -r ./ingest/embed-vertexai.in
|
||||||
langchain-core==0.2.1
|
langchain-core==0.2.1
|
||||||
# via
|
# via
|
||||||
@ -117,8 +116,9 @@ langchain-google-vertexai==1.0.4
|
|||||||
# via -r ./ingest/embed-vertexai.in
|
# via -r ./ingest/embed-vertexai.in
|
||||||
langchain-text-splitters==0.2.0
|
langchain-text-splitters==0.2.0
|
||||||
# via langchain
|
# via langchain
|
||||||
langsmith==0.1.61
|
langsmith==0.1.62
|
||||||
# via
|
# via
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
@ -215,7 +215,7 @@ tenacity==8.3.0
|
|||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pydantic
|
# pydantic
|
||||||
|
4
requirements/ingest/embed-voyageai.in
Normal file
4
requirements/ingest/embed-voyageai.in
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
-c ../deps/constraints.txt
|
||||||
|
-c ../base.txt
|
||||||
|
langchain
|
||||||
|
langchain-voyageai
|
116
requirements/ingest/embed-voyageai.txt
Normal file
116
requirements/ingest/embed-voyageai.txt
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
#
|
||||||
|
# This file is autogenerated by pip-compile with Python 3.9
|
||||||
|
# by the following command:
|
||||||
|
#
|
||||||
|
# pip-compile ./ingest/embed-voyageai.in
|
||||||
|
#
|
||||||
|
aiohttp==3.9.5
|
||||||
|
# via
|
||||||
|
# langchain
|
||||||
|
# voyageai
|
||||||
|
aiolimiter==1.1.0
|
||||||
|
# via voyageai
|
||||||
|
aiosignal==1.3.1
|
||||||
|
# via aiohttp
|
||||||
|
annotated-types==0.7.0
|
||||||
|
# via pydantic
|
||||||
|
async-timeout==4.0.3
|
||||||
|
# via
|
||||||
|
# aiohttp
|
||||||
|
# langchain
|
||||||
|
attrs==23.2.0
|
||||||
|
# via aiohttp
|
||||||
|
certifi==2024.2.2
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
|
# requests
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# requests
|
||||||
|
frozenlist==1.4.1
|
||||||
|
# via
|
||||||
|
# aiohttp
|
||||||
|
# aiosignal
|
||||||
|
idna==3.7
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# requests
|
||||||
|
# yarl
|
||||||
|
jsonpatch==1.33
|
||||||
|
# via langchain-core
|
||||||
|
jsonpointer==2.4
|
||||||
|
# via jsonpatch
|
||||||
|
langchain==0.2.1
|
||||||
|
# via -r ./ingest/embed-voyageai.in
|
||||||
|
langchain-core==0.2.1
|
||||||
|
# via
|
||||||
|
# langchain
|
||||||
|
# langchain-text-splitters
|
||||||
|
# langchain-voyageai
|
||||||
|
langchain-text-splitters==0.2.0
|
||||||
|
# via langchain
|
||||||
|
langchain-voyageai==0.1.1
|
||||||
|
# via -r ./ingest/embed-voyageai.in
|
||||||
|
langsmith==0.1.62
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
|
# langchain
|
||||||
|
# langchain-core
|
||||||
|
multidict==6.0.5
|
||||||
|
# via
|
||||||
|
# aiohttp
|
||||||
|
# yarl
|
||||||
|
numpy==1.26.4
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
|
# langchain
|
||||||
|
# voyageai
|
||||||
|
orjson==3.10.3
|
||||||
|
# via langsmith
|
||||||
|
packaging==23.2
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
|
# langchain-core
|
||||||
|
pydantic==2.7.1
|
||||||
|
# via
|
||||||
|
# langchain
|
||||||
|
# langchain-core
|
||||||
|
# langsmith
|
||||||
|
pydantic-core==2.18.2
|
||||||
|
# via pydantic
|
||||||
|
pyyaml==6.0.1
|
||||||
|
# via
|
||||||
|
# langchain
|
||||||
|
# langchain-core
|
||||||
|
requests==2.32.2
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# langchain
|
||||||
|
# langsmith
|
||||||
|
# voyageai
|
||||||
|
sqlalchemy==2.0.30
|
||||||
|
# via langchain
|
||||||
|
tenacity==8.3.0
|
||||||
|
# via
|
||||||
|
# langchain
|
||||||
|
# langchain-core
|
||||||
|
# voyageai
|
||||||
|
typing-extensions==4.12.0
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# pydantic
|
||||||
|
# pydantic-core
|
||||||
|
# sqlalchemy
|
||||||
|
urllib3==1.26.18
|
||||||
|
# via
|
||||||
|
# -c ./ingest/../base.txt
|
||||||
|
# -c ./ingest/../deps/constraints.txt
|
||||||
|
# requests
|
||||||
|
voyageai==0.2.2
|
||||||
|
# via langchain-voyageai
|
||||||
|
yarl==1.9.4
|
||||||
|
# via aiohttp
|
@ -37,7 +37,7 @@ requests==2.32.2
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pygithub
|
# pygithub
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pygithub
|
# pygithub
|
||||||
|
@ -17,7 +17,7 @@ charset-normalizer==3.3.2
|
|||||||
# requests
|
# requests
|
||||||
google-api-core==2.19.0
|
google-api-core==2.19.0
|
||||||
# via google-api-python-client
|
# via google-api-python-client
|
||||||
google-api-python-client==2.129.0
|
google-api-python-client==2.130.0
|
||||||
# via -r ./ingest/google-drive.in
|
# via -r ./ingest/google-drive.in
|
||||||
google-auth==2.29.0
|
google-auth==2.29.0
|
||||||
# via
|
# via
|
||||||
|
@ -15,7 +15,7 @@ tqdm==4.66.4
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pinecone-client
|
# pinecone-client
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pinecone-client
|
# pinecone-client
|
||||||
|
@ -62,7 +62,7 @@ sniffio==1.3.1
|
|||||||
# via
|
# via
|
||||||
# anyio
|
# anyio
|
||||||
# httpx
|
# httpx
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pydantic
|
# pydantic
|
||||||
|
@ -51,7 +51,7 @@ six==1.16.0
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# aioitertools
|
# aioitertools
|
||||||
|
@ -58,7 +58,7 @@ six==1.16.0
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# isodate
|
# isodate
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# simple-salesforce
|
# simple-salesforce
|
||||||
|
@ -69,7 +69,7 @@ sniffio==1.3.1
|
|||||||
# via
|
# via
|
||||||
# anyio
|
# anyio
|
||||||
# httpx
|
# httpx
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# pydantic
|
# pydantic
|
||||||
|
@ -130,7 +130,7 @@ rpds-py==0.18.1
|
|||||||
# via
|
# via
|
||||||
# jsonschema
|
# jsonschema
|
||||||
# referencing
|
# referencing
|
||||||
ruff==0.4.4
|
ruff==0.4.5
|
||||||
# via -r ./test.in
|
# via -r ./test.in
|
||||||
six==1.16.0
|
six==1.16.0
|
||||||
# via
|
# via
|
||||||
@ -153,7 +153,7 @@ types-tabulate==0.9.0.20240106
|
|||||||
# via -r ./test.in
|
# via -r ./test.in
|
||||||
types-urllib3==1.26.25.14
|
types-urllib3==1.26.25.14
|
||||||
# via types-requests
|
# via types-requests
|
||||||
typing-extensions==4.11.0
|
typing-extensions==4.12.0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
# black
|
# black
|
||||||
|
@ -12,5 +12,3 @@ pushd ./requirements || exit
|
|||||||
make clean
|
make clean
|
||||||
make all
|
make all
|
||||||
popd || exit
|
popd || exit
|
||||||
|
|
||||||
cp requirements/build.txt docs/requirements.txt
|
|
||||||
|
1
setup.py
1
setup.py
@ -171,6 +171,7 @@ setup(
|
|||||||
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
|
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
|
||||||
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
|
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
|
||||||
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
|
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
|
||||||
|
"embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),
|
||||||
"openai": load_requirements("requirements/ingest/embed-openai.in"),
|
"openai": load_requirements("requirements/ingest/embed-openai.in"),
|
||||||
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
|
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
|
||||||
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),
|
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"),
|
||||||
|
21
test_unstructured/embed/test_voyageai.py
Normal file
21
test_unstructured/embed/test_voyageai.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
from unstructured.documents.elements import Text
|
||||||
|
from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
|
||||||
|
|
||||||
|
|
||||||
|
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Embedding elements must attach embeddings and keep ``to_dict()`` working.

    The Voyage AI client is replaced by a mock, so no network call is made.
    """
    # Mocked client with the desired behavior for embed_documents
    mock_client = mocker.MagicMock()
    mock_client.embed_documents.return_value = [1, 2]

    # Mock create_client to return our mock_client
    mocker.patch.object(VoyageAIEmbeddingEncoder, "create_client", return_value=mock_client)

    encoder = VoyageAIEmbeddingEncoder(
        config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2")
    )
    elements = encoder.embed_documents(
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
    )

    assert len(elements) == 2
    assert elements[0].to_dict()["text"] == "This is sentence 1"
    assert elements[1].to_dict()["text"] == "This is sentence 2"

    # The embeddings returned by the client must be attached in order, and the
    # whole batch must be sent in a single client call.
    assert [element.embeddings for element in elements] == [1, 2]
    mock_client.embed_documents.assert_called_once()
|
File diff suppressed because it is too large
Load Diff
41
test_unstructured_ingest/src/local-embed-voyageai.sh
Executable file
41
test_unstructured_ingest/src/local-embed-voyageai.sh
Executable file
@ -0,0 +1,41 @@
|
|||||||
|
#!/usr/bin/env bash

# Ingest test: partition a local text file, embed it with the
# "langchain-voyageai" provider, then compare against the expected output.
# Requires VOYAGE_API_KEY to be set in the environment.

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=embed-voyageai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# The original line `VOYAGE_API_KEY=${VOYAGE_API_KEY:-$VOYAGE_API_KEY}` fell back
# to itself and therefore did nothing. Fail fast with a clear message instead of
# letting the run fail later with an opaque authentication error.
# NOTE(review): if the test runner uses a dedicated "skipped" exit code for
# missing credentials, use it here instead of 1.
if [ -z "$VOYAGE_API_KEY" ]; then
  echo "VOYAGE_API_KEY is not set; cannot run the VoyageAI embedding ingest test." >&2
  exit 1
fi

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
# Remove output and work directories however the script exits.
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1p.txt \
  --work-dir "$WORK_DIR" \
  --embedding-provider "langchain-voyageai" \
  --embedding-api-key "$VOYAGE_API_KEY" \
  --embedding-model-name "voyage-large-2"

# From here on, a failing check must not abort the script before the
# remaining step runs.
set +e

"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
@ -59,6 +59,7 @@ all_tests=(
|
|||||||
'local-embed-bedrock.sh'
|
'local-embed-bedrock.sh'
|
||||||
'local-embed-octoai.sh'
|
'local-embed-octoai.sh'
|
||||||
'local-embed-vertexai.sh'
|
'local-embed-vertexai.sh'
|
||||||
|
'local-embed-voyageai.sh'
|
||||||
'sftp.sh'
|
'sftp.sh'
|
||||||
'opensearch.sh'
|
'opensearch.sh'
|
||||||
# NOTE(robinson) - mongo conflicts with astra because it ships with its
|
# NOTE(robinson) - mongo conflicts with astra because it ships with its
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.14.3-dev4" # pragma: no cover
|
__version__ = "0.14.3-dev5" # pragma: no cover
|
||||||
|
@ -3,11 +3,13 @@ from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
|
|||||||
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
|
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
|
||||||
from unstructured.embed.openai import OpenAIEmbeddingEncoder
|
from unstructured.embed.openai import OpenAIEmbeddingEncoder
|
||||||
from unstructured.embed.vertexai import VertexAIEmbeddingEncoder
|
from unstructured.embed.vertexai import VertexAIEmbeddingEncoder
|
||||||
|
from unstructured.embed.voyageai import VoyageAIEmbeddingEncoder
|
||||||
|
|
||||||
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
|
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
|
||||||
"langchain-openai": OpenAIEmbeddingEncoder,
|
"langchain-openai": OpenAIEmbeddingEncoder,
|
||||||
"langchain-huggingface": HuggingFaceEmbeddingEncoder,
|
"langchain-huggingface": HuggingFaceEmbeddingEncoder,
|
||||||
"langchain-aws-bedrock": BedrockEmbeddingEncoder,
|
"langchain-aws-bedrock": BedrockEmbeddingEncoder,
|
||||||
"langchain-vertexai": VertexAIEmbeddingEncoder,
|
"langchain-vertexai": VertexAIEmbeddingEncoder,
|
||||||
|
"langchain-voyageai": VoyageAIEmbeddingEncoder,
|
||||||
"octoai": OctoAIEmbeddingEncoder,
|
"octoai": OctoAIEmbeddingEncoder,
|
||||||
}
|
}
|
||||||
|
82
unstructured/embed/voyageai.py
Normal file
82
unstructured/embed/voyageai.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import TYPE_CHECKING, List, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Element
|
||||||
|
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
||||||
|
from unstructured.ingest.error import EmbeddingEncoderConnectionError
|
||||||
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from langchain_voyageai import VoyageAIEmbeddings
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VoyageAIEmbeddingConfig(EmbeddingConfig):
    """Settings used to build the Voyage AI embeddings client.

    ``api_key`` and ``model_name`` are required; the remaining options default
    to ``None`` and are forwarded to the client unchanged.
    """

    # Voyage AI API key, passed to the client as ``voyage_api_key``.
    api_key: str
    # Name of the Voyage AI embedding model, passed to the client as ``model``.
    model_name: str
    # Optional batch size forwarded to the client as ``batch_size``.
    batch_size: Optional[int] = None
    # Optional truncation flag forwarded to the client as ``truncation``.
    truncation: Optional[bool] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder backed by the Langchain ``VoyageAIEmbeddings`` client.

    The client is created lazily on first access to ``client``. A sample query
    is embedded once and cached so that ``num_of_dimensions`` and
    ``is_unit_vector`` can be derived from it.
    """

    config: VoyageAIEmbeddingConfig
    # Lazily created client; populated on first access to ``client``.
    _client: Optional["VoyageAIEmbeddings"] = field(init=False, default=None)
    # Cached embedding of a fixed sample query; see ``exemplary_embedding``.
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "VoyageAIEmbeddings":
        """Return the embeddings client, creating it on first use."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        """Return the cached embedding of a fixed sample query.

        The first access calls the client's ``embed_query``; later accesses
        reuse the stored result.
        """
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.client.embed_query("A sample query.")
        return self._exemplary_embedding

    def initialize(self):
        """No-op: the client is created lazily by the ``client`` property."""
        pass

    @property
    def num_of_dimensions(self) -> tuple[int, ...]:
        """Return the shape of the exemplary embedding as reported by ``np.shape``."""
        return np.shape(self.exemplary_embedding)

    @property
    def is_unit_vector(self) -> bool:
        """Return whether the exemplary embedding has a norm close to 1.0."""
        # ``np.isclose`` returns a NumPy boolean; convert it so the value
        # matches the annotated built-in ``bool`` return type.
        return bool(np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0))

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed the string form of each element and attach the result to it.

        Args:
            elements: Elements to embed; each is converted with ``str()``.

        Returns:
            The same element objects with ``embeddings`` set.
        """
        embeddings = self.client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def embed_query(self, query: str) -> List[float]:
        """Return the client's embedding for a single query string."""
        return self.client.embed_query(query)

    @staticmethod
    def _add_embeddings_to_elements(elements, embeddings) -> List[Element]:
        """Assign ``embeddings[i]`` to ``elements[i].embeddings`` in place.

        Raises:
            AssertionError: If the two sequences differ in length.
        """
        assert len(elements) == len(embeddings)
        # Pair each element with its embedding directly; no intermediate list
        # is needed because the input elements are mutated and returned.
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["langchain", "langchain_voyageai"],
        extras="embed-voyageai",
    )
    def create_client(self) -> "VoyageAIEmbeddings":
        """Creates a Langchain VoyageAI python client to embed elements."""
        # Imported here so the optional dependency is only needed when a
        # client is actually created.
        from langchain_voyageai import VoyageAIEmbeddings

        return VoyageAIEmbeddings(
            voyage_api_key=self.config.api_key,
            model=self.config.model_name,
            batch_size=self.config.batch_size,
            truncation=self.config.truncation,
        )
|
@ -234,6 +234,13 @@ class EmbeddingConfig(BaseConfig):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
|
return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
|
||||||
|
elif self.provider == "langchain-voyageai":
|
||||||
|
from unstructured.embed.voyageai import (
|
||||||
|
VoyageAIEmbeddingConfig,
|
||||||
|
VoyageAIEmbeddingEncoder,
|
||||||
|
)
|
||||||
|
|
||||||
|
return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs))
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"{self.provider} not a recognized encoder")
|
raise ValueError(f"{self.provider} not a recognized encoder")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user