mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
feat: add vertexai embeddings (#2693)
This PR: - Adds VertexAI embeddings as an embedding provider Testing - Tested with pinecone destination connector on [this](https://github.com/Unstructured-IO/unstructured/actions/runs/8429035114/job/23082700074?pr=2693) job run. --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent
887e6c9094
commit
d46792214a
@ -1,11 +1,12 @@
|
||||
## 0.13.0-dev13
|
||||
## 0.13.0-dev14
|
||||
|
||||
### Enhancements
|
||||
### Enhancements
|
||||
|
||||
* **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks.
|
||||
* **Add `composite_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element-level row and column index and content accuracy scores.
|
||||
* **Add `.metadata.orig_elements` to chunks.** `.metadata.orig_elements: list[Element]` is added to chunks during the chunking process (when requested) to allow access to information from the elements each chunk was formed from. This is useful for example to recover metadata fields that cannot be consolidated to a single value for a chunk, like `page_number`, `coordinates`, and `image_base64`.
|
||||
* **Add `--include_orig_elements` option to Ingest CLI.** By default, when chunking, the original elements used to form each chunk are added to `chunk.metadata.orig_elements` for each chunk. The `include_orig_elements` parameter allows the user to turn off this behavior to produce a smaller payload when they don't need this metadata.
|
||||
* **Add Google VertexAI embedder.** Adds VertexAI embeddings to support embedding via Google Vertex AI.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -171,6 +171,59 @@ To obtain an api key, visit: https://octo.ai/docs/getting-started/how-to-create-
|
||||
query = "This is the query"
|
||||
query_embedding = embedding_encoder.embed_query(query=query)
|
||||
|
||||
[print(e.embeddings, e) for e in elements]
|
||||
print(query_embedding, query)
|
||||
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
|
||||
|
||||
``VertexAIEmbeddingEncoder``
|
||||
--------------------------
|
||||
|
||||
The ``VertexAIEmbeddingEncoder`` class connects to the GCP VertexAI to obtain embeddings for pieces of text.
|
||||
|
||||
``embed_documents`` will receive a list of Elements, and return an updated list which
|
||||
includes the ``embeddings`` attribute for each Element.
|
||||
|
||||
``embed_query`` will receive a query as a string, and return a list of floats which is the
|
||||
embedding vector for the given query string.
|
||||
|
||||
``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any
|
||||
embedding vector obtained via this class.
|
||||
|
||||
``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via
|
||||
this class are unit vectors.
|
||||
|
||||
The following code block shows an example of how to use ``VertexAIEmbeddingEncoder``. You will
|
||||
see the updated elements list (with the ``embeddings`` attribute included for each element),
|
||||
the embedding vector for the query string, and some metadata properties about the embedding model.
|
||||
|
||||
To use Vertex AI PaLM you will need to:
|
||||
- either, pass the full json content of your GCP VertexAI application credentials to the
|
||||
VertexAIEmbeddingConfig as the api_key parameter. (This will create a file in the ``/tmp``
|
||||
directory with the content of the json, and set the GOOGLE_APPLICATION_CREDENTIALS environment
|
||||
variable to the **path** of the created file.)
|
||||
- or, you'll need to store the path to a manually created service account JSON file as the
|
||||
GOOGLE_APPLICATION_CREDENTIALS environment variable. (For more information:
|
||||
https://python.langchain.com/docs/integrations/text_embedding/google_vertex_ai_palm)
|
||||
- or, you'll need to have the credentials configured for your environment (gcloud,
|
||||
workload identity, etc…)
|
||||
|
||||
.. code:: python
|
||||
|
||||
import os
|
||||
|
||||
from unstructured.documents.elements import Text
|
||||
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
||||
|
||||
embedding_encoder = VertexAIEmbeddingEncoder(
|
||||
config=VertexAIEmbeddingConfig(api_key=os.environ["VERTEXAI_GCP_APP_CREDS_JSON_CONTENT"])
|
||||
)
|
||||
elements = embedding_encoder.embed_documents(
|
||||
elements=[Text("This is sentence 1"), Text("This is sentence 2")],
|
||||
)
|
||||
|
||||
query = "This is the query"
|
||||
query_embedding = embedding_encoder.embed_query(query=query)
|
||||
|
||||
[print(e.embeddings, e) for e in elements]
|
||||
print(query_embedding, query)
|
||||
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
|
||||
30
examples/embed/example_vertexai.py
Normal file
30
examples/embed/example_vertexai.py
Normal file
@ -0,0 +1,30 @@
|
||||
import os
|
||||
|
||||
from unstructured.documents.elements import Text
|
||||
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
||||
|
||||
# To use Vertex AI PaLM you will need to:
|
||||
# - either, pass the full json content of your GCP VertexAI application credentials to the
|
||||
# VertexAIEmbeddingConfig as the api_key parameter. (This will create a file in the ``/tmp``
|
||||
# directory with the content of the json, and set the GOOGLE_APPLICATION_CREDENTIALS environment
|
||||
# variable to the **path** of the created file.)
|
||||
# - or, you'll need to store the path to a manually created service account JSON file as the
|
||||
# GOOGLE_APPLICATION_CREDENTIALS environment variable. (For more information:
|
||||
# https://python.langchain.com/docs/integrations/text_embedding/google_vertex_ai_palm)
|
||||
# - or, you'll need to have the credentials configured for your environment (gcloud,
|
||||
# workload identity, etc…)
|
||||
|
||||
# Build the encoder from the JSON credentials content held in the
# VERTEXAI_GCP_APP_CREDS_JSON_CONTENT environment variable.
embedding_encoder = VertexAIEmbeddingEncoder(
    config=VertexAIEmbeddingConfig(api_key=os.environ["VERTEXAI_GCP_APP_CREDS_JSON_CONTENT"])
)

# Embed two sample elements; the returned elements carry an ``embeddings`` attribute.
elements = embedding_encoder.embed_documents(
    elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

# Embed a free-text query with the same encoder.
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

# Print each element's embedding next to the element itself. A plain loop is
# used because the prints are side effects; no list needs to be built.
for element in elements:
    print(element.embeddings, element)
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
|
||||
4
requirements/ingest/embed-octoai.in
Normal file
4
requirements/ingest/embed-octoai.in
Normal file
@ -0,0 +1,4 @@
|
||||
-c ../constraints.in
|
||||
-c ../base.txt
|
||||
openai
|
||||
tiktoken
|
||||
72
requirements/ingest/embed-octoai.txt
Normal file
72
requirements/ingest/embed-octoai.txt
Normal file
@ -0,0 +1,72 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --output-file=ingest/embed-octoai.txt ingest/embed-octoai.in
|
||||
#
|
||||
anyio==3.7.1
|
||||
# via
|
||||
# -c ingest/../constraints.in
|
||||
# httpx
|
||||
# openai
|
||||
certifi==2024.2.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
distro==1.9.0
|
||||
# via openai
|
||||
exceptiongroup==1.2.0
|
||||
# via anyio
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
httpcore==1.0.4
|
||||
# via httpx
|
||||
httpx==0.27.0
|
||||
# via openai
|
||||
idna==3.6
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
openai==1.14.3
|
||||
# via -r ingest/embed-octoai.in
|
||||
pydantic==1.10.14
|
||||
# via
|
||||
# -c ingest/../constraints.in
|
||||
# openai
|
||||
regex==2023.12.25
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# tiktoken
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# tiktoken
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# anyio
|
||||
# httpx
|
||||
# openai
|
||||
tiktoken==0.6.0
|
||||
# via -r ingest/embed-octoai.in
|
||||
tqdm==4.66.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# openai
|
||||
typing-extensions==4.10.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# openai
|
||||
# pydantic
|
||||
urllib3==2.2.1
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
5
requirements/ingest/embed-vertexai.in
Normal file
5
requirements/ingest/embed-vertexai.in
Normal file
@ -0,0 +1,5 @@
|
||||
-c ../constraints.in
|
||||
-c ../base.txt
|
||||
langchain
|
||||
langchain-community
|
||||
langchain-google-vertexai
|
||||
243
requirements/ingest/embed-vertexai.txt
Normal file
243
requirements/ingest/embed-vertexai.txt
Normal file
@ -0,0 +1,243 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --output-file=ingest/embed-vertexai.txt ingest/embed-vertexai.in
|
||||
#
|
||||
aiohttp==3.9.3
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
anyio==3.7.1
|
||||
# via
|
||||
# -c ingest/../constraints.in
|
||||
# langchain-core
|
||||
async-timeout==4.0.3
|
||||
# via
|
||||
# aiohttp
|
||||
# langchain
|
||||
attrs==23.2.0
|
||||
# via aiohttp
|
||||
cachetools==5.3.3
|
||||
# via google-auth
|
||||
certifi==2024.2.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
dataclasses-json==0.6.4
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
exceptiongroup==1.2.0
|
||||
# via anyio
|
||||
frozenlist==1.4.1
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
google-api-core[grpc]==2.18.0
|
||||
# via
|
||||
# google-cloud-aiplatform
|
||||
# google-cloud-bigquery
|
||||
# google-cloud-core
|
||||
# google-cloud-resource-manager
|
||||
# google-cloud-storage
|
||||
google-auth==2.29.0
|
||||
# via
|
||||
# google-api-core
|
||||
# google-cloud-aiplatform
|
||||
# google-cloud-bigquery
|
||||
# google-cloud-core
|
||||
# google-cloud-resource-manager
|
||||
# google-cloud-storage
|
||||
google-cloud-aiplatform==1.44.0
|
||||
# via langchain-google-vertexai
|
||||
google-cloud-bigquery==3.19.0
|
||||
# via google-cloud-aiplatform
|
||||
google-cloud-core==2.4.1
|
||||
# via
|
||||
# google-cloud-bigquery
|
||||
# google-cloud-storage
|
||||
google-cloud-resource-manager==1.12.3
|
||||
# via google-cloud-aiplatform
|
||||
google-cloud-storage==2.16.0
|
||||
# via
|
||||
# google-cloud-aiplatform
|
||||
# langchain-google-vertexai
|
||||
google-crc32c==1.5.0
|
||||
# via
|
||||
# google-cloud-storage
|
||||
# google-resumable-media
|
||||
google-resumable-media==2.7.0
|
||||
# via
|
||||
# google-cloud-bigquery
|
||||
# google-cloud-storage
|
||||
googleapis-common-protos[grpc]==1.63.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpc-google-iam-v1
|
||||
# grpcio-status
|
||||
grpc-google-iam-v1==0.13.0
|
||||
# via google-cloud-resource-manager
|
||||
grpcio==1.62.1
|
||||
# via
|
||||
# google-api-core
|
||||
# googleapis-common-protos
|
||||
# grpc-google-iam-v1
|
||||
# grpcio-status
|
||||
grpcio-status==1.62.1
|
||||
# via google-api-core
|
||||
idna==3.6
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# anyio
|
||||
# requests
|
||||
# yarl
|
||||
jsonpatch==1.33
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.1.13
|
||||
# via -r ingest/embed-vertexai.in
|
||||
langchain-community==0.0.29
|
||||
# via
|
||||
# -r ingest/embed-vertexai.in
|
||||
# langchain
|
||||
langchain-core==0.1.33
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-google-vertexai
|
||||
# langchain-text-splitters
|
||||
langchain-google-vertexai==0.1.1
|
||||
# via -r ingest/embed-vertexai.in
|
||||
langchain-text-splitters==0.0.1
|
||||
# via langchain
|
||||
langsmith==0.1.31
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
marshmallow==3.21.1
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# dataclasses-json
|
||||
multidict==6.0.5
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
mypy-extensions==1.0.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# typing-inspect
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
# shapely
|
||||
orjson==3.9.15
|
||||
# via langsmith
|
||||
packaging==23.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# google-cloud-aiplatform
|
||||
# google-cloud-bigquery
|
||||
# langchain-core
|
||||
# marshmallow
|
||||
proto-plus==1.23.0
|
||||
# via
|
||||
# google-api-core
|
||||
# google-cloud-aiplatform
|
||||
# google-cloud-resource-manager
|
||||
protobuf==4.23.4
|
||||
# via
|
||||
# -c ingest/../constraints.in
|
||||
# google-api-core
|
||||
# google-cloud-aiplatform
|
||||
# google-cloud-resource-manager
|
||||
# googleapis-common-protos
|
||||
# grpc-google-iam-v1
|
||||
# grpcio-status
|
||||
# proto-plus
|
||||
pyasn1==0.5.1
|
||||
# via
|
||||
# pyasn1-modules
|
||||
# rsa
|
||||
pyasn1-modules==0.3.0
|
||||
# via google-auth
|
||||
pydantic==1.10.14
|
||||
# via
|
||||
# -c ingest/../constraints.in
|
||||
# langchain
|
||||
# langchain-core
|
||||
# langsmith
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# google-cloud-bigquery
|
||||
pyyaml==6.0.1
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# google-api-core
|
||||
# google-cloud-bigquery
|
||||
# google-cloud-storage
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
# langsmith
|
||||
rsa==4.9
|
||||
# via google-auth
|
||||
shapely==2.0.3
|
||||
# via google-cloud-aiplatform
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via anyio
|
||||
sqlalchemy==2.0.29
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
tenacity==8.2.3
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-core
|
||||
types-protobuf==4.24.0.20240311
|
||||
# via langchain-google-vertexai
|
||||
types-requests==2.31.0.20240311
|
||||
# via langchain-google-vertexai
|
||||
typing-extensions==4.10.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# pydantic
|
||||
# sqlalchemy
|
||||
# typing-inspect
|
||||
typing-inspect==0.9.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# dataclasses-json
|
||||
urllib3==2.2.1
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
# types-requests
|
||||
yarl==1.9.4
|
||||
# via aiohttp
|
||||
2
setup.py
2
setup.py
@ -169,6 +169,8 @@ setup(
|
||||
"local-inference": all_doc_reqs,
|
||||
"paddleocr": load_requirements("requirements/extra-paddleocr.txt"),
|
||||
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.txt"),
|
||||
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.txt"),
|
||||
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.txt"),
|
||||
"openai": load_requirements("requirements/ingest/embed-openai.txt"),
|
||||
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.txt"),
|
||||
"databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.txt"),
|
||||
|
||||
19
test_unstructured/embed/test_vertexai.py
Normal file
19
test_unstructured/embed/test_vertexai.py
Normal file
@ -0,0 +1,19 @@
|
||||
from unstructured.documents.elements import Text
|
||||
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
|
||||
|
||||
|
||||
def test_embed_documents_does_not_break_element_to_dict(mocker):
    """Elements returned by ``embed_documents`` must still serialize via ``to_dict()``."""
    # Stand-in client whose embed_documents yields one fake embedding per element.
    fake_client = mocker.MagicMock()
    fake_client.embed_documents.return_value = [1, 2]

    # Route create_client to the stand-in so no real client is constructed.
    mocker.patch.object(VertexAIEmbeddingEncoder, "create_client", return_value=fake_client)

    first = Text("This is sentence 1")
    second = Text("This is sentence 2")
    encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key="api_key"))
    embedded = encoder.embed_documents(elements=[first, second])

    assert len(embedded) == 2
    serialized_texts = [element.to_dict()["text"] for element in embedded]
    assert serialized_texts[0] == "This is sentence 1"
    assert serialized_texts[1] == "This is sentence 2"
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
40
test_unstructured_ingest/src/local-embed-octoai.sh
Executable file
40
test_unstructured_ingest/src/local-embed-octoai.sh
Executable file
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env bash

# Ingest test: run the local source connector over a one-page text document
# with the "octoai" embedding provider, then compare the structured output
# against the expected output for the embed-octoai fixture folder.

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=embed-octoai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
# Take the key from the environment; an unset key becomes the empty string.
# (The previous form defaulted the variable to itself, which has the same effect.)
OCTOAI_API_KEY=${OCTOAI_API_KEY:-}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

# Remove the output and work directories whenever the script exits.
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1p.txt \
  --work-dir "$WORK_DIR" \
  --embedding-provider "octoai" \
  --embedding-api-key "$OCTOAI_API_KEY"

# From here on, a failing check must not abort before the remaining step runs.
set +e

# Quoted so the argument is never subject to word splitting or globbing.
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||
41
test_unstructured_ingest/src/local-embed-vertexai.sh
Executable file
41
test_unstructured_ingest/src/local-embed-vertexai.sh
Executable file
@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env bash

# Ingest test: run the local source connector over a one-page text document
# with the "langchain-vertexai" embedding provider, then compare the structured
# output against the expected output for the embed-vertexai fixture folder.

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=embed-vertexai
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
# Take the service key from the environment; an unset key becomes the empty string.
# (The previous form defaulted the variable to itself, which has the same effect.)
GCP_INGEST_SERVICE_KEY=${GCP_INGEST_SERVICE_KEY:-}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

# Remove the output and work directories whenever the script exits.
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1p.txt \
  --work-dir "$WORK_DIR" \
  --embedding-provider "langchain-vertexai" \
  --embedding-api-key "$GCP_INGEST_SERVICE_KEY" \
  --embedding-model-name "textembedding-gecko@001"

# From here on, a failing check must not abort before the remaining step runs.
set +e

# Quoted so the argument is never subject to word splitting or globbing.
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||
@ -57,6 +57,8 @@ all_tests=(
|
||||
'hubspot.sh'
|
||||
'local-embed.sh'
|
||||
'local-embed-bedrock.sh'
|
||||
'local-embed-octoai.sh'
|
||||
'local-embed-vertexai.sh'
|
||||
'sftp.sh'
|
||||
'mongodb.sh'
|
||||
'opensearch.sh'
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.13.0-dev13" # pragma: no cover
|
||||
__version__ = "0.13.0-dev14" # pragma: no cover
|
||||
|
||||
@ -2,10 +2,12 @@ from unstructured.embed.bedrock import BedrockEmbeddingEncoder
|
||||
from unstructured.embed.huggingface import HuggingFaceEmbeddingEncoder
|
||||
from unstructured.embed.octoai import OctoAIEmbeddingEncoder
|
||||
from unstructured.embed.openai import OpenAIEmbeddingEncoder
|
||||
from unstructured.embed.vertexai import VertexAIEmbeddingEncoder
|
||||
|
||||
EMBEDDING_PROVIDER_TO_CLASS_MAP = {
|
||||
"langchain-openai": OpenAIEmbeddingEncoder,
|
||||
"langchain-huggingface": HuggingFaceEmbeddingEncoder,
|
||||
"langchain-aws-bedrock": BedrockEmbeddingEncoder,
|
||||
"langchain-vertexai": VertexAIEmbeddingEncoder,
|
||||
"octoai": OctoAIEmbeddingEncoder,
|
||||
}
|
||||
|
||||
@ -70,7 +70,7 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
||||
@EmbeddingEncoderConnectionError.wrap
|
||||
@requires_dependencies(
|
||||
["openai", "tiktoken"],
|
||||
extras="embed-openai",
|
||||
extras="embed-octoai",
|
||||
)
|
||||
def create_client(self) -> "OpenAI":
|
||||
"""Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
|
||||
|
||||
90
unstructured/embed/vertexai.py
Normal file
90
unstructured/embed/vertexai.py
Normal file
@ -0,0 +1,90 @@
|
||||
# type: ignore
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
)
|
||||
from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
||||
from unstructured.ingest.error import EmbeddingEncoderConnectionError
|
||||
from unstructured.utils import FileHandler, requires_dependencies
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_google_vertexai import VertexAIEmbeddings
|
||||
|
||||
|
||||
@dataclass
class VertexAIEmbeddingConfig(EmbeddingConfig):
    """Configuration values for the Vertex AI embedding encoder."""

    # Credentials string; required, no default.
    # NOTE(review): presumably the full JSON content of a GCP service-account
    # key rather than a short API token -- confirm against the encoder.
    api_key: str

    # Name of the embedding model to request.
    model_name: Optional[str] = "textembedding-gecko@001"
|
||||
|
||||
|
||||
@dataclass
class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embed elements and queries through LangChain's ``VertexAIEmbeddings`` client.

    The client is created lazily on first access of ``client``. Creating it
    writes the credentials JSON from ``config.api_key`` to a file and points
    ``GOOGLE_APPLICATION_CREDENTIALS`` at that file.
    """

    config: VertexAIEmbeddingConfig
    # Lazily-created client; not an __init__ argument.
    _client: Optional["VertexAIEmbeddings"] = field(init=False, default=None)
    # Cached embedding of a fixed sample query, used for the metadata methods.
    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)

    @property
    def client(self) -> "VertexAIEmbeddings":
        """Return the embeddings client, creating it on first use."""
        if self._client is None:
            self._client = self.create_client()
        return self._client

    @property
    def exemplary_embedding(self) -> List[float]:
        """Return the embedding of a fixed sample query, computed once and cached."""
        if self._exemplary_embedding is None:
            self._exemplary_embedding = self.client.embed_query("A sample query.")
        return self._exemplary_embedding

    def initialize(self):
        """Do nothing; the client is built lazily by ``client``."""
        pass

    def num_of_dimensions(self):
        """Return ``np.shape`` of the exemplary embedding (a shape tuple)."""
        return np.shape(self.exemplary_embedding)

    def is_unit_vector(self):
        """Return whether the exemplary embedding's L2 norm is close to 1.0."""
        return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0)

    def embed_query(self, query):
        """Return the embedding for ``query`` after converting it with ``str()``."""
        return self.client.embed_query(str(query))

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed the ``str()`` form of each element and attach the result.

        The elements are updated in place and the same list is returned.
        """
        embeddings = self.client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
        """Set ``element.embeddings`` pairwise from ``embeddings`` and return ``elements``.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        assert len(elements) == len(
            embeddings
        ), "number of embeddings does not match number of elements"
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @property
    def application_credentials_path(self):
        """Return the fixed path the credentials JSON is written to."""
        return os.path.join("/tmp", "google-vertex-app-credentials.json")

    def register_application_credentials(self):
        """Write the credentials JSON to disk and export its path.

        ``config.api_key`` is parsed as JSON and re-serialized, so malformed
        content raises before anything is written. The file is created
        owner-read/write only because it holds secret key material and lives
        in a shared temp directory.
        """
        credentials_path = self.application_credentials_path
        credentials_json = json.dumps(json.loads(self.config.api_key))

        # Create the file with 0600 before any secret is written, then tighten
        # the mode in case a more permissive file already existed at the path.
        fd = os.open(credentials_path, os.O_WRONLY | os.O_CREAT, 0o600)
        os.close(fd)
        os.chmod(credentials_path, 0o600)

        FileHandler(credentials_path).write_file(credentials_json)
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    # NOTE(review): presumably `wrap` converts failures into
    # EmbeddingEncoderConnectionError and `requires_dependencies` checks that
    # the listed packages are importable -- confirm in their definitions.
    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["langchain", "langchain_google_vertexai"],
        extras="embed-vertexai",
    )
    def create_client(self) -> "VertexAIEmbeddings":
        """Creates a Langchain VertexAI python client to embed elements."""
        from langchain_google_vertexai import VertexAIEmbeddings

        # Credentials must be registered before the client is constructed.
        self.register_application_credentials()
        return VertexAIEmbeddings(model_name=self.config.model_name)
|
||||
@ -222,6 +222,13 @@ class EmbeddingConfig(BaseConfig):
|
||||
region_name=self.aws_region,
|
||||
)
|
||||
)
|
||||
elif self.provider == "langchain-vertexai":
|
||||
from unstructured.embed.vertexai import (
|
||||
VertexAIEmbeddingConfig,
|
||||
VertexAIEmbeddingEncoder,
|
||||
)
|
||||
|
||||
return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
|
||||
else:
|
||||
raise ValueError(f"{self.provider} not a recognized encoder")
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ import json
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
from itertools import combinations
|
||||
@ -773,3 +774,25 @@ def catch_overlapping_and_nested_bboxes(
|
||||
document_with_overlapping_flag = True
|
||||
|
||||
return document_with_overlapping_flag, overlapping_cases
|
||||
|
||||
|
||||
class FileHandler:
    """Read, write and delete one file, guarding each operation with a lock.

    The lock belongs to the instance, so it only serializes callers that
    share the same ``FileHandler`` object.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path`` and create the lock that guards access to it."""
        self.file_path = file_path
        self.lock = threading.Lock()

    def read_file(self) -> str:
        """Return the entire text content of the file."""
        with self.lock, open(self.file_path) as file:
            return file.read()

    def write_file(self, data: str) -> None:
        """Replace the file's content with ``data``, creating the file if needed."""
        with self.lock, open(self.file_path, "w") as file:
            file.write(data)

    def cleanup_file(self) -> None:
        """Delete the file; a file that is already gone is not an error."""
        with self.lock:
            # Remove directly instead of checking existence first: the file can
            # disappear between the check and the removal.
            try:
                os.remove(self.file_path)
            except FileNotFoundError:
                # Already absent, which is the desired end state.
                pass
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user