diff --git a/CHANGELOG.md b/CHANGELOG.md index 0eeda7a50..b05ff4247 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ -## 0.10.25-dev0 +## 0.10.25-dev1 ### Enhancements ### Features +* **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This feature requires a valid AWS bedrock setup and an internet connection to run. + ### Fixes ## 0.10.24 diff --git a/docs/requirements.txt b/docs/requirements.txt index d2834bd86..8092f581e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/build.in +# pip-compile --constraint=requirements/constraints.in requirements/build.in # alabaster==0.7.13 # via sphinx @@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via # -r requirements/build.in # sphinx -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/docs/source/bricks/embedding.rst b/docs/source/bricks/embedding.rst index 42a569ca5..450f6aa6e 100644 --- a/docs/source/bricks/embedding.rst +++ b/docs/source/bricks/embedding.rst @@ -45,14 +45,69 @@ To obtain an api key, visit: https://platform.openai.com/account/api-keys from unstructured.documents.elements import Text from unstructured.embed.openai import OpenAIEmbeddingEncoder + # Initialize the encoder with OpenAI credentials embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"]) + + # Embed a list of Elements elements = embedding_encoder.embed_documents( elements=[Text("This is sentence 1"), Text("This is sentence 2")], ) + # Embed a single query string query = "This is the query" query_embedding = embedding_encoder.embed_query(query=query) + # Print embeddings [print(e.embeddings, e) for e in elements] print(query_embedding, query) 
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions()) + +``BedrockEmbeddingEncoder`` +--------------------------- + +The ``BedrockEmbeddingEncoder`` class provides an interface to obtain embeddings for text using the Bedrock embeddings via the langchain integration. It connects to the Bedrock Runtime using AWS's boto3 package. + +Key methods and attributes include: + +``embed_documents``: This method takes a list of Elements as its input and returns the same list with an updated embeddings attribute for each Element. + +``embed_query``: This method takes a query as a string and returns the embedding vector for the given query string. + +``num_of_dimensions``: A metadata property that signifies the number of dimensions in any embedding vector obtained via this class. + +``is_unit_vector``: A metadata property that checks if embedding vectors obtained via this class are unit vectors. + +Initialization: +To create an instance of the ``BedrockEmbeddingEncoder``, AWS credentials and the region name are required. + +.. code:: python + + import os + + from unstructured.documents.elements import Text + from unstructured.embed.bedrock import BedrockEmbeddingEncoder + + # Initialize the encoder with AWS credentials + embedding_encoder = BedrockEmbeddingEncoder( + aws_access_key_id="YOUR_AWS_ACCESS_KEY_ID", + aws_secret_access_key="YOUR_AWS_SECRET_ACCESS_KEY", + region_name="us-west-2" + ) + + # Embed a list of Elements + elements = embedding_encoder.embed_documents( + elements=[Text("Sentence A"), Text("Sentence B")] + ) + + # Embed a single query string + query = "Example query" + query_embedding = embedding_encoder.embed_query(query=query) + + # Print embeddings + [print(e.embeddings, e) for e in elements] + print(query_embedding, query) + print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions()) + + +Dependencies: +This class relies on several dependencies which include boto3, numpy, and langchain. 
Ensure these are installed and available in the environment where this class is utilized. \ No newline at end of file diff --git a/requirements/base.txt b/requirements/base.txt index 6ef4347a2..4be522c96 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/base.in +# pip-compile --constraint=requirements/constraints.in requirements/base.in # backoff==2.2.1 # via -r requirements/base.in @@ -66,7 +66,7 @@ typing-extensions==4.8.0 # via typing-inspect typing-inspect==0.9.0 # via dataclasses-json -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/constraints.in # requests diff --git a/requirements/build.txt b/requirements/build.txt index d2834bd86..8092f581e 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/build.in +# pip-compile --constraint=requirements/constraints.in requirements/build.in # alabaster==0.7.13 # via sphinx @@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5 # via # -r requirements/build.in # sphinx -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/constraints.in b/requirements/constraints.in index c76c72a52..f6bf2b4aa 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -5,6 +5,8 @@ #################################################################################################### # NOTE(alan): Pinning to avoid conflicts with downstream ingest-s3 urllib3<1.27, >=1.25.4 +boto3<1.28.18 +botocore<1.31.18 # consistency with local-inference-pin protobuf<4.24 # NOTE(robinson) - Required pins for security scans diff --git a/requirements/dev.txt b/requirements/dev.txt index d7062469d..a26cf65d4 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt 
@@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/dev.in +# pip-compile --constraint=requirements/constraints.in requirements/dev.in # anyio==3.7.1 # via @@ -213,7 +213,7 @@ nest-asyncio==1.5.8 # via ipykernel nodeenv==1.8.0 # via pre-commit -notebook==7.0.5 +notebook==7.0.6 # via jupyter notebook-shim==0.2.3 # via @@ -390,7 +390,7 @@ typing-extensions==4.8.0 # ipython uri-template==1.3.0 # via jsonschema -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index b015ffa6f..2f3f1775a 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-csv.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-csv.in # numpy==1.24.4 # via diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 749c48c81..cdaba2489 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-docx.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-docx.in # lxml==4.9.3 # via diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index f16a8c905..9337a7729 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-epub.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-epub.in # ebooklib==0.18 # via -r requirements/extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 
ec0989c5e..468f69916 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-markdown.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-markdown.in # importlib-metadata==6.8.0 # via markdown diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt index 2ab29747e..e6a57a63f 100644 --- a/requirements/extra-msg.txt +++ b/requirements/extra-msg.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-msg.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-msg.in # msg-parser==1.2.0 # via -r requirements/extra-msg.in diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 22492ce52..25afb1a67 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-odt.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-odt.in # lxml==4.9.3 # via diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 9706a826d..0286533d5 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-paddleocr.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-paddleocr.in # attrdict==2.0.1 # via unstructured-paddleocr @@ -35,7 +35,7 @@ cssutils==2.9.0 # via premailer cycler==0.12.1 # via matplotlib -cython==3.0.3 +cython==3.0.4 # via unstructured-paddleocr et-xmlfile==1.1.0 # via openpyxl @@ -213,7 +213,7 @@ tzdata==2023.3 # via pandas unstructured-paddleocr==2.6.1.3 # via 
-r requirements/extra-paddleocr.in -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index aa0bc7b95..d79b36307 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-pandoc.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-pandoc.in # pypandoc==1.12 # via -r requirements/extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 9eb226303..861479336 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-pdf-image.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-pdf-image.in # antlr4-python3-runtime==4.9.3 # via omegaconf @@ -223,7 +223,7 @@ tqdm==4.66.1 # huggingface-hub # iopath # transformers -transformers==4.34.0 +transformers==4.34.1 # via unstructured-inference typing-extensions==4.8.0 # via @@ -234,13 +234,13 @@ typing-extensions==4.8.0 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.7.5 +unstructured-inference==0.7.7 # via -r requirements/extra-pdf-image.in unstructured-pytesseract==0.3.12 # via # -c requirements/constraints.in # -r requirements/extra-pdf-image.in -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 2ecd1b4ce..062f11a00 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-pptx.in +# 
pip-compile --constraint=requirements/constraints.in requirements/extra-pptx.in # lxml==4.9.3 # via python-pptx diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 2e47d267c..5fc3cdbe5 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/extra-xlsx.in +# pip-compile --constraint=requirements/constraints.in requirements/extra-xlsx.in # et-xmlfile==1.1.0 # via openpyxl diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index af3d1dd0e..1ab233286 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/huggingface.in +# pip-compile --constraint=requirements/constraints.in requirements/huggingface.in # certifi==2023.7.22 # via @@ -102,14 +102,14 @@ tqdm==4.66.1 # huggingface-hub # sacremoses # transformers -transformers==4.34.0 +transformers==4.34.1 # via -r requirements/huggingface.in typing-extensions==4.8.0 # via # -c requirements/base.txt # huggingface-hub # torch -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-airtable.txt b/requirements/ingest-airtable.txt index 52467ffc7..122f32c4e 100644 --- a/requirements/ingest-airtable.txt +++ b/requirements/ingest-airtable.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-airtable.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-airtable.in # certifi==2023.7.22 # via @@ -34,7 +34,7 @@ typing-extensions==4.8.0 # -c requirements/base.txt # pyairtable # pydantic -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff 
--git a/requirements/ingest-azure-cognitive-search.txt b/requirements/ingest-azure-cognitive-search.txt index ced625eda..c3be136ed 100644 --- a/requirements/ingest-azure-cognitive-search.txt +++ b/requirements/ingest-azure-cognitive-search.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-azure-cognitive-search.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-azure-cognitive-search.in # azure-common==1.1.28 # via azure-search-documents @@ -50,7 +50,7 @@ typing-extensions==4.8.0 # -c requirements/base.txt # azure-core # azure-search-documents -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index d4e7acc4f..c621e4b12 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -2,9 +2,9 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-azure.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-azure.in # -adlfs==2023.9.0 +adlfs==2023.10.0 # via -r requirements/ingest-azure.in aiohttp==3.8.6 # via adlfs @@ -94,7 +94,7 @@ typing-extensions==4.8.0 # -c requirements/base.txt # azure-core # azure-storage-blob -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-bedrock.in b/requirements/ingest-bedrock.in new file mode 100644 index 000000000..3c3d1b070 --- /dev/null +++ b/requirements/ingest-bedrock.in @@ -0,0 +1,5 @@ +-c constraints.in +-c base.txt + +boto3 +langchain diff --git a/requirements/ingest-bedrock.txt b/requirements/ingest-bedrock.txt new file mode 100644 index 000000000..68bcc0eaf --- /dev/null +++ b/requirements/ingest-bedrock.txt @@ -0,0 +1,132 @@ +# +# This file is autogenerated by pip-compile with Python 
3.8 +# by the following command: +# +# pip-compile --constraint=requirements/constraints.in requirements/ingest-bedrock.in +# +aiohttp==3.8.6 + # via langchain +aiosignal==1.3.1 + # via aiohttp +anyio==3.7.1 + # via + # -c requirements/constraints.in + # langchain +async-timeout==4.0.3 + # via + # aiohttp + # langchain +attrs==23.1.0 + # via aiohttp +boto3==1.28.17 + # via + # -c requirements/constraints.in + # -r requirements/ingest-bedrock.in +botocore==1.31.17 + # via + # -c requirements/constraints.in + # boto3 + # s3transfer +certifi==2023.7.22 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +charset-normalizer==3.3.0 + # via + # -c requirements/base.txt + # aiohttp + # requests +dataclasses-json==0.6.1 + # via + # -c requirements/base.txt + # langchain +exceptiongroup==1.1.3 + # via anyio +frozenlist==1.4.0 + # via + # aiohttp + # aiosignal +idna==3.4 + # via + # -c requirements/base.txt + # anyio + # requests + # yarl +jmespath==1.0.1 + # via + # boto3 + # botocore +jsonpatch==1.33 + # via langchain +jsonpointer==2.4 + # via jsonpatch +langchain==0.0.317 + # via -r requirements/ingest-bedrock.in +langsmith==0.0.46 + # via langchain +marshmallow==3.20.1 + # via + # -c requirements/base.txt + # dataclasses-json +multidict==6.0.4 + # via + # aiohttp + # yarl +mypy-extensions==1.0.0 + # via + # -c requirements/base.txt + # typing-inspect +numpy==1.24.4 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # langchain +packaging==23.2 + # via + # -c requirements/base.txt + # marshmallow +pydantic==1.10.13 + # via + # -c requirements/constraints.in + # langchain + # langsmith +python-dateutil==2.8.2 + # via botocore +pyyaml==6.0.1 + # via langchain +requests==2.31.0 + # via + # -c requirements/base.txt + # langchain + # langsmith +s3transfer==0.6.2 + # via boto3 +six==1.16.0 + # via + # -c requirements/base.txt + # python-dateutil +sniffio==1.3.0 + # via anyio +sqlalchemy==2.0.22 + # via langchain 
+tenacity==8.2.3 + # via langchain +typing-extensions==4.8.0 + # via + # -c requirements/base.txt + # pydantic + # sqlalchemy + # typing-inspect +typing-inspect==0.9.0 + # via + # -c requirements/base.txt + # dataclasses-json +urllib3==1.26.18 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # botocore + # requests +yarl==1.9.2 + # via aiohttp diff --git a/requirements/ingest-biomed.txt b/requirements/ingest-biomed.txt index 069b11a6a..c287d597c 100644 --- a/requirements/ingest-biomed.txt +++ b/requirements/ingest-biomed.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-biomed.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-biomed.in # beautifulsoup4==4.12.2 # via diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index ee21de809..5fa92bb39 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-box.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-box.in # attrs==23.1.0 # via boxsdk boxfs==0.2.1 # via -r requirements/ingest-box.in -boxsdk[jwt]==3.9.1 +boxsdk[jwt]==3.9.2 # via boxfs certifi==2023.7.22 # via @@ -49,7 +49,7 @@ six==1.16.0 # via # -c requirements/base.txt # python-dateutil -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt index 64218b189..197504d4d 100644 --- a/requirements/ingest-confluence.txt +++ b/requirements/ingest-confluence.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-confluence.in +# pip-compile --constraint=requirements/constraints.in 
requirements/ingest-confluence.in # atlassian-python-api==3.41.2 # via -r requirements/ingest-confluence.in @@ -36,7 +36,7 @@ six==1.16.0 # via # -c requirements/base.txt # atlassian-python-api -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-delta-table.txt b/requirements/ingest-delta-table.txt index d3c45a04d..ae74deb76 100644 --- a/requirements/ingest-delta-table.txt +++ b/requirements/ingest-delta-table.txt @@ -2,9 +2,9 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-delta-table.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-delta-table.in # -deltalake==0.11.0 +deltalake==0.12.0 # via -r requirements/ingest-delta-table.in fsspec==2023.9.1 # via diff --git a/requirements/ingest-discord.txt b/requirements/ingest-discord.txt index 922afb131..852cfeb9d 100644 --- a/requirements/ingest-discord.txt +++ b/requirements/ingest-discord.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-discord.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-discord.in # aiohttp==3.8.6 # via discord-py diff --git a/requirements/ingest-dropbox.txt b/requirements/ingest-dropbox.txt index e3ad8ce8a..6e69a9076 100644 --- a/requirements/ingest-dropbox.txt +++ b/requirements/ingest-dropbox.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-dropbox.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-dropbox.in # certifi==2023.7.22 # via @@ -40,7 +40,7 @@ six==1.16.0 # stone stone==3.3.1 # via dropbox -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-elasticsearch.txt 
b/requirements/ingest-elasticsearch.txt index 7d3cb79d4..84bb4d090 100644 --- a/requirements/ingest-elasticsearch.txt +++ b/requirements/ingest-elasticsearch.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-elasticsearch.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-elasticsearch.in # certifi==2023.7.22 # via @@ -15,7 +15,7 @@ elasticsearch==8.10.1 # via -r requirements/ingest-elasticsearch.in jq==1.6.0 # via -r requirements/ingest-elasticsearch.in -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 76344f6c3..5814d7296 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-gcs.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-gcs.in # aiohttp==3.8.6 # via gcsfs @@ -105,7 +105,7 @@ soupsieve==2.5 # via # -c requirements/base.txt # beautifulsoup4 -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index e0eae5b2a..dd6fece5b 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-github.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-github.in # certifi==2023.7.22 # via @@ -47,7 +47,7 @@ typing-extensions==4.8.0 # via # -c requirements/base.txt # pygithub -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-gitlab.txt 
b/requirements/ingest-gitlab.txt index 02c9f868c..66f0799bc 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-gitlab.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-gitlab.in # certifi==2023.7.22 # via @@ -26,7 +26,7 @@ requests==2.31.0 # requests-toolbelt requests-toolbelt==1.0.0 # via python-gitlab -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index b11ae5729..bd36a74b3 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-google-drive.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-google-drive.in # cachetools==5.3.1 # via google-auth @@ -17,7 +17,7 @@ charset-normalizer==3.3.0 # requests google-api-core==2.12.0 # via google-api-python-client -google-api-python-client==2.103.0 +google-api-python-client==2.104.0 # via -r requirements/ingest-google-drive.in google-auth==2.23.3 # via @@ -59,7 +59,7 @@ rsa==4.9 # via google-auth uritemplate==4.1.1 # via google-api-python-client -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-jira.txt b/requirements/ingest-jira.txt index 3681c6b68..c56b7f624 100644 --- a/requirements/ingest-jira.txt +++ b/requirements/ingest-jira.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-jira.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in # atlassian-python-api==3.41.2 # via 
-r requirements/ingest-jira.in @@ -36,7 +36,7 @@ six==1.16.0 # via # -c requirements/base.txt # atlassian-python-api -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-notion.txt b/requirements/ingest-notion.txt index b200c2562..f12076057 100644 --- a/requirements/ingest-notion.txt +++ b/requirements/ingest-notion.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-notion.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-notion.in # anyio==3.7.1 # via diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index 7dd0eaa8f..bd0baa86f 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-onedrive.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-onedrive.in # beautifulsoup4==4.12.2 # via @@ -52,7 +52,7 @@ soupsieve==2.5 # via # -c requirements/base.txt # beautifulsoup4 -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index 84b0f9e0f..90b1df16c 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-openai.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-openai.in # aiohttp==3.8.6 # via @@ -50,9 +50,9 @@ jsonpatch==1.33 # via langchain jsonpointer==2.4 # via jsonpatch -langchain==0.0.315 +langchain==0.0.317 # via -r requirements/ingest-openai.in -langsmith==0.0.44 +langsmith==0.0.46 # via langchain marshmallow==3.20.1 # via 
@@ -117,7 +117,7 @@ typing-inspect==0.9.0 # via # -c requirements/base.txt # dataclasses-json -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index 225004c7b..b8a8aa5b1 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-outlook.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-outlook.in # certifi==2023.7.22 # via @@ -42,7 +42,7 @@ requests==2.31.0 # -c requirements/base.txt # msal # office365-rest-python-client -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index 6b34c0770..c895ab4d2 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-reddit.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-reddit.in # certifi==2023.7.22 # via @@ -28,7 +28,7 @@ requests==2.31.0 # update-checker update-checker==0.18.0 # via praw -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index f99195606..cb66d70fc 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-s3.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-s3.in # aiobotocore==2.5.4 # via s3fs @@ -19,7 +19,9 @@ async-timeout==4.0.3 attrs==23.1.0 # via aiohttp 
botocore==1.31.17 - # via aiobotocore + # via + # -c requirements/constraints.in + # aiobotocore charset-normalizer==3.3.0 # via # -c requirements/base.txt @@ -55,7 +57,7 @@ typing-extensions==4.8.0 # via # -c requirements/base.txt # aioitertools -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt index 55ded68d6..724141ba5 100644 --- a/requirements/ingest-salesforce.txt +++ b/requirements/ingest-salesforce.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-salesforce.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-salesforce.in # attrs==23.1.0 # via zeep @@ -66,7 +66,7 @@ six==1.16.0 # isodate # python-dateutil # requests-file -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt index 1c2c7f5f6..6388f4610 100644 --- a/requirements/ingest-sharepoint.txt +++ b/requirements/ingest-sharepoint.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-sharepoint.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-sharepoint.in # certifi==2023.7.22 # via @@ -42,7 +42,7 @@ requests==2.31.0 # -c requirements/base.txt # msal # office365-rest-python-client -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/ingest-slack.txt b/requirements/ingest-slack.txt index 43cb8f756..28ddf0f9a 100644 --- a/requirements/ingest-slack.txt +++ b/requirements/ingest-slack.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile 
requirements/ingest-slack.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-slack.in # slack-sdk==3.23.0 # via -r requirements/ingest-slack.in diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index bfefd071b..25d83aa41 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/ingest-wikipedia.in +# pip-compile --constraint=requirements/constraints.in requirements/ingest-wikipedia.in # beautifulsoup4==4.12.2 # via @@ -29,7 +29,7 @@ soupsieve==2.5 # via # -c requirements/base.txt # beautifulsoup4 -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/requirements/test.txt b/requirements/test.txt index 403c35e47..10ac13c3f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -2,13 +2,13 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile requirements/test.in +# pip-compile --constraint=requirements/constraints.in requirements/test.in # appdirs==1.4.4 # via label-studio-tools autoflake==2.2.1 # via -r requirements/test.in -black==23.9.1 +black==23.10.0 # via -r requirements/test.in certifi==2023.7.22 # via @@ -56,7 +56,7 @@ mccabe==0.7.0 # via flake8 multidict==6.0.4 # via yarl -mypy==1.6.0 +mypy==1.6.1 # via -r requirements/test.in mypy-extensions==1.0.0 # via @@ -132,7 +132,7 @@ typing-extensions==4.8.0 # black # mypy # pydantic -urllib3==1.26.17 +urllib3==1.26.18 # via # -c requirements/base.txt # -c requirements/constraints.in diff --git a/scripts/pip-compile.sh b/scripts/pip-compile.sh index 0bc23e11d..4891e121f 100755 --- a/scripts/pip-compile.sh +++ b/scripts/pip-compile.sh @@ -13,6 +13,6 @@ for file in requirements/*.in; do continue; fi; echo "running: pip-compile --upgrade $file" - pip-compile --upgrade "$file" 
from typing import List

import numpy as np

from unstructured.documents.elements import (
    Element,
)
from unstructured.embed.interfaces import BaseEmbeddingEncoder
from unstructured.ingest.error import EmbeddingEncoderConnectionError
from unstructured.utils import requires_dependencies


class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that delegates to langchain's ``BedrockEmbeddings``.

    The encoder builds a boto3 ``bedrock-runtime`` client from the supplied AWS
    credentials and wraps it in ``BedrockEmbeddings``. The client is created
    eagerly in ``__init__`` (via ``initialize``), and a single probe query is
    embedded at that time so that ``num_of_dimensions`` and ``is_unit_vector``
    can be answered without another remote call.
    """

    def __init__(
        self,
        aws_access_key_id: str,
        aws_secret_access_key: str,
        region_name: str = "us-west-2",
    ):
        """Store the AWS settings and immediately build the Bedrock client.

        Args:
            aws_access_key_id: AWS access key id passed to ``boto3.client``.
            aws_secret_access_key: AWS secret access key passed to ``boto3.client``.
            region_name: AWS region passed to ``boto3.client``.
        """
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.region_name = region_name
        self.initialize()

    def initialize(self):
        """Create (or reuse) the Bedrock embeddings client and keep it on the instance."""
        self.bedrock_client = self.get_bedrock_client()

    def num_of_dimensions(self) -> tuple:
        """Return the shape of the probe embedding recorded by ``get_bedrock_client``.

        The value is whatever ``np.shape`` yields for that array (a tuple), not a
        bare integer.
        """
        return np.shape(self.examplary_embedding)

    def is_unit_vector(self):
        """Return whether the probe embedding has an L2 norm close to 1.0."""
        return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0)

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a single query string and return the vector as a numpy array."""
        return np.array(self.bedrock_client.embed_query(query))

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed ``str(element)`` for every element and attach the vectors.

        Args:
            elements: Elements to embed; each is converted with ``str`` first.

        Returns:
            The same list that was passed in, with ``embeddings`` set on each element.
        """
        embeddings = self.bedrock_client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
        """Assign ``embeddings[i]`` to ``elements[i].embeddings`` in place.

        Args:
            elements: Elements to annotate.
            embeddings: One embedding per element, in the same order.

        Returns:
            The ``elements`` object that was passed in (mutated in place).

        Raises:
            ValueError: If the two sequences differ in length.
        """
        # Explicit check rather than `assert`: asserts are removed under `python -O`,
        # after which a length mismatch would surface as an IndexError or go unnoticed.
        if len(elements) != len(embeddings):
            raise ValueError(
                f"Got {len(embeddings)} embeddings for {len(elements)} elements; "
                "expected exactly one embedding per element.",
            )
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["boto3", "numpy", "langchain"],
        extras="bedrock",
    )
    def get_bedrock_client(self):
        """Return the ``BedrockEmbeddings`` client, building it on first use.

        When a client already exists on the instance it is returned unchanged.
        Otherwise a boto3 ``bedrock-runtime`` client is created from the stored
        credentials and wrapped in ``BedrockEmbeddings``.

        Side effect: on first construction, the query ``"Q"`` is embedded and the
        result is stored in ``self.examplary_embedding`` for later use by
        ``num_of_dimensions`` and ``is_unit_vector``.
        """
        existing_client = getattr(self, "bedrock_client", None)
        if existing_client is not None:
            return existing_client

        # delay import only when needed
        import boto3
        from langchain.embeddings import BedrockEmbeddings

        bedrock_runtime = boto3.client(
            service_name="bedrock-runtime",
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name,
        )

        bedrock_client = BedrockEmbeddings(client=bedrock_runtime)
        # Probe embedding: doubles as a connectivity check and as the sample vector
        # whose shape and norm the metadata methods report.
        self.examplary_embedding = np.array(bedrock_client.embed_query("Q"))
        return bedrock_client