Added AWS Bedrock embeddings (#1738)

Summary: Added support for AWS Bedrock embeddings. Leverages
"amazon.titan-tg1-large" for the embedding model.

Test

- find your aws secret access key and key id; make sure the account has
access to bedrock's titan embed model
- follow the instructions in
d5e797cd44/docs/source/bricks/embedding.rst (bedrockembeddingencoder)

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
Co-authored-by: Yao You <yao@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Ahmet Melek <ahmetmeleq@gmail.com>
This commit is contained in:
Jack Retterer 2023-10-18 17:36:51 -07:00 committed by GitHub
parent 98d54e3184
commit b8f24ba67e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
51 changed files with 358 additions and 87 deletions

View File

@ -1,9 +1,11 @@
## 0.10.25-dev0
## 0.10.25-dev1
### Enhancements
### Features
* **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This feature requires a valid AWS bedrock setup and an internet connection to run.
### Fixes
## 0.10.24

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/build.in
# pip-compile --constraint=requirements/constraints.in requirements/build.in
#
alabaster==0.7.13
# via sphinx
@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5
# via
# -r requirements/build.in
# sphinx
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -45,14 +45,69 @@ To obtain an api key, visit: https://platform.openai.com/account/api-keys
from unstructured.documents.elements import Text
from unstructured.embed.openai import OpenAIEmbeddingEncoder
# Initialize the encoder with OpenAI credentials
embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"])
# Embed a list of Elements
elements = embedding_encoder.embed_documents(
elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)
# Embed a single query string
query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)
# Print embeddings
[print(e.embeddings, e) for e in elements]
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
``BedrockEmbeddingEncoder``
---------------------------
The ``BedrockEmbeddingEncoder`` class provides an interface to obtain embeddings for text using the Bedrock embeddings via the langchain integration. It connects to the Bedrock Runtime using AWS's boto3 package.
Key methods and attributes include:
``embed_documents``: This function takes a list of Elements as its input and returns the same list with an updated embeddings attribute for each Element.
``embed_query``: This method takes a query as a string and returns the embedding vector for the given query string.
``num_of_dimensions``: A metadata property that signifies the number of dimensions in any embedding vector obtained via this class.
``is_unit_vector``: A metadata property that checks if embedding vectors obtained via this class are unit vectors.
Initialization:
To create an instance of the ``BedrockEmbeddingEncoder``, AWS credentials and the region name are required.
.. code:: python
import os
from unstructured.documents.elements import Text
from unstructured.embed.bedrock import BedrockEmbeddingEncoder
# Initialize the encoder with AWS credentials
embedding_encoder = BedrockEmbeddingEncoder(
aws_access_key_id="YOUR_AWS_ACCESS_KEY_ID",
aws_secret_access_key="YOUR_AWS_SECRET_ACCESS_KEY",
region_name="us-west-2"
)
# Embed a list of Elements
elements = embedding_encoder.embed_documents(
elements=[Text("Sentence A"), Text("Sentence B")]
)
# Embed a single query string
query = "Example query"
query_embedding = embedding_encoder.embed_query(query=query)
# Print embeddings
[print(e.embeddings, e) for e in elements]
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
Dependencies:
This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized.

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/base.in
# pip-compile --constraint=requirements/constraints.in requirements/base.in
#
backoff==2.2.1
# via -r requirements/base.in
@ -66,7 +66,7 @@ typing-extensions==4.8.0
# via typing-inspect
typing-inspect==0.9.0
# via dataclasses-json
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/constraints.in
# requests

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/build.in
# pip-compile --constraint=requirements/constraints.in requirements/build.in
#
alabaster==0.7.13
# via sphinx
@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5
# via
# -r requirements/build.in
# sphinx
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -5,6 +5,8 @@
####################################################################################################
# NOTE(alan): Pinning to avoid conflicts with downstream ingest-s3
urllib3<1.27, >=1.25.4
boto3<1.28.18
botocore<1.31.18
# consistency with local-inference-pin
protobuf<4.24
# NOTE(robinson) - Required pins for security scans

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/dev.in
# pip-compile --constraint=requirements/constraints.in requirements/dev.in
#
anyio==3.7.1
# via
@ -213,7 +213,7 @@ nest-asyncio==1.5.8
# via ipykernel
nodeenv==1.8.0
# via pre-commit
notebook==7.0.5
notebook==7.0.6
# via jupyter
notebook-shim==0.2.3
# via
@ -390,7 +390,7 @@ typing-extensions==4.8.0
# ipython
uri-template==1.3.0
# via jsonschema
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-csv.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-csv.in
#
numpy==1.24.4
# via

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-docx.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-docx.in
#
lxml==4.9.3
# via

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-epub.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-epub.in
#
ebooklib==0.18
# via -r requirements/extra-epub.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-markdown.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-markdown.in
#
importlib-metadata==6.8.0
# via markdown

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-msg.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-msg.in
#
msg-parser==1.2.0
# via -r requirements/extra-msg.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-odt.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-odt.in
#
lxml==4.9.3
# via

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-paddleocr.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-paddleocr.in
#
attrdict==2.0.1
# via unstructured-paddleocr
@ -35,7 +35,7 @@ cssutils==2.9.0
# via premailer
cycler==0.12.1
# via matplotlib
cython==3.0.3
cython==3.0.4
# via unstructured-paddleocr
et-xmlfile==1.1.0
# via openpyxl
@ -213,7 +213,7 @@ tzdata==2023.3
# via pandas
unstructured-paddleocr==2.6.1.3
# via -r requirements/extra-paddleocr.in
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-pandoc.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-pandoc.in
#
pypandoc==1.12
# via -r requirements/extra-pandoc.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-pdf-image.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-pdf-image.in
#
antlr4-python3-runtime==4.9.3
# via omegaconf
@ -223,7 +223,7 @@ tqdm==4.66.1
# huggingface-hub
# iopath
# transformers
transformers==4.34.0
transformers==4.34.1
# via unstructured-inference
typing-extensions==4.8.0
# via
@ -234,13 +234,13 @@ typing-extensions==4.8.0
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.7.5
unstructured-inference==0.7.7
# via -r requirements/extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
# -c requirements/constraints.in
# -r requirements/extra-pdf-image.in
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-pptx.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-pptx.in
#
lxml==4.9.3
# via python-pptx

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/extra-xlsx.in
# pip-compile --constraint=requirements/constraints.in requirements/extra-xlsx.in
#
et-xmlfile==1.1.0
# via openpyxl

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/huggingface.in
# pip-compile --constraint=requirements/constraints.in requirements/huggingface.in
#
certifi==2023.7.22
# via
@ -102,14 +102,14 @@ tqdm==4.66.1
# huggingface-hub
# sacremoses
# transformers
transformers==4.34.0
transformers==4.34.1
# via -r requirements/huggingface.in
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# huggingface-hub
# torch
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-airtable.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-airtable.in
#
certifi==2023.7.22
# via
@ -34,7 +34,7 @@ typing-extensions==4.8.0
# -c requirements/base.txt
# pyairtable
# pydantic
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-azure-cognitive-search.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-azure-cognitive-search.in
#
azure-common==1.1.28
# via azure-search-documents
@ -50,7 +50,7 @@ typing-extensions==4.8.0
# -c requirements/base.txt
# azure-core
# azure-search-documents
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,9 +2,9 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-azure.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-azure.in
#
adlfs==2023.9.0
adlfs==2023.10.0
# via -r requirements/ingest-azure.in
aiohttp==3.8.6
# via adlfs
@ -94,7 +94,7 @@ typing-extensions==4.8.0
# -c requirements/base.txt
# azure-core
# azure-storage-blob
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -0,0 +1,5 @@
-c constraints.in
-c base.txt
boto3
langchain

View File

@ -0,0 +1,132 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest-bedrock.in
#
aiohttp==3.8.6
# via langchain
aiosignal==1.3.1
# via aiohttp
anyio==3.7.1
# via
# -c requirements/constraints.in
# langchain
async-timeout==4.0.3
# via
# aiohttp
# langchain
attrs==23.1.0
# via aiohttp
boto3==1.28.17
# via
# -c requirements/constraints.in
# -r requirements/ingest-bedrock.in
botocore==1.31.17
# via
# -c requirements/constraints.in
# boto3
# s3transfer
certifi==2023.7.22
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
# aiohttp
# requests
dataclasses-json==0.6.1
# via
# -c requirements/base.txt
# langchain
exceptiongroup==1.1.3
# via anyio
frozenlist==1.4.0
# via
# aiohttp
# aiosignal
idna==3.4
# via
# -c requirements/base.txt
# anyio
# requests
# yarl
jmespath==1.0.1
# via
# boto3
# botocore
jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.317
# via -r requirements/ingest-bedrock.in
langsmith==0.0.46
# via langchain
marshmallow==3.20.1
# via
# -c requirements/base.txt
# dataclasses-json
multidict==6.0.4
# via
# aiohttp
# yarl
mypy-extensions==1.0.0
# via
# -c requirements/base.txt
# typing-inspect
numpy==1.24.4
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# langchain
packaging==23.2
# via
# -c requirements/base.txt
# marshmallow
pydantic==1.10.13
# via
# -c requirements/constraints.in
# langchain
# langsmith
python-dateutil==2.8.2
# via botocore
pyyaml==6.0.1
# via langchain
requests==2.31.0
# via
# -c requirements/base.txt
# langchain
# langsmith
s3transfer==0.6.2
# via boto3
six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
sniffio==1.3.0
# via anyio
sqlalchemy==2.0.22
# via langchain
tenacity==8.2.3
# via langchain
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# pydantic
# sqlalchemy
# typing-inspect
typing-inspect==0.9.0
# via
# -c requirements/base.txt
# dataclasses-json
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# botocore
# requests
yarl==1.9.2
# via aiohttp

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-biomed.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-biomed.in
#
beautifulsoup4==4.12.2
# via

View File

@ -2,13 +2,13 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-box.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-box.in
#
attrs==23.1.0
# via boxsdk
boxfs==0.2.1
# via -r requirements/ingest-box.in
boxsdk[jwt]==3.9.1
boxsdk[jwt]==3.9.2
# via boxfs
certifi==2023.7.22
# via
@ -49,7 +49,7 @@ six==1.16.0
# via
# -c requirements/base.txt
# python-dateutil
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-confluence.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
#
atlassian-python-api==3.41.2
# via -r requirements/ingest-confluence.in
@ -36,7 +36,7 @@ six==1.16.0
# via
# -c requirements/base.txt
# atlassian-python-api
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,9 +2,9 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-delta-table.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-delta-table.in
#
deltalake==0.11.0
deltalake==0.12.0
# via -r requirements/ingest-delta-table.in
fsspec==2023.9.1
# via

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-discord.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-discord.in
#
aiohttp==3.8.6
# via discord-py

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-dropbox.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-dropbox.in
#
certifi==2023.7.22
# via
@ -40,7 +40,7 @@ six==1.16.0
# stone
stone==3.3.1
# via dropbox
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-elasticsearch.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-elasticsearch.in
#
certifi==2023.7.22
# via
@ -15,7 +15,7 @@ elasticsearch==8.10.1
# via -r requirements/ingest-elasticsearch.in
jq==1.6.0
# via -r requirements/ingest-elasticsearch.in
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-gcs.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-gcs.in
#
aiohttp==3.8.6
# via gcsfs
@ -105,7 +105,7 @@ soupsieve==2.5
# via
# -c requirements/base.txt
# beautifulsoup4
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-github.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-github.in
#
certifi==2023.7.22
# via
@ -47,7 +47,7 @@ typing-extensions==4.8.0
# via
# -c requirements/base.txt
# pygithub
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-gitlab.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-gitlab.in
#
certifi==2023.7.22
# via
@ -26,7 +26,7 @@ requests==2.31.0
# requests-toolbelt
requests-toolbelt==1.0.0
# via python-gitlab
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-google-drive.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-google-drive.in
#
cachetools==5.3.1
# via google-auth
@ -17,7 +17,7 @@ charset-normalizer==3.3.0
# requests
google-api-core==2.12.0
# via google-api-python-client
google-api-python-client==2.103.0
google-api-python-client==2.104.0
# via -r requirements/ingest-google-drive.in
google-auth==2.23.3
# via
@ -59,7 +59,7 @@ rsa==4.9
# via google-auth
uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-jira.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
#
atlassian-python-api==3.41.2
# via -r requirements/ingest-jira.in
@ -36,7 +36,7 @@ six==1.16.0
# via
# -c requirements/base.txt
# atlassian-python-api
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-notion.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-notion.in
#
anyio==3.7.1
# via

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-onedrive.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-onedrive.in
#
beautifulsoup4==4.12.2
# via
@ -52,7 +52,7 @@ soupsieve==2.5
# via
# -c requirements/base.txt
# beautifulsoup4
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-openai.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-openai.in
#
aiohttp==3.8.6
# via
@ -50,9 +50,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.315
langchain==0.0.317
# via -r requirements/ingest-openai.in
langsmith==0.0.44
langsmith==0.0.46
# via langchain
marshmallow==3.20.1
# via
@ -117,7 +117,7 @@ typing-inspect==0.9.0
# via
# -c requirements/base.txt
# dataclasses-json
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-outlook.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-outlook.in
#
certifi==2023.7.22
# via
@ -42,7 +42,7 @@ requests==2.31.0
# -c requirements/base.txt
# msal
# office365-rest-python-client
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-reddit.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-reddit.in
#
certifi==2023.7.22
# via
@ -28,7 +28,7 @@ requests==2.31.0
# update-checker
update-checker==0.18.0
# via praw
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-s3.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-s3.in
#
aiobotocore==2.5.4
# via s3fs
@ -19,7 +19,9 @@ async-timeout==4.0.3
attrs==23.1.0
# via aiohttp
botocore==1.31.17
# via aiobotocore
# via
# -c requirements/constraints.in
# aiobotocore
charset-normalizer==3.3.0
# via
# -c requirements/base.txt
@ -55,7 +57,7 @@ typing-extensions==4.8.0
# via
# -c requirements/base.txt
# aioitertools
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-salesforce.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-salesforce.in
#
attrs==23.1.0
# via zeep
@ -66,7 +66,7 @@ six==1.16.0
# isodate
# python-dateutil
# requests-file
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-sharepoint.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-sharepoint.in
#
certifi==2023.7.22
# via
@ -42,7 +42,7 @@ requests==2.31.0
# -c requirements/base.txt
# msal
# office365-rest-python-client
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-slack.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-slack.in
#
slack-sdk==3.23.0
# via -r requirements/ingest-slack.in

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-wikipedia.in
# pip-compile --constraint=requirements/constraints.in requirements/ingest-wikipedia.in
#
beautifulsoup4==4.12.2
# via
@ -29,7 +29,7 @@ soupsieve==2.5
# via
# -c requirements/base.txt
# beautifulsoup4
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -2,13 +2,13 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/test.in
# pip-compile --constraint=requirements/constraints.in requirements/test.in
#
appdirs==1.4.4
# via label-studio-tools
autoflake==2.2.1
# via -r requirements/test.in
black==23.9.1
black==23.10.0
# via -r requirements/test.in
certifi==2023.7.22
# via
@ -56,7 +56,7 @@ mccabe==0.7.0
# via flake8
multidict==6.0.4
# via yarl
mypy==1.6.0
mypy==1.6.1
# via -r requirements/test.in
mypy-extensions==1.0.0
# via
@ -132,7 +132,7 @@ typing-extensions==4.8.0
# black
# mypy
# pydantic
urllib3==1.26.17
urllib3==1.26.18
# via
# -c requirements/base.txt
# -c requirements/constraints.in

View File

@ -13,6 +13,6 @@ for file in requirements/*.in; do
continue;
fi;
echo "running: pip-compile --upgrade $file"
pip-compile --upgrade "$file"
pip-compile --upgrade "$file" -c requirements/constraints.in
done
cp requirements/build.txt docs/requirements.txt

View File

@ -158,6 +158,7 @@ setup(
"local-inference": all_doc_reqs,
"paddleocr": load_requirements("requirements/extra-paddleocr.in"),
"openai": load_requirements("requirements/ingest-openai.in"),
"bedrock": load_requirements("requirements/ingest-bedrock.in"),
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},

View File

@ -1 +1 @@
__version__ = "0.10.25-dev0" # pragma: no cover
__version__ = "0.10.25-dev1" # pragma: no cover

View File

@ -0,0 +1,72 @@
from typing import List
import numpy as np
from unstructured.documents.elements import (
Element,
)
from unstructured.embed.interfaces import BaseEmbeddingEncoder
from unstructured.ingest.error import EmbeddingEncoderConnectionError
from unstructured.utils import requires_dependencies
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder backed by langchain's ``BedrockEmbeddings`` client.

    The client is built on a boto3 ``bedrock-runtime`` client created from the
    credentials and region given to the constructor. Building the client also
    embeds a probe string and stores the result in ``self.examplary_embedding``,
    which ``num_of_dimensions`` and ``is_unit_vector`` report on.
    """

    def __init__(
        self,
        aws_access_key_id: str,
        aws_secret_access_key: str,
        region_name: str = "us-west-2",
    ):
        """Store the AWS credentials and region, then build the client.

        Args:
            aws_access_key_id: AWS access key id passed to ``boto3.client``.
            aws_secret_access_key: AWS secret access key passed to ``boto3.client``.
            region_name: AWS region passed to ``boto3.client``.
        """
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.region_name = region_name
        self.initialize()

    def initialize(self):
        """Create (or reuse) the Bedrock client and keep it on the instance."""
        self.bedrock_client = self.get_bedrock_client()

    def num_of_dimensions(self):
        """Return ``np.shape`` of the probe embedding (a tuple, not an int)."""
        return np.shape(self.examplary_embedding)

    def is_unit_vector(self):
        """Return whether the probe embedding's L2 norm is close to 1.0."""
        return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0)

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a single query and return the vector as a numpy array."""
        return np.array(self.bedrock_client.embed_query(query))

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed ``str(element)`` for each element and attach the vectors.

        The elements are mutated in place (their ``embeddings`` attribute is
        set) and the same list is returned.
        """
        embeddings = self.bedrock_client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
        """Assign ``embeddings[i]`` to ``elements[i].embeddings``; return ``elements``.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        # Kept as an assert so callers see the same exception type as before.
        assert len(elements) == len(embeddings)
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["boto3", "numpy", "langchain"],
        extras="bedrock",
    )
    def get_bedrock_client(self):
        """Return the cached client, or build a new ``BedrockEmbeddings`` client.

        Building a new client also issues one ``embed_query("Q")`` call and
        stores the resulting vector in ``self.examplary_embedding``.
        """
        # Reuse an already-built client instead of re-creating the session.
        if getattr(self, "bedrock_client", None):
            return self.bedrock_client

        # delay import only when needed
        import boto3
        from langchain.embeddings import BedrockEmbeddings

        bedrock_runtime = boto3.client(
            service_name="bedrock-runtime",
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name,
        )

        bedrock_client = BedrockEmbeddings(client=bedrock_runtime)
        # Probe embedding used by num_of_dimensions() and is_unit_vector().
        self.examplary_embedding = np.array(bedrock_client.embed_query("Q"))
        return bedrock_client