mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 21:55:33 +00:00
Added AWS Bedrock embeddings (#1738)
Summary: Added support for AWS Bedrock embeddings. Leverages
"amazon.titan-tg1-large" for the embedding model.
Test
- find your aws secret access key and key id; make sure the account has
access to bedrock's titan embed model
- follow the instructions in
d5e797cd44/docs/source/bricks/embedding.rst (bedrockembeddingencoder)
---------
Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
Co-authored-by: Yao You <yao@unstructured.io>
Co-authored-by: Yao You <theyaoyou@gmail.com>
Co-authored-by: Ahmet Melek <ahmetmeleq@gmail.com>
This commit is contained in:
parent
98d54e3184
commit
b8f24ba67e
@ -1,9 +1,11 @@
|
||||
## 0.10.25-dev0
|
||||
## 0.10.25-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
* **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This feature requires a valid AWS bedrock setup and an internet connection to run.
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.10.24
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/build.in
|
||||
#
|
||||
alabaster==0.7.13
|
||||
# via sphinx
|
||||
@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5
|
||||
# via
|
||||
# -r requirements/build.in
|
||||
# sphinx
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -45,14 +45,69 @@ To obtain an api key, visit: https://platform.openai.com/account/api-keys
|
||||
from unstructured.documents.elements import Text
|
||||
from unstructured.embed.openai import OpenAIEmbeddingEncoder
|
||||
|
||||
# Initialize the encoder with OpenAI credentials
|
||||
embedding_encoder = OpenAIEmbeddingEncoder(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
# Embed a list of Elements
|
||||
elements = embedding_encoder.embed_documents(
|
||||
elements=[Text("This is sentence 1"), Text("This is sentence 2")],
|
||||
)
|
||||
|
||||
# Embed a single query string
|
||||
query = "This is the query"
|
||||
query_embedding = embedding_encoder.embed_query(query=query)
|
||||
|
||||
# Print embeddings
|
||||
[print(e.embeddings, e) for e in elements]
|
||||
print(query_embedding, query)
|
||||
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
|
||||
|
||||
``BedrockEmbeddingEncoder``
|
||||
--------------------------
|
||||
|
||||
The ``BedrockEmbeddingEncoder`` class provides an interface to obtain embeddings for text using the Bedrock embeddings via the langchain integration. It connects to the Bedrock Runtime using AWS's boto3 package.
|
||||
|
||||
Key methods and attributes include:
|
||||
|
||||
``embed_documents``: This function takes a list of Elements as its input and returns the same list with an updated embeddings attribute for each Element.
|
||||
|
||||
``embed_query``: This method takes a query as a string and returns the embedding vector for the given query string.
|
||||
|
||||
``num_of_dimensions``: A metadata property that signifies the number of dimensions in any embedding vector obtained via this class.
|
||||
|
||||
``is_unit_vector``: A metadata property that checks if embedding vectors obtained via this class are unit vectors.
|
||||
|
||||
Initialization:
|
||||
To create an instance of the ``BedrockEmbeddingEncoder``, AWS credentials and the region name are required.
|
||||
|
||||
.. code:: python
|
||||
|
||||
import os
|
||||
|
||||
from unstructured.documents.elements import Text
|
||||
from unstructured.embed.bedrock import BedrockEmbeddingEncoder
|
||||
|
||||
# Initialize the encoder with AWS credentials
|
||||
embedding_encoder = BedrockEmbeddingEncoder(
|
||||
aws_access_key_id="YOUR_AWS_ACCESS_KEY_ID",
|
||||
aws_secret_access_key="YOUR_AWS_SECRET_ACCESS_KEY",
|
||||
region_name="us-west-2"
|
||||
)
|
||||
|
||||
# Embed a list of Elements
|
||||
elements = embedding_encoder.embed_documents(
|
||||
elements=[Text("Sentence A"), Text("Sentence B")]
|
||||
)
|
||||
|
||||
# Embed a single query string
|
||||
query = "Example query"
|
||||
query_embedding = embedding_encoder.embed_query(query=query)
|
||||
|
||||
# Print embeddings
|
||||
[print(e.embeddings, e) for e in elements]
|
||||
print(query_embedding, query)
|
||||
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
|
||||
|
||||
|
||||
Dependencies:
|
||||
This class relies on several dependencies which include boto3, numpy, and langchain. Ensure these are installed and available in the environment where this class is utilized.
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/base.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/base.in
|
||||
#
|
||||
backoff==2.2.1
|
||||
# via -r requirements/base.in
|
||||
@ -66,7 +66,7 @@ typing-extensions==4.8.0
|
||||
# via typing-inspect
|
||||
typing-inspect==0.9.0
|
||||
# via dataclasses-json
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/build.in
|
||||
#
|
||||
alabaster==0.7.13
|
||||
# via sphinx
|
||||
@ -116,7 +116,7 @@ sphinxcontrib-serializinghtml==1.1.5
|
||||
# via
|
||||
# -r requirements/build.in
|
||||
# sphinx
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -5,6 +5,8 @@
|
||||
####################################################################################################
|
||||
# NOTE(alan): Pinning to avoid conflicts with downstream ingest-s3
|
||||
urllib3<1.27, >=1.25.4
|
||||
boto3<1.28.18
|
||||
botocore<1.31.18
|
||||
# consistency with local-inference-pin
|
||||
protobuf<4.24
|
||||
# NOTE(robinson) - Required pins for security scans
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/dev.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/dev.in
|
||||
#
|
||||
anyio==3.7.1
|
||||
# via
|
||||
@ -213,7 +213,7 @@ nest-asyncio==1.5.8
|
||||
# via ipykernel
|
||||
nodeenv==1.8.0
|
||||
# via pre-commit
|
||||
notebook==7.0.5
|
||||
notebook==7.0.6
|
||||
# via jupyter
|
||||
notebook-shim==0.2.3
|
||||
# via
|
||||
@ -390,7 +390,7 @@ typing-extensions==4.8.0
|
||||
# ipython
|
||||
uri-template==1.3.0
|
||||
# via jsonschema
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-csv.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-csv.in
|
||||
#
|
||||
numpy==1.24.4
|
||||
# via
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-docx.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-docx.in
|
||||
#
|
||||
lxml==4.9.3
|
||||
# via
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-epub.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-epub.in
|
||||
#
|
||||
ebooklib==0.18
|
||||
# via -r requirements/extra-epub.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-markdown.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-markdown.in
|
||||
#
|
||||
importlib-metadata==6.8.0
|
||||
# via markdown
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-msg.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-msg.in
|
||||
#
|
||||
msg-parser==1.2.0
|
||||
# via -r requirements/extra-msg.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-odt.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-odt.in
|
||||
#
|
||||
lxml==4.9.3
|
||||
# via
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-paddleocr.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-paddleocr.in
|
||||
#
|
||||
attrdict==2.0.1
|
||||
# via unstructured-paddleocr
|
||||
@ -35,7 +35,7 @@ cssutils==2.9.0
|
||||
# via premailer
|
||||
cycler==0.12.1
|
||||
# via matplotlib
|
||||
cython==3.0.3
|
||||
cython==3.0.4
|
||||
# via unstructured-paddleocr
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
@ -213,7 +213,7 @@ tzdata==2023.3
|
||||
# via pandas
|
||||
unstructured-paddleocr==2.6.1.3
|
||||
# via -r requirements/extra-paddleocr.in
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-pandoc.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-pandoc.in
|
||||
#
|
||||
pypandoc==1.12
|
||||
# via -r requirements/extra-pandoc.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-pdf-image.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-pdf-image.in
|
||||
#
|
||||
antlr4-python3-runtime==4.9.3
|
||||
# via omegaconf
|
||||
@ -223,7 +223,7 @@ tqdm==4.66.1
|
||||
# huggingface-hub
|
||||
# iopath
|
||||
# transformers
|
||||
transformers==4.34.0
|
||||
transformers==4.34.1
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
@ -234,13 +234,13 @@ typing-extensions==4.8.0
|
||||
# torch
|
||||
tzdata==2023.3
|
||||
# via pandas
|
||||
unstructured-inference==0.7.5
|
||||
unstructured-inference==0.7.7
|
||||
# via -r requirements/extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.12
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/extra-pdf-image.in
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-pptx.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-pptx.in
|
||||
#
|
||||
lxml==4.9.3
|
||||
# via python-pptx
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/extra-xlsx.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/extra-xlsx.in
|
||||
#
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/huggingface.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/huggingface.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -102,14 +102,14 @@ tqdm==4.66.1
|
||||
# huggingface-hub
|
||||
# sacremoses
|
||||
# transformers
|
||||
transformers==4.34.0
|
||||
transformers==4.34.1
|
||||
# via -r requirements/huggingface.in
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
# torch
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-airtable.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-airtable.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -34,7 +34,7 @@ typing-extensions==4.8.0
|
||||
# -c requirements/base.txt
|
||||
# pyairtable
|
||||
# pydantic
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-azure-cognitive-search.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-azure-cognitive-search.in
|
||||
#
|
||||
azure-common==1.1.28
|
||||
# via azure-search-documents
|
||||
@ -50,7 +50,7 @@ typing-extensions==4.8.0
|
||||
# -c requirements/base.txt
|
||||
# azure-core
|
||||
# azure-search-documents
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,9 +2,9 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-azure.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-azure.in
|
||||
#
|
||||
adlfs==2023.9.0
|
||||
adlfs==2023.10.0
|
||||
# via -r requirements/ingest-azure.in
|
||||
aiohttp==3.8.6
|
||||
# via adlfs
|
||||
@ -94,7 +94,7 @@ typing-extensions==4.8.0
|
||||
# -c requirements/base.txt
|
||||
# azure-core
|
||||
# azure-storage-blob
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
5
requirements/ingest-bedrock.in
Normal file
5
requirements/ingest-bedrock.in
Normal file
@ -0,0 +1,5 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
|
||||
boto3
|
||||
langchain
|
||||
132
requirements/ingest-bedrock.txt
Normal file
132
requirements/ingest-bedrock.txt
Normal file
@ -0,0 +1,132 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-bedrock.in
|
||||
#
|
||||
aiohttp==3.8.6
|
||||
# via langchain
|
||||
aiosignal==1.3.1
|
||||
# via aiohttp
|
||||
anyio==3.7.1
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# langchain
|
||||
async-timeout==4.0.3
|
||||
# via
|
||||
# aiohttp
|
||||
# langchain
|
||||
attrs==23.1.0
|
||||
# via aiohttp
|
||||
boto3==1.28.17
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/ingest-bedrock.in
|
||||
botocore==1.31.17
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# boto3
|
||||
# s3transfer
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
charset-normalizer==3.3.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# aiohttp
|
||||
# requests
|
||||
dataclasses-json==0.6.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# langchain
|
||||
exceptiongroup==1.1.3
|
||||
# via anyio
|
||||
frozenlist==1.4.0
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
idna==3.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# anyio
|
||||
# requests
|
||||
# yarl
|
||||
jmespath==1.0.1
|
||||
# via
|
||||
# boto3
|
||||
# botocore
|
||||
jsonpatch==1.33
|
||||
# via langchain
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.0.317
|
||||
# via -r requirements/ingest-bedrock.in
|
||||
langsmith==0.0.46
|
||||
# via langchain
|
||||
marshmallow==3.20.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# dataclasses-json
|
||||
multidict==6.0.4
|
||||
# via
|
||||
# aiohttp
|
||||
# yarl
|
||||
mypy-extensions==1.0.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# typing-inspect
|
||||
numpy==1.24.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# langchain
|
||||
packaging==23.2
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# marshmallow
|
||||
pydantic==1.10.13
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# langchain
|
||||
# langsmith
|
||||
python-dateutil==2.8.2
|
||||
# via botocore
|
||||
pyyaml==6.0.1
|
||||
# via langchain
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# langchain
|
||||
# langsmith
|
||||
s3transfer==0.6.2
|
||||
# via boto3
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
sniffio==1.3.0
|
||||
# via anyio
|
||||
sqlalchemy==2.0.22
|
||||
# via langchain
|
||||
tenacity==8.2.3
|
||||
# via langchain
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pydantic
|
||||
# sqlalchemy
|
||||
# typing-inspect
|
||||
typing-inspect==0.9.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# dataclasses-json
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# botocore
|
||||
# requests
|
||||
yarl==1.9.2
|
||||
# via aiohttp
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-biomed.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-biomed.in
|
||||
#
|
||||
beautifulsoup4==4.12.2
|
||||
# via
|
||||
|
||||
@ -2,13 +2,13 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-box.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-box.in
|
||||
#
|
||||
attrs==23.1.0
|
||||
# via boxsdk
|
||||
boxfs==0.2.1
|
||||
# via -r requirements/ingest-box.in
|
||||
boxsdk[jwt]==3.9.1
|
||||
boxsdk[jwt]==3.9.2
|
||||
# via boxfs
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -49,7 +49,7 @@ six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-confluence.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-confluence.in
|
||||
#
|
||||
atlassian-python-api==3.41.2
|
||||
# via -r requirements/ingest-confluence.in
|
||||
@ -36,7 +36,7 @@ six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# atlassian-python-api
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,9 +2,9 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-delta-table.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-delta-table.in
|
||||
#
|
||||
deltalake==0.11.0
|
||||
deltalake==0.12.0
|
||||
# via -r requirements/ingest-delta-table.in
|
||||
fsspec==2023.9.1
|
||||
# via
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-discord.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-discord.in
|
||||
#
|
||||
aiohttp==3.8.6
|
||||
# via discord-py
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-dropbox.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-dropbox.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -40,7 +40,7 @@ six==1.16.0
|
||||
# stone
|
||||
stone==3.3.1
|
||||
# via dropbox
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-elasticsearch.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-elasticsearch.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -15,7 +15,7 @@ elasticsearch==8.10.1
|
||||
# via -r requirements/ingest-elasticsearch.in
|
||||
jq==1.6.0
|
||||
# via -r requirements/ingest-elasticsearch.in
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-gcs.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-gcs.in
|
||||
#
|
||||
aiohttp==3.8.6
|
||||
# via gcsfs
|
||||
@ -105,7 +105,7 @@ soupsieve==2.5
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# beautifulsoup4
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-github.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-github.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -47,7 +47,7 @@ typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pygithub
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-gitlab.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-gitlab.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -26,7 +26,7 @@ requests==2.31.0
|
||||
# requests-toolbelt
|
||||
requests-toolbelt==1.0.0
|
||||
# via python-gitlab
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-google-drive.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-google-drive.in
|
||||
#
|
||||
cachetools==5.3.1
|
||||
# via google-auth
|
||||
@ -17,7 +17,7 @@ charset-normalizer==3.3.0
|
||||
# requests
|
||||
google-api-core==2.12.0
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.103.0
|
||||
google-api-python-client==2.104.0
|
||||
# via -r requirements/ingest-google-drive.in
|
||||
google-auth==2.23.3
|
||||
# via
|
||||
@ -59,7 +59,7 @@ rsa==4.9
|
||||
# via google-auth
|
||||
uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-jira.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-jira.in
|
||||
#
|
||||
atlassian-python-api==3.41.2
|
||||
# via -r requirements/ingest-jira.in
|
||||
@ -36,7 +36,7 @@ six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# atlassian-python-api
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-notion.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-notion.in
|
||||
#
|
||||
anyio==3.7.1
|
||||
# via
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-onedrive.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-onedrive.in
|
||||
#
|
||||
beautifulsoup4==4.12.2
|
||||
# via
|
||||
@ -52,7 +52,7 @@ soupsieve==2.5
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# beautifulsoup4
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-openai.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-openai.in
|
||||
#
|
||||
aiohttp==3.8.6
|
||||
# via
|
||||
@ -50,9 +50,9 @@ jsonpatch==1.33
|
||||
# via langchain
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.0.315
|
||||
langchain==0.0.317
|
||||
# via -r requirements/ingest-openai.in
|
||||
langsmith==0.0.44
|
||||
langsmith==0.0.46
|
||||
# via langchain
|
||||
marshmallow==3.20.1
|
||||
# via
|
||||
@ -117,7 +117,7 @@ typing-inspect==0.9.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# dataclasses-json
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-outlook.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-outlook.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -42,7 +42,7 @@ requests==2.31.0
|
||||
# -c requirements/base.txt
|
||||
# msal
|
||||
# office365-rest-python-client
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-reddit.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-reddit.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -28,7 +28,7 @@ requests==2.31.0
|
||||
# update-checker
|
||||
update-checker==0.18.0
|
||||
# via praw
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-s3.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-s3.in
|
||||
#
|
||||
aiobotocore==2.5.4
|
||||
# via s3fs
|
||||
@ -19,7 +19,9 @@ async-timeout==4.0.3
|
||||
attrs==23.1.0
|
||||
# via aiohttp
|
||||
botocore==1.31.17
|
||||
# via aiobotocore
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# aiobotocore
|
||||
charset-normalizer==3.3.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
@ -55,7 +57,7 @@ typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# aioitertools
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-salesforce.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-salesforce.in
|
||||
#
|
||||
attrs==23.1.0
|
||||
# via zeep
|
||||
@ -66,7 +66,7 @@ six==1.16.0
|
||||
# isodate
|
||||
# python-dateutil
|
||||
# requests-file
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-sharepoint.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-sharepoint.in
|
||||
#
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -42,7 +42,7 @@ requests==2.31.0
|
||||
# -c requirements/base.txt
|
||||
# msal
|
||||
# office365-rest-python-client
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-slack.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-slack.in
|
||||
#
|
||||
slack-sdk==3.23.0
|
||||
# via -r requirements/ingest-slack.in
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/ingest-wikipedia.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/ingest-wikipedia.in
|
||||
#
|
||||
beautifulsoup4==4.12.2
|
||||
# via
|
||||
@ -29,7 +29,7 @@ soupsieve==2.5
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# beautifulsoup4
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -2,13 +2,13 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/test.in
|
||||
# pip-compile --constraint=requirements/constraints.in requirements/test.in
|
||||
#
|
||||
appdirs==1.4.4
|
||||
# via label-studio-tools
|
||||
autoflake==2.2.1
|
||||
# via -r requirements/test.in
|
||||
black==23.9.1
|
||||
black==23.10.0
|
||||
# via -r requirements/test.in
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -56,7 +56,7 @@ mccabe==0.7.0
|
||||
# via flake8
|
||||
multidict==6.0.4
|
||||
# via yarl
|
||||
mypy==1.6.0
|
||||
mypy==1.6.1
|
||||
# via -r requirements/test.in
|
||||
mypy-extensions==1.0.0
|
||||
# via
|
||||
@ -132,7 +132,7 @@ typing-extensions==4.8.0
|
||||
# black
|
||||
# mypy
|
||||
# pydantic
|
||||
urllib3==1.26.17
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
|
||||
@ -13,6 +13,6 @@ for file in requirements/*.in; do
|
||||
continue;
|
||||
fi;
|
||||
echo "running: pip-compile --upgrade $file"
|
||||
pip-compile --upgrade "$file"
|
||||
pip-compile --upgrade "$file" -c requirements/constraints.in
|
||||
done
|
||||
cp requirements/build.txt docs/requirements.txt
|
||||
|
||||
1
setup.py
1
setup.py
@ -158,6 +158,7 @@ setup(
|
||||
"local-inference": all_doc_reqs,
|
||||
"paddleocr": load_requirements("requirements/extra-paddleocr.in"),
|
||||
"openai": load_requirements("requirements/ingest-openai.in"),
|
||||
"bedrock": load_requirements("requirements/ingest-bedrock.in"),
|
||||
},
|
||||
package_dir={"unstructured": "unstructured"},
|
||||
package_data={"unstructured": ["nlp/*.txt"]},
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.25-dev0" # pragma: no cover
|
||||
__version__ = "0.10.25-dev1" # pragma: no cover
|
||||
|
||||
72
unstructured/embed/bedrock.py
Normal file
72
unstructured/embed/bedrock.py
Normal file
@ -0,0 +1,72 @@
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
)
|
||||
from unstructured.embed.interfaces import BaseEmbeddingEncoder
|
||||
from unstructured.ingest.error import EmbeddingEncoderConnectionError
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder backed by AWS Bedrock via langchain's ``BedrockEmbeddings``.

    Constructing an instance immediately builds the Bedrock client (see
    ``get_bedrock_client``), which also embeds a one-character probe string and
    keeps the result in ``self.examplary_embedding``. ``num_of_dimensions`` and
    ``is_unit_vector`` are answered from that stored probe embedding.
    """

    def __init__(
        self,
        aws_access_key_id: str,
        aws_secret_access_key: str,
        region_name: str = "us-west-2",
    ):
        """Store the AWS credentials and region, then build the client.

        Args:
            aws_access_key_id: AWS access key id passed to ``boto3.client``.
            aws_secret_access_key: AWS secret access key passed to ``boto3.client``.
            region_name: AWS region passed to ``boto3.client``.
        """
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key
        self.region_name = region_name
        self.initialize()

    def initialize(self):
        """Create the Bedrock embeddings client and keep it on the instance."""
        self.bedrock_client = self.get_bedrock_client()

    def num_of_dimensions(self):
        """Return ``np.shape`` of the probe embedding (a tuple, not a bare int)."""
        return np.shape(self.examplary_embedding)

    def is_unit_vector(self):
        """Return whether the probe embedding's L2 norm is close to 1.0."""
        return np.isclose(np.linalg.norm(self.examplary_embedding), 1.0)

    def embed_query(self, query):
        """Embed a single query and return the result as a numpy array."""
        return np.array(self.bedrock_client.embed_query(query))

    def embed_documents(self, elements: List[Element]) -> List[Element]:
        """Embed the string form of each element and attach the vectors.

        Args:
            elements: Elements to embed; each is converted with ``str()``.

        Returns:
            The same ``elements`` object, each item carrying its embedding in
            its ``embeddings`` attribute.
        """
        embeddings = self.bedrock_client.embed_documents([str(e) for e in elements])
        return self._add_embeddings_to_elements(elements, embeddings)

    def _add_embeddings_to_elements(self, elements, embeddings) -> List[Element]:
        """Assign ``embeddings[i]`` to ``elements[i].embeddings`` in place.

        The elements are mutated and the very same ``elements`` object is
        returned, so no intermediate list is built.
        """
        assert len(elements) == len(embeddings)
        for element, embedding in zip(elements, embeddings):
            element.embeddings = embedding
        return elements

    @EmbeddingEncoderConnectionError.wrap
    @requires_dependencies(
        ["boto3", "numpy", "langchain"],
        extras="bedrock",
    )
    def get_bedrock_client(self):
        """Return the Bedrock embeddings client, creating it on first use.

        If ``self.bedrock_client`` is already set to a truthy value it is
        returned unchanged. Otherwise a ``boto3`` ``bedrock-runtime`` client is
        built from the stored credentials and region and wrapped in langchain's
        ``BedrockEmbeddings``.

        Side effect: on creation, the string ``"Q"`` is embedded and stored in
        ``self.examplary_embedding`` for later dimension/normalisation checks.
        NOTE(review): that probe presumably issues a real request to Bedrock,
        so valid credentials and connectivity are likely needed already at
        construction time - confirm.
        """
        if getattr(self, "bedrock_client", None):
            return self.bedrock_client

        # delay import only when needed
        import boto3
        from langchain.embeddings import BedrockEmbeddings

        bedrock_runtime = boto3.client(
            service_name="bedrock-runtime",
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            region_name=self.region_name,
        )

        bedrock_client = BedrockEmbeddings(client=bedrock_runtime)
        self.examplary_embedding = np.array(bedrock_client.embed_query("Q"))
        return bedrock_client
|
||||
Loading…
x
Reference in New Issue
Block a user