mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-17 11:09:41 +00:00
add mongo db destination connector (#2068)
### Description This adds the basic implementation of pushing the generated json output of partition to mongodb. None of this code provisions the mondo db instance so things like adding a search index around the embedding content must be done by the user. Any sort of schema validation would also have to take place via user-specific configuration on the database. This update makes no assumptions about the configuration of the database itself.
This commit is contained in:
parent
ead2a7f1eb
commit
b8af2f18bb
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -446,6 +446,8 @@ jobs:
|
||||
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
|
||||
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
MONGODB_URI: ${{ secrets.MONGODB_URI }}
|
||||
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
|
||||
TABLE_OCR: "tesseract"
|
||||
OCR_AGENT: "tesseract"
|
||||
CI: "true"
|
||||
|
@ -8,6 +8,7 @@
|
||||
### Features
|
||||
|
||||
* **Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation.
|
||||
* **MongoDB Destination Connector** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.
|
||||
|
||||
### Fixes
|
||||
|
||||
|
68
docs/source/ingest/destination_connectors/mongodb.rst
Normal file
68
docs/source/ingest/destination_connectors/mongodb.rst
Normal file
@ -0,0 +1,68 @@
|
||||
MongoDB
|
||||
======================
|
||||
|
||||
Batch process all your records using ``unstructured-ingest`` to store structured outputs locally on your filesystem and upload those local files to an MongoDB collection.
|
||||
|
||||
First you'll need to install the MongoDB dependencies as shown here.
|
||||
|
||||
.. code:: shell
|
||||
|
||||
pip install "unstructured[mongodb]"
|
||||
|
||||
Run Locally
|
||||
-----------
|
||||
The upstream connector can be any of the ones supported, but for convenience here, showing a sample command using the
|
||||
upstream local connector.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. tab:: Shell
|
||||
|
||||
.. code:: shell
|
||||
|
||||
unstructured-ingest \
|
||||
local \
|
||||
--input-path example-docs/fake-memo.pdf \
|
||||
--anonymous \
|
||||
--output-dir local-output-to-mongo \
|
||||
--num-processes 2 \
|
||||
--verbose \
|
||||
--strategy fast \
|
||||
mongodb \
|
||||
--uri "$MONGODB_URI" \
|
||||
--database "$MONGODB_DATABASE_NAME" \
|
||||
--collection "$DESTINATION_MONGO_COLLECTION"
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
.. code:: python
|
||||
|
||||
import os
|
||||
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig
|
||||
from unstructured.ingest.runner import LocalRunner
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = LocalRunner(
|
||||
processor_config=ProcessorConfig(
|
||||
verbose=True,
|
||||
output_dir="local-output-to-mongo",
|
||||
num_processes=2,
|
||||
),
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(),
|
||||
writer_type="mongodb",
|
||||
writer_kwargs={
|
||||
"uri": os.getenv("MONGODB_URI"),
|
||||
"database": os.getenv("MONGODB_DATABASE_NAME"),
|
||||
"collection": os.getenv("DESTINATION_MONGO_COLLECTION")
|
||||
}
|
||||
)
|
||||
runner.run(
|
||||
input_path="example-docs/fake-memo.pdf",
|
||||
)
|
||||
|
||||
|
||||
For a full list of the options the CLI accepts check ``unstructured-ingest <upstream connector> mongodb --help``.
|
||||
|
||||
NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you're running this locally. You can find more information about this in the `installation guide <https://unstructured-io.github.io/unstructured/installing.html>`_.
|
@ -18,7 +18,7 @@ charset-normalizer==3.3.2
|
||||
# via requests
|
||||
click==8.1.7
|
||||
# via nltk
|
||||
dataclasses-json==0.6.1
|
||||
dataclasses-json==0.6.2
|
||||
# via -r base.in
|
||||
emoji==2.8.0
|
||||
# via -r base.in
|
||||
|
@ -180,7 +180,7 @@ jupyterlab==4.0.8
|
||||
# via notebook
|
||||
jupyterlab-pygments==0.2.2
|
||||
# via nbconvert
|
||||
jupyterlab-server==2.25.0
|
||||
jupyterlab-server==2.25.1
|
||||
# via
|
||||
# jupyterlab
|
||||
# notebook
|
||||
@ -253,7 +253,7 @@ pre-commit==3.5.0
|
||||
# via -r dev.in
|
||||
prometheus-client==0.18.0
|
||||
# via jupyter-server
|
||||
prompt-toolkit==3.0.39
|
||||
prompt-toolkit==3.0.40
|
||||
# via
|
||||
# ipython
|
||||
# jupyter-console
|
||||
@ -340,7 +340,7 @@ soupsieve==2.5
|
||||
# beautifulsoup4
|
||||
stack-data==0.6.3
|
||||
# via ipython
|
||||
terminado==0.17.1
|
||||
terminado==0.18.0
|
||||
# via
|
||||
# jupyter-server
|
||||
# jupyter-server-terminals
|
||||
|
@ -8,7 +8,7 @@ attrdict==2.0.1
|
||||
# via unstructured-paddleocr
|
||||
babel==2.13.1
|
||||
# via flask-babel
|
||||
bce-python-sdk==0.8.92
|
||||
bce-python-sdk==0.8.96
|
||||
# via visualdl
|
||||
blinker==1.7.0
|
||||
# via flask
|
||||
|
@ -154,7 +154,7 @@ pyparsing==3.0.9
|
||||
# via
|
||||
# -c constraints.in
|
||||
# matplotlib
|
||||
pypdfium2==4.23.1
|
||||
pypdfium2==4.24.0
|
||||
# via pdfplumber
|
||||
pytesseract==0.3.10
|
||||
# via layoutparser
|
||||
|
@ -7,16 +7,13 @@
|
||||
azure-common==1.1.28
|
||||
# via azure-search-documents
|
||||
azure-core==1.29.5
|
||||
# via
|
||||
# azure-search-documents
|
||||
# msrest
|
||||
azure-search-documents==11.3.0
|
||||
# via azure-search-documents
|
||||
azure-search-documents==11.4.0
|
||||
# via -r ingest/azure-cognitive-search.in
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# msrest
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via
|
||||
@ -27,19 +24,11 @@ idna==3.4
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
isodate==0.6.1
|
||||
# via msrest
|
||||
msrest==0.7.1
|
||||
# via azure-search-documents
|
||||
oauthlib==3.2.2
|
||||
# via requests-oauthlib
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# azure-core
|
||||
# msrest
|
||||
# requests-oauthlib
|
||||
requests-oauthlib==1.3.1
|
||||
# via msrest
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
@ -49,7 +38,6 @@ typing-extensions==4.8.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# azure-core
|
||||
# azure-search-documents
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
|
@ -23,7 +23,7 @@ azure-datalake-store==0.0.53
|
||||
# via adlfs
|
||||
azure-identity==1.15.0
|
||||
# via adlfs
|
||||
azure-storage-blob==12.18.3
|
||||
azure-storage-blob==12.19.0
|
||||
# via adlfs
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
|
@ -41,5 +41,5 @@ urllib3==1.26.18
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# requests
|
||||
wrapt==1.15.0
|
||||
wrapt==1.16.0
|
||||
# via deprecated
|
||||
|
@ -15,5 +15,5 @@ numpy==1.24.4
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# pyarrow
|
||||
pyarrow==14.0.0
|
||||
pyarrow==14.0.1
|
||||
# via deltalake
|
||||
|
@ -11,7 +11,7 @@ certifi==2023.7.22
|
||||
# elastic-transport
|
||||
elastic-transport==8.10.0
|
||||
# via elasticsearch
|
||||
elasticsearch==8.10.1
|
||||
elasticsearch==8.11.0
|
||||
# via -r ingest/elasticsearch.in
|
||||
jq==1.6.0
|
||||
# via -r ingest/elasticsearch.in
|
||||
|
@ -37,7 +37,7 @@ charset-normalizer==3.3.2
|
||||
# -c ingest/../base.txt
|
||||
# aiohttp
|
||||
# requests
|
||||
dataclasses-json==0.6.1
|
||||
dataclasses-json==0.6.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# langchain
|
||||
@ -47,8 +47,6 @@ frozenlist==1.4.0
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
greenlet==3.0.1
|
||||
# via sqlalchemy
|
||||
idna==3.4
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
@ -63,9 +61,9 @@ jsonpatch==1.33
|
||||
# via langchain
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.0.331
|
||||
langchain==0.0.335
|
||||
# via -r ingest/embed-aws-bedrock.in
|
||||
langsmith==0.0.60
|
||||
langsmith==0.0.63
|
||||
# via langchain
|
||||
marshmallow==3.20.1
|
||||
# via
|
||||
|
@ -32,7 +32,7 @@ click==8.1.7
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# nltk
|
||||
dataclasses-json==0.6.1
|
||||
dataclasses-json==0.6.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# langchain
|
||||
@ -52,8 +52,6 @@ fsspec==2023.9.1
|
||||
# -c ingest/../constraints.in
|
||||
# huggingface-hub
|
||||
# torch
|
||||
greenlet==3.0.1
|
||||
# via sqlalchemy
|
||||
huggingface==0.0.1
|
||||
# via -r ingest/embed-huggingface.in
|
||||
huggingface-hub==0.17.3
|
||||
@ -78,9 +76,9 @@ jsonpatch==1.33
|
||||
# via langchain
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.0.331
|
||||
langchain==0.0.335
|
||||
# via -r ingest/embed-huggingface.in
|
||||
langsmith==0.0.60
|
||||
langsmith==0.0.63
|
||||
# via langchain
|
||||
markupsafe==2.1.3
|
||||
# via jinja2
|
||||
|
@ -32,7 +32,7 @@ charset-normalizer==3.3.2
|
||||
# -c ingest/../base.txt
|
||||
# aiohttp
|
||||
# requests
|
||||
dataclasses-json==0.6.1
|
||||
dataclasses-json==0.6.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# langchain
|
||||
@ -44,11 +44,9 @@ frozenlist==1.4.0
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
greenlet==3.0.1
|
||||
# via sqlalchemy
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
httpcore==1.0.1
|
||||
httpcore==1.0.2
|
||||
# via httpx
|
||||
httpx==0.25.1
|
||||
# via openai
|
||||
@ -63,9 +61,9 @@ jsonpatch==1.33
|
||||
# via langchain
|
||||
jsonpointer==2.4
|
||||
# via jsonpatch
|
||||
langchain==0.0.331
|
||||
langchain==0.0.335
|
||||
# via -r ingest/embed-openai.in
|
||||
langsmith==0.0.60
|
||||
langsmith==0.0.63
|
||||
# via langchain
|
||||
marshmallow==3.20.1
|
||||
# via
|
||||
@ -84,7 +82,7 @@ numpy==1.24.4
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# langchain
|
||||
openai==1.1.1
|
||||
openai==1.2.3
|
||||
# via -r ingest/embed-openai.in
|
||||
packaging==23.2
|
||||
# via
|
||||
|
@ -43,7 +43,7 @@ fsspec==2023.9.1
|
||||
# gcsfs
|
||||
gcsfs==2023.9.1
|
||||
# via -r ingest/gcs.in
|
||||
google-api-core==2.12.0
|
||||
google-api-core==2.14.0
|
||||
# via
|
||||
# google-cloud-core
|
||||
# google-cloud-storage
|
||||
|
@ -53,5 +53,5 @@ urllib3==1.26.18
|
||||
# -c ingest/../constraints.in
|
||||
# pygithub
|
||||
# requests
|
||||
wrapt==1.15.0
|
||||
wrapt==1.16.0
|
||||
# via deprecated
|
||||
|
@ -15,7 +15,7 @@ charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
google-api-core==2.12.0
|
||||
google-api-core==2.14.0
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.107.0
|
||||
# via -r ingest/google-drive.in
|
||||
|
@ -41,5 +41,5 @@ urllib3==1.26.18
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# requests
|
||||
wrapt==1.15.0
|
||||
wrapt==1.16.0
|
||||
# via deprecated
|
||||
|
3
requirements/ingest/mongodb.in
Normal file
3
requirements/ingest/mongodb.in
Normal file
@ -0,0 +1,3 @@
|
||||
-c ../constraints.in
|
||||
-c ../base.txt
|
||||
pymongo
|
10
requirements/ingest/mongodb.txt
Normal file
10
requirements/ingest/mongodb.txt
Normal file
@ -0,0 +1,10 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --output-file=ingest/mongodb.txt ingest/mongodb.in
|
||||
#
|
||||
dnspython==2.4.2
|
||||
# via pymongo
|
||||
pymongo==4.6.0
|
||||
# via -r ingest/mongodb.in
|
@ -20,7 +20,7 @@ h11==0.14.0
|
||||
# via httpcore
|
||||
htmlbuilder==1.0.0
|
||||
# via -r ingest/notion.in
|
||||
httpcore==1.0.1
|
||||
httpcore==1.0.2
|
||||
# via httpx
|
||||
httpx==0.25.1
|
||||
# via notion-client
|
||||
|
@ -62,7 +62,7 @@ urllib3==1.26.18
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# botocore
|
||||
wrapt==1.15.0
|
||||
wrapt==1.16.0
|
||||
# via aiobotocore
|
||||
yarl==1.9.2
|
||||
# via aiohttp
|
||||
|
@ -8,7 +8,7 @@ appdirs==1.4.4
|
||||
# via label-studio-tools
|
||||
autoflake==2.2.1
|
||||
# via -r test.in
|
||||
black==23.10.1
|
||||
black==23.11.0
|
||||
# via -r test.in
|
||||
certifi==2023.7.22
|
||||
# via
|
||||
@ -56,7 +56,7 @@ mccabe==0.7.0
|
||||
# via flake8
|
||||
multidict==6.0.4
|
||||
# via yarl
|
||||
mypy==1.6.1
|
||||
mypy==1.7.0
|
||||
# via -r test.in
|
||||
mypy-extensions==1.0.0
|
||||
# via
|
||||
@ -103,7 +103,7 @@ requests==2.31.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# label-studio-sdk
|
||||
ruff==0.1.4
|
||||
ruff==0.1.5
|
||||
# via -r test.in
|
||||
six==1.16.0
|
||||
# via
|
||||
@ -140,7 +140,7 @@ urllib3==1.26.18
|
||||
# vcrpy
|
||||
vcrpy==5.1.0
|
||||
# via -r test.in
|
||||
wrapt==1.15.0
|
||||
wrapt==1.16.0
|
||||
# via vcrpy
|
||||
yarl==1.9.2
|
||||
# via vcrpy
|
||||
|
1
setup.py
1
setup.py
@ -160,6 +160,7 @@ setup(
|
||||
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
|
||||
"openai": load_requirements("requirements/ingest/embed-openai.in"),
|
||||
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
|
||||
"mongodb": load_requirements("requirements/ingest/mongodb.in"),
|
||||
},
|
||||
package_dir={"unstructured": "unstructured"},
|
||||
package_data={"unstructured": ["nlp/*.txt"]},
|
||||
|
@ -11,7 +11,6 @@ WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_FOLDER_NAME=azure-cog-search-dest
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(uuidgen)"
|
||||
# The vector configs on the schema currently only exist on versions:
|
||||
# 2023-07-01-Preview, 2021-04-30-Preview, 2020-06-30-Preview
|
||||
@ -42,9 +41,6 @@ function cleanup {
|
||||
# Local file cleanup
|
||||
cleanup_dir "$WORK_DIR"
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
if [ "$CI" == "true" ]; then
|
||||
cleanup_dir "$DOWNLOAD_DIR"
|
||||
fi
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
@ -8,7 +8,6 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=delta-table-dest
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
CI=${CI:-"false"}
|
||||
@ -20,9 +19,6 @@ function cleanup() {
|
||||
cleanup_dir "$DESTINATION_TABLE"
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
if [ "$CI" == "true" ]; then
|
||||
cleanup_dir "$DOWNLOAD_DIR"
|
||||
fi
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
@ -28,9 +28,6 @@ source "$SCRIPT_DIR"/cleanup.sh
|
||||
function cleanup() {
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
if [ "$CI" == "true" ]; then
|
||||
cleanup_dir "$DOWNLOAD_DIR"
|
||||
fi
|
||||
|
||||
echo "deleting test folder $DESTINATION_DROPBOX"
|
||||
curl -X POST https://api.dropboxapi.com/2/files/delete_v2 \
|
||||
|
@ -12,7 +12,6 @@ WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
BUCKET="utic-test-ingest-fixtures-output"
|
||||
DIRECTORY=$(uuidgen)
|
||||
DIRECTORY="test"
|
||||
DESTINATION_GCS="gs://$BUCKET/$DIRECTORY"
|
||||
CI=${CI:-"false"}
|
||||
|
||||
@ -30,9 +29,6 @@ source "$SCRIPT_DIR"/cleanup.sh
|
||||
function cleanup() {
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
if [ "$CI" == "true" ]; then
|
||||
cleanup_dir "$DOWNLOAD_DIR"
|
||||
fi
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-gcs-output.py down \
|
||||
--service-account-file "$GCP_INGEST_SERVICE_KEY_FILE" \
|
||||
|
69
test_unstructured_ingest/dest/mongodb.sh
Executable file
69
test_unstructured_ingest/dest/mongodb.sh
Executable file
@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
DEST_PATH=$(dirname "$(realpath "$0")")
|
||||
SCRIPT_DIR=$(dirname "$DEST_PATH")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=mongodb-dest
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
DESTINATION_MONGO_COLLECTION="utic-test-ingest-fixtures-output-$(uuidgen)"
|
||||
CI=${CI:-"false"}
|
||||
|
||||
if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then
|
||||
echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
function cleanup() {
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
||||
--uri "$MONGODB_URI" \
|
||||
--database "$MONGODB_DATABASE_NAME" \
|
||||
--collection "$DESTINATION_MONGO_COLLECTION" down
|
||||
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
||||
--uri "$MONGODB_URI" \
|
||||
--database "$MONGODB_DATABASE_NAME" \
|
||||
--collection "$DESTINATION_MONGO_COLLECTION" up
|
||||
|
||||
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
||||
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--strategy fast \
|
||||
--verbose \
|
||||
--reprocess \
|
||||
--input-path example-docs/fake-memo.pdf \
|
||||
--work-dir "$WORK_DIR" \
|
||||
--embedding-provider "langchain-huggingface" \
|
||||
mongodb \
|
||||
--uri "$MONGODB_URI" \
|
||||
--database "$MONGODB_DATABASE_NAME" \
|
||||
--collection "$DESTINATION_MONGO_COLLECTION"
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
||||
--uri "$MONGODB_URI" \
|
||||
--database "$MONGODB_DATABASE_NAME" \
|
||||
--collection "$DESTINATION_MONGO_COLLECTION" \
|
||||
check --expected-records 5
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
||||
--uri "$MONGODB_URI" \
|
||||
--database "$MONGODB_DATABASE_NAME" \
|
||||
--collection "$DESTINATION_MONGO_COLLECTION" \
|
||||
check-vector \
|
||||
--output-json "$OUTPUT_ROOT"/structured-output/$OUTPUT_FOLDER_NAME/example-docs/fake-memo.pdf.json
|
@ -18,9 +18,6 @@ source "$SCRIPT_DIR"/cleanup.sh
|
||||
function cleanup() {
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
if [ "$CI" == "true" ]; then
|
||||
cleanup_dir "$DOWNLOAD_DIR"
|
||||
fi
|
||||
|
||||
if aws s3 ls "$DESTINATION_S3" --region us-east-2; then
|
||||
echo "deleting destination s3 location: $DESTINATION_S3"
|
||||
|
150
test_unstructured_ingest/python/test-ingest-mongodb.py
Executable file
150
test_unstructured_ingest/python/test-ingest-mongodb.py
Executable file
@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python
|
||||
import json
|
||||
import time
|
||||
|
||||
import click
|
||||
from pymongo.mongo_client import MongoClient
|
||||
from pymongo.operations import SearchIndexModel
|
||||
|
||||
|
||||
def get_client(uri: str) -> MongoClient:
|
||||
client = MongoClient(uri)
|
||||
client.admin.command("ping")
|
||||
print("Successfully connected to MongoDB")
|
||||
return client
|
||||
|
||||
|
||||
@click.group(name="mongo-ingest")
|
||||
@click.option("--uri", type=str, required=True)
|
||||
@click.option("--database", type=str, required=True)
|
||||
@click.option("--collection", type=str, required=True)
|
||||
@click.pass_context
|
||||
def cli(ctx, uri: str, database: str, collection: str):
|
||||
# ensure that ctx.obj exists and is a dict (in case `cli()` is called
|
||||
# by means other than the `if` block below)
|
||||
ctx.ensure_object(dict)
|
||||
|
||||
ctx.obj["client"] = get_client(uri)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def up(ctx):
|
||||
client = ctx.obj["client"]
|
||||
collection_name = ctx.parent.params["collection"]
|
||||
db = client[ctx.parent.params["database"]]
|
||||
print(f"creating collection {collection_name}")
|
||||
collection = db.create_collection(name=collection_name)
|
||||
print(f"successfully created collection: {collection_name}")
|
||||
if "embeddings" in [c["name"] for c in collection.list_search_indexes()]:
|
||||
print("search index already exists, skipping creation")
|
||||
return
|
||||
|
||||
search_index_name = collection.create_search_index(
|
||||
model=SearchIndexModel(
|
||||
name="embeddings",
|
||||
definition={
|
||||
"mappings": {
|
||||
"dynamic": True,
|
||||
"fields": {
|
||||
"embeddings": [
|
||||
{"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
|
||||
]
|
||||
},
|
||||
}
|
||||
},
|
||||
)
|
||||
)
|
||||
print(f"Added search index: {search_index_name}")
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def down(ctx):
|
||||
collection_name = ctx.parent.params["collection"]
|
||||
client = ctx.obj["client"]
|
||||
db = client[ctx.parent.params["database"]]
|
||||
if collection_name not in db.list_collection_names():
|
||||
print(
|
||||
"collection name {} does not exist amongst those in database: {}, "
|
||||
"skipping deletion".format(collection_name, ", ".join(db.list_collection_names()))
|
||||
)
|
||||
return
|
||||
print(f"deleting collection: {collection_name}")
|
||||
collection = db[collection_name]
|
||||
collection.drop()
|
||||
print(f"successfully deleted collection: {collection}")
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option("--expected-records", type=int, required=True)
|
||||
@click.pass_context
|
||||
def check(ctx, expected_records: int):
|
||||
client = ctx.obj["client"]
|
||||
db = client[ctx.parent.params["database"]]
|
||||
collection = db[ctx.parent.params["collection"]]
|
||||
count = collection.count_documents(filter={})
|
||||
print(f"checking the count in the db ({count}) matches what's expected: {expected_records}")
|
||||
assert (
|
||||
count == expected_records
|
||||
), f"expected count ({expected_records}) does not match how many records were found: {count}"
|
||||
print("successfully checked that the expected number of records was found in the db!")
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option("--output-json", type=click.File())
|
||||
@click.pass_context
|
||||
def check_vector(ctx, output_json):
|
||||
"""
|
||||
Checks the functionality of the vector search index by getting a score based on the
|
||||
exact result of one of the embeddings. Makes sure that the search index itself has finished
|
||||
indexing before running a query, then validated that the first item in the returned sorted
|
||||
list has a score of 1.0 given that the exact embedding is used as a match, and all others
|
||||
have a score less than 1.0.
|
||||
"""
|
||||
# Get the first embedding from the output file
|
||||
json_content = json.load(output_json)
|
||||
exact_embedding = json_content[0]["embeddings"]
|
||||
client = ctx.obj["client"]
|
||||
db = client[ctx.parent.params["database"]]
|
||||
collection = db[ctx.parent.params["collection"]]
|
||||
vector_index_name = "embeddings"
|
||||
status = [ind for ind in collection.list_search_indexes() if ind["name"] == vector_index_name][
|
||||
0
|
||||
].get("status")
|
||||
max_attempts = 30
|
||||
attempts = 0
|
||||
wait_seconds = 5
|
||||
while status != "READY" and attempts < max_attempts:
|
||||
print(
|
||||
f"status of search index: {status}, waiting another {wait_seconds} "
|
||||
f"seconds for it to be ready"
|
||||
)
|
||||
attempts += 1
|
||||
time.sleep(wait_seconds)
|
||||
status = [
|
||||
ind for ind in collection.list_search_indexes() if ind["name"] == vector_index_name
|
||||
][0].get("status")
|
||||
print(f"search index is ready to go ({status}), checking vector content")
|
||||
pipeline = [
|
||||
{
|
||||
"$vectorSearch": {
|
||||
"index": "embeddings",
|
||||
"path": "embeddings",
|
||||
"queryVector": exact_embedding,
|
||||
"numCandidates": 150,
|
||||
"limit": 10,
|
||||
},
|
||||
},
|
||||
{"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
|
||||
]
|
||||
result = list(collection.aggregate(pipeline=pipeline))
|
||||
print(f"vector query result: {result}")
|
||||
assert result[0]["score"] == 1.0, "score detected should be 1: {}".format(result[0]["score"])
|
||||
for r in result[1:]:
|
||||
assert r["score"] < 1.0, "score detected should be less than 1: {}".format(r["score"])
|
||||
print("successfully validated vector content!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
@ -15,6 +15,7 @@ all_tests=(
|
||||
'delta-table.sh'
|
||||
'dropbox.sh'
|
||||
'gcs.sh'
|
||||
'mongodb.sh'
|
||||
's3.sh'
|
||||
)
|
||||
|
||||
|
@ -28,6 +28,7 @@ from .gitlab import get_base_src_cmd as gitlab_base_src_cmd
|
||||
from .google_drive import get_base_src_cmd as google_drive_base_src_cmd
|
||||
from .jira import get_base_src_cmd as jira_base_src_cmd
|
||||
from .local import get_base_src_cmd as local_base_src_cmd
|
||||
from .mongodb import get_base_dest_cmd as mongo_base_dest_cmd
|
||||
from .notion import get_base_src_cmd as notion_base_src_cmd
|
||||
from .onedrive import get_base_src_cmd as onedrive_base_src_cmd
|
||||
from .outlook import get_base_src_cmd as outlook_base_src_cmd
|
||||
@ -89,6 +90,7 @@ base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
|
||||
s3_base_dest_cmd,
|
||||
azure_cognitive_search_base_dest_cmd,
|
||||
delta_table_dest_cmd,
|
||||
mongo_base_dest_cmd,
|
||||
]
|
||||
|
||||
# Make sure there are not overlapping names
|
||||
|
60
unstructured/ingest/cli/cmds/mongodb.py
Normal file
60
unstructured/ingest/cli/cmds/mongodb.py
Normal file
@ -0,0 +1,60 @@
|
||||
import typing as t
|
||||
from dataclasses import dataclass
|
||||
|
||||
import click
|
||||
|
||||
from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString, Dict
|
||||
from unstructured.ingest.connector.mongodb import MongoDBWriteConfig, SimpleMongoDBStorageConfig
|
||||
|
||||
CMD_NAME = "mongodb"
|
||||
|
||||
|
||||
@dataclass
|
||||
class MongoDBCliConfig(SimpleMongoDBStorageConfig, CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> t.List[click.Option]:
|
||||
options = [
|
||||
click.Option(
|
||||
["--uri"],
|
||||
help="URI to user when connecting",
|
||||
),
|
||||
click.Option(
|
||||
["--host"],
|
||||
type=DelimitedString(),
|
||||
help="hostname or IP address or Unix domain socket path of a single mongod or "
|
||||
"mongos instance to connect to, or a list of hostnames",
|
||||
),
|
||||
click.Option(["--port"], type=int, default=27017),
|
||||
click.Option(
|
||||
["--client-params"],
|
||||
type=Dict(),
|
||||
help="additional parameters to use when creating mongo client",
|
||||
),
|
||||
]
|
||||
return options
|
||||
|
||||
|
||||
@dataclass
|
||||
class MongoDBCliWriteConfig(MongoDBWriteConfig, CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> t.List[click.Option]:
|
||||
options = [
|
||||
click.Option(
|
||||
["--database"], type=str, required=True, help="database name to connect to"
|
||||
),
|
||||
click.Option(
|
||||
["--collection"], required=True, type=str, help="collection name to connect to"
|
||||
),
|
||||
]
|
||||
return options
|
||||
|
||||
|
||||
def get_base_dest_cmd():
|
||||
from unstructured.ingest.cli.base.dest import BaseDestCmd
|
||||
|
||||
cmd_cls = BaseDestCmd(
|
||||
cmd_name=CMD_NAME,
|
||||
cli_config=MongoDBCliConfig,
|
||||
additional_cli_options=[MongoDBCliWriteConfig],
|
||||
)
|
||||
return cmd_cls
|
111
unstructured/ingest/connector/mongodb.py
Normal file
111
unstructured/ingest/connector/mongodb.py
Normal file
@ -0,0 +1,111 @@
|
||||
import json
|
||||
import typing as t
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from unstructured.ingest.error import DestinationConnectionError, WriteError
|
||||
from unstructured.ingest.interfaces import (
|
||||
BaseConnectorConfig,
|
||||
BaseDestinationConnector,
|
||||
BaseIngestDoc,
|
||||
WriteConfig,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from pymongo import MongoClient
|
||||
|
||||
|
||||
SERVER_API_VERSION = "1"
|
||||
|
||||
|
||||
@dataclass
|
||||
class SimpleMongoDBStorageConfig(BaseConnectorConfig):
|
||||
uri: t.Optional[str] = None
|
||||
host: t.Optional[str] = None
|
||||
port: int = 27017
|
||||
client_params: t.Dict[str, t.Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
class MongoDBWriteConfig(WriteConfig):
|
||||
database: str
|
||||
collection: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class MongoDBDestinationConnector(BaseDestinationConnector):
|
||||
write_config: MongoDBWriteConfig
|
||||
connector_config: SimpleMongoDBStorageConfig
|
||||
_client: t.Optional["MongoClient"] = field(init=False, default=None)
|
||||
|
||||
@requires_dependencies(["pymongo"], extras="mongodb")
|
||||
def generate_client(self) -> "MongoClient":
|
||||
from pymongo import MongoClient
|
||||
from pymongo.server_api import ServerApi
|
||||
|
||||
if self.connector_config.uri:
|
||||
return MongoClient(
|
||||
self.connector_config.uri,
|
||||
server_api=ServerApi(version=SERVER_API_VERSION),
|
||||
**self.connector_config.client_params,
|
||||
)
|
||||
else:
|
||||
return MongoClient(
|
||||
host=self.connector_config.host,
|
||||
port=self.connector_config.port,
|
||||
server_api=ServerApi(version=SERVER_API_VERSION),
|
||||
**self.connector_config.client_params,
|
||||
)
|
||||
|
||||
@property
|
||||
def client(self) -> "MongoClient":
|
||||
if self._client is None:
|
||||
self._client = self.generate_client()
|
||||
return self._client
|
||||
|
||||
@requires_dependencies(["pymongo"], extras="mongodb")
|
||||
def check_connection(self):
|
||||
try:
|
||||
self.client.admin.command("ping")
|
||||
except Exception as e:
|
||||
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
||||
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
||||
|
||||
def initialize(self):
|
||||
_ = self.client
|
||||
|
||||
def conform_dict(self, data: dict) -> None:
|
||||
pass
|
||||
|
||||
def get_collection(self):
|
||||
database = self.client[self.write_config.database]
|
||||
return database.get_collection(name=self.write_config.collection)
|
||||
|
||||
@requires_dependencies(["pymongo"], extras="mongodb")
|
||||
def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
|
||||
logger.info(
|
||||
f"writing {len(elements_dict)} documents to destination "
|
||||
f"database {self.write_config.database}, at collection {self.write_config.collection}",
|
||||
)
|
||||
|
||||
collection = self.get_collection()
|
||||
try:
|
||||
collection.insert_many(elements_dict)
|
||||
except Exception as e:
|
||||
logger.error(f"failed to write records: {e}", exc_info=True)
|
||||
raise WriteError(f"failed to write records: {e}")
|
||||
|
||||
def write(self, docs: t.List[BaseIngestDoc]) -> None:
|
||||
json_list: t.List[t.Dict[str, t.Any]] = []
|
||||
for doc in docs:
|
||||
local_path = doc._output_filename
|
||||
with open(local_path) as json_file:
|
||||
json_content = json.load(json_file)
|
||||
for content in json_content:
|
||||
self.conform_dict(data=content)
|
||||
logger.info(
|
||||
f"appending {len(json_content)} json elements from content in {local_path}",
|
||||
)
|
||||
json_list.extend(json_content)
|
||||
self.write_dict(elements_dict=json_list)
|
@ -6,6 +6,7 @@ from .box import box_writer
|
||||
from .delta_table import delta_table_writer
|
||||
from .dropbox import dropbox_writer
|
||||
from .gcs import gcs_writer
|
||||
from .mongodb import mongodb_writer
|
||||
from .s3 import s3_writer
|
||||
|
||||
writer_map: t.Dict[str, t.Callable] = {
|
||||
@ -15,6 +16,7 @@ writer_map: t.Dict[str, t.Callable] = {
|
||||
"delta_table": delta_table_writer,
|
||||
"dropbox": dropbox_writer,
|
||||
"gcs": gcs_writer,
|
||||
"mongodb": mongodb_writer,
|
||||
"s3": s3_writer,
|
||||
}
|
||||
|
||||
|
29
unstructured/ingest/runner/writers/mongodb.py
Normal file
29
unstructured/ingest/runner/writers/mongodb.py
Normal file
@ -0,0 +1,29 @@
|
||||
import typing as t
|
||||
|
||||
from unstructured.ingest.interfaces import BaseDestinationConnector
|
||||
|
||||
|
||||
def mongodb_writer(
|
||||
database: str,
|
||||
collection: str,
|
||||
upsert: bool = False,
|
||||
uri: t.Optional[str] = None,
|
||||
host: t.Optional[str] = None,
|
||||
port: int = 27017,
|
||||
client_params: t.Optional[t.Dict[str, t.Any]] = None,
|
||||
verbose: bool = False,
|
||||
**kwargs,
|
||||
) -> BaseDestinationConnector:
|
||||
client_params = client_params if client_params else {}
|
||||
from unstructured.ingest.connector.mongodb import (
|
||||
MongoDBDestinationConnector,
|
||||
MongoDBWriteConfig,
|
||||
SimpleMongoDBStorageConfig,
|
||||
)
|
||||
|
||||
return MongoDBDestinationConnector(
|
||||
write_config=MongoDBWriteConfig(database=database, collection=collection),
|
||||
connector_config=SimpleMongoDBStorageConfig(
|
||||
uri=uri, host=host, port=port, client_params=client_params
|
||||
),
|
||||
)
|
Loading…
x
Reference in New Issue
Block a user