add mongo db destination connector (#2068)

### Description
This adds a basic implementation that pushes the JSON output generated by
partition to MongoDB. None of this code provisions the MongoDB instance, so
things like adding a search index over the embedding content must be done by
the user. Any schema validation would likewise have to be set up through
user-specified configuration on the database. This update makes no assumptions
about the configuration of the database itself.
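
For example, a user who wants vector search over the embedded content would create the search index themselves. A minimal sketch with `pymongo`, mirroring the index definition used by this PR's ingest test (the URI, database, and collection values are placeholders):

```python
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

# Placeholders: point at the deployment the connector writes to.
client = MongoClient("<MONGODB_URI>")
collection = client["<database>"]["<collection>"]

# Index the `embeddings` field the connector writes out; 384 dimensions and
# euclidean similarity match the test fixture in this PR.
collection.create_search_index(
    model=SearchIndexModel(
        name="embeddings",
        definition={
            "mappings": {
                "dynamic": True,
                "fields": {
                    "embeddings": [
                        {"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
                    ]
                },
            }
        },
    )
)
```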
Roman Isecke 2023-11-16 14:40:22 -08:00 committed by GitHub
parent ead2a7f1eb
commit b8af2f18bb
38 changed files with 542 additions and 69 deletions

View File

@ -446,6 +446,8 @@ jobs:
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
MONGODB_URI: ${{ secrets.MONGODB_URI }}
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
TABLE_OCR: "tesseract"
OCR_AGENT: "tesseract"
CI: "true"

View File

@ -8,6 +8,7 @@
### Features
* **Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation.
* **MongoDB Destination Connector.** New destination connector added to all CLI ingest commands to support writing partitioned JSON output to MongoDB.
### Fixes

View File

@ -0,0 +1,68 @@
MongoDB
======================
Batch process all your records using ``unstructured-ingest`` to store structured outputs locally on your filesystem and upload those local files to a MongoDB collection.
First you'll need to install the MongoDB dependencies as shown here.
.. code:: shell
pip install "unstructured[mongodb]"
Run Locally
-----------
The upstream connector can be any of the ones supported; for convenience, the examples below use the
local upstream connector.
.. tabs::
.. tab:: Shell
.. code:: shell
unstructured-ingest \
local \
--input-path example-docs/fake-memo.pdf \
--anonymous \
--output-dir local-output-to-mongo \
--num-processes 2 \
--verbose \
--strategy fast \
mongodb \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION"
.. tab:: Python
.. code:: python
import os
from unstructured.ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig
from unstructured.ingest.runner import LocalRunner
if __name__ == "__main__":
runner = LocalRunner(
processor_config=ProcessorConfig(
verbose=True,
output_dir="local-output-to-mongo",
num_processes=2,
),
read_config=ReadConfig(),
partition_config=PartitionConfig(),
writer_type="mongodb",
writer_kwargs={
"uri": os.getenv("MONGODB_URI"),
"database": os.getenv("MONGODB_DATABASE_NAME"),
"collection": os.getenv("DESTINATION_MONGO_COLLECTION")
}
)
runner.run(
input_path="example-docs/fake-memo.pdf",
)
For a full list of the options the CLI accepts, check ``unstructured-ingest <upstream connector> mongodb --help``.
NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you're running this locally. You can find more information about this in the `installation guide <https://unstructured-io.github.io/unstructured/installing.html>`_.
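Once a run completes, you can spot-check that the elements landed in the collection with a short ``pymongo`` query. A minimal sketch, reusing the environment variables from the examples above:

.. code:: python

    import os

    from pymongo import MongoClient

    client = MongoClient(os.getenv("MONGODB_URI"))
    collection = client[os.getenv("MONGODB_DATABASE_NAME")][
        os.getenv("DESTINATION_MONGO_COLLECTION")
    ]
    # One document is written per partitioned element.
    print(collection.count_documents(filter={}))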

View File

@ -18,7 +18,7 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via nltk
dataclasses-json==0.6.1
dataclasses-json==0.6.2
# via -r base.in
emoji==2.8.0
# via -r base.in

View File

@ -180,7 +180,7 @@ jupyterlab==4.0.8
# via notebook
jupyterlab-pygments==0.2.2
# via nbconvert
jupyterlab-server==2.25.0
jupyterlab-server==2.25.1
# via
# jupyterlab
# notebook
@ -253,7 +253,7 @@ pre-commit==3.5.0
# via -r dev.in
prometheus-client==0.18.0
# via jupyter-server
prompt-toolkit==3.0.39
prompt-toolkit==3.0.40
# via
# ipython
# jupyter-console
@ -340,7 +340,7 @@ soupsieve==2.5
# beautifulsoup4
stack-data==0.6.3
# via ipython
terminado==0.17.1
terminado==0.18.0
# via
# jupyter-server
# jupyter-server-terminals

View File

@ -8,7 +8,7 @@ attrdict==2.0.1
# via unstructured-paddleocr
babel==2.13.1
# via flask-babel
bce-python-sdk==0.8.92
bce-python-sdk==0.8.96
# via visualdl
blinker==1.7.0
# via flask

View File

@ -154,7 +154,7 @@ pyparsing==3.0.9
# via
# -c constraints.in
# matplotlib
pypdfium2==4.23.1
pypdfium2==4.24.0
# via pdfplumber
pytesseract==0.3.10
# via layoutparser

View File

@ -7,16 +7,13 @@
azure-common==1.1.28
# via azure-search-documents
azure-core==1.29.5
# via
# azure-search-documents
# msrest
azure-search-documents==11.3.0
# via azure-search-documents
azure-search-documents==11.4.0
# via -r ingest/azure-cognitive-search.in
certifi==2023.7.22
# via
# -c ingest/../base.txt
# -c ingest/../constraints.in
# msrest
# requests
charset-normalizer==3.3.2
# via
@ -27,19 +24,11 @@ idna==3.4
# -c ingest/../base.txt
# requests
isodate==0.6.1
# via msrest
msrest==0.7.1
# via azure-search-documents
oauthlib==3.2.2
# via requests-oauthlib
requests==2.31.0
# via
# -c ingest/../base.txt
# azure-core
# msrest
# requests-oauthlib
requests-oauthlib==1.3.1
# via msrest
six==1.16.0
# via
# -c ingest/../base.txt
@ -49,7 +38,6 @@ typing-extensions==4.8.0
# via
# -c ingest/../base.txt
# azure-core
# azure-search-documents
urllib3==1.26.18
# via
# -c ingest/../base.txt

View File

@ -23,7 +23,7 @@ azure-datalake-store==0.0.53
# via adlfs
azure-identity==1.15.0
# via adlfs
azure-storage-blob==12.18.3
azure-storage-blob==12.19.0
# via adlfs
certifi==2023.7.22
# via

View File

@ -41,5 +41,5 @@ urllib3==1.26.18
# -c ingest/../base.txt
# -c ingest/../constraints.in
# requests
wrapt==1.15.0
wrapt==1.16.0
# via deprecated

View File

@ -15,5 +15,5 @@ numpy==1.24.4
# -c ingest/../base.txt
# -c ingest/../constraints.in
# pyarrow
pyarrow==14.0.0
pyarrow==14.0.1
# via deltalake

View File

@ -11,7 +11,7 @@ certifi==2023.7.22
# elastic-transport
elastic-transport==8.10.0
# via elasticsearch
elasticsearch==8.10.1
elasticsearch==8.11.0
# via -r ingest/elasticsearch.in
jq==1.6.0
# via -r ingest/elasticsearch.in

View File

@ -37,7 +37,7 @@ charset-normalizer==3.3.2
# -c ingest/../base.txt
# aiohttp
# requests
dataclasses-json==0.6.1
dataclasses-json==0.6.2
# via
# -c ingest/../base.txt
# langchain
@ -47,8 +47,6 @@ frozenlist==1.4.0
# via
# aiohttp
# aiosignal
greenlet==3.0.1
# via sqlalchemy
idna==3.4
# via
# -c ingest/../base.txt
@ -63,9 +61,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.331
langchain==0.0.335
# via -r ingest/embed-aws-bedrock.in
langsmith==0.0.60
langsmith==0.0.63
# via langchain
marshmallow==3.20.1
# via

View File

@ -32,7 +32,7 @@ click==8.1.7
# via
# -c ingest/../base.txt
# nltk
dataclasses-json==0.6.1
dataclasses-json==0.6.2
# via
# -c ingest/../base.txt
# langchain
@ -52,8 +52,6 @@ fsspec==2023.9.1
# -c ingest/../constraints.in
# huggingface-hub
# torch
greenlet==3.0.1
# via sqlalchemy
huggingface==0.0.1
# via -r ingest/embed-huggingface.in
huggingface-hub==0.17.3
@ -78,9 +76,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.331
langchain==0.0.335
# via -r ingest/embed-huggingface.in
langsmith==0.0.60
langsmith==0.0.63
# via langchain
markupsafe==2.1.3
# via jinja2

View File

@ -32,7 +32,7 @@ charset-normalizer==3.3.2
# -c ingest/../base.txt
# aiohttp
# requests
dataclasses-json==0.6.1
dataclasses-json==0.6.2
# via
# -c ingest/../base.txt
# langchain
@ -44,11 +44,9 @@ frozenlist==1.4.0
# via
# aiohttp
# aiosignal
greenlet==3.0.1
# via sqlalchemy
h11==0.14.0
# via httpcore
httpcore==1.0.1
httpcore==1.0.2
# via httpx
httpx==0.25.1
# via openai
@ -63,9 +61,9 @@ jsonpatch==1.33
# via langchain
jsonpointer==2.4
# via jsonpatch
langchain==0.0.331
langchain==0.0.335
# via -r ingest/embed-openai.in
langsmith==0.0.60
langsmith==0.0.63
# via langchain
marshmallow==3.20.1
# via
@ -84,7 +82,7 @@ numpy==1.24.4
# -c ingest/../base.txt
# -c ingest/../constraints.in
# langchain
openai==1.1.1
openai==1.2.3
# via -r ingest/embed-openai.in
packaging==23.2
# via

View File

@ -43,7 +43,7 @@ fsspec==2023.9.1
# gcsfs
gcsfs==2023.9.1
# via -r ingest/gcs.in
google-api-core==2.12.0
google-api-core==2.14.0
# via
# google-cloud-core
# google-cloud-storage

View File

@ -53,5 +53,5 @@ urllib3==1.26.18
# -c ingest/../constraints.in
# pygithub
# requests
wrapt==1.15.0
wrapt==1.16.0
# via deprecated

View File

@ -15,7 +15,7 @@ charset-normalizer==3.3.2
# via
# -c ingest/../base.txt
# requests
google-api-core==2.12.0
google-api-core==2.14.0
# via google-api-python-client
google-api-python-client==2.107.0
# via -r ingest/google-drive.in

View File

@ -41,5 +41,5 @@ urllib3==1.26.18
# -c ingest/../base.txt
# -c ingest/../constraints.in
# requests
wrapt==1.15.0
wrapt==1.16.0
# via deprecated

View File

@ -0,0 +1,3 @@
-c ../constraints.in
-c ../base.txt
pymongo

View File

@ -0,0 +1,10 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --output-file=ingest/mongodb.txt ingest/mongodb.in
#
dnspython==2.4.2
# via pymongo
pymongo==4.6.0
# via -r ingest/mongodb.in

View File

@ -20,7 +20,7 @@ h11==0.14.0
# via httpcore
htmlbuilder==1.0.0
# via -r ingest/notion.in
httpcore==1.0.1
httpcore==1.0.2
# via httpx
httpx==0.25.1
# via notion-client

View File

@ -62,7 +62,7 @@ urllib3==1.26.18
# -c ingest/../base.txt
# -c ingest/../constraints.in
# botocore
wrapt==1.15.0
wrapt==1.16.0
# via aiobotocore
yarl==1.9.2
# via aiohttp

View File

@ -8,7 +8,7 @@ appdirs==1.4.4
# via label-studio-tools
autoflake==2.2.1
# via -r test.in
black==23.10.1
black==23.11.0
# via -r test.in
certifi==2023.7.22
# via
@ -56,7 +56,7 @@ mccabe==0.7.0
# via flake8
multidict==6.0.4
# via yarl
mypy==1.6.1
mypy==1.7.0
# via -r test.in
mypy-extensions==1.0.0
# via
@ -103,7 +103,7 @@ requests==2.31.0
# via
# -c base.txt
# label-studio-sdk
ruff==0.1.4
ruff==0.1.5
# via -r test.in
six==1.16.0
# via
@ -140,7 +140,7 @@ urllib3==1.26.18
# vcrpy
vcrpy==5.1.0
# via -r test.in
wrapt==1.15.0
wrapt==1.16.0
# via vcrpy
yarl==1.9.2
# via vcrpy

View File

@ -160,6 +160,7 @@ setup(
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
"openai": load_requirements("requirements/ingest/embed-openai.in"),
"bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"),
"mongodb": load_requirements("requirements/ingest/mongodb.in"),
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},

View File

@ -11,7 +11,6 @@ WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
OUTPUT_FOLDER_NAME=azure-cog-search-dest
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(uuidgen)"
# The vector configs on the schema currently only exist on versions:
# 2023-07-01-Preview, 2021-04-30-Preview, 2020-06-30-Preview
@ -42,9 +41,6 @@ function cleanup {
# Local file cleanup
cleanup_dir "$WORK_DIR"
cleanup_dir "$OUTPUT_DIR"
if [ "$CI" == "true" ]; then
cleanup_dir "$DOWNLOAD_DIR"
fi
}
trap cleanup EXIT

View File

@ -8,7 +8,6 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=delta-table-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
CI=${CI:-"false"}
@ -20,9 +19,6 @@ function cleanup() {
cleanup_dir "$DESTINATION_TABLE"
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
if [ "$CI" == "true" ]; then
cleanup_dir "$DOWNLOAD_DIR"
fi
}
trap cleanup EXIT

View File

@ -28,9 +28,6 @@ source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
if [ "$CI" == "true" ]; then
cleanup_dir "$DOWNLOAD_DIR"
fi
echo "deleting test folder $DESTINATION_DROPBOX"
curl -X POST https://api.dropboxapi.com/2/files/delete_v2 \

View File

@ -12,7 +12,6 @@ WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
BUCKET="utic-test-ingest-fixtures-output"
DIRECTORY=$(uuidgen)
DIRECTORY="test"
DESTINATION_GCS="gs://$BUCKET/$DIRECTORY"
CI=${CI:-"false"}
@ -30,9 +29,6 @@ source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
if [ "$CI" == "true" ]; then
cleanup_dir "$DOWNLOAD_DIR"
fi
python "$SCRIPT_DIR"/python/test-gcs-output.py down \
--service-account-file "$GCP_INGEST_SERVICE_KEY_FILE" \

View File

@ -0,0 +1,69 @@
#!/usr/bin/env bash
set -e
DEST_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$DEST_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=mongodb-dest
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
DESTINATION_MONGO_COLLECTION="utic-test-ingest-fixtures-output-$(uuidgen)"
CI=${CI:-"false"}
if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then
echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set."
exit 0
fi
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" down
}
trap cleanup EXIT
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" up
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
--reprocess \
--input-path example-docs/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--embedding-provider "langchain-huggingface" \
mongodb \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION"
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" \
check --expected-records 5
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" \
check-vector \
--output-json "$OUTPUT_ROOT"/structured-output/$OUTPUT_FOLDER_NAME/example-docs/fake-memo.pdf.json

View File

@ -18,9 +18,6 @@ source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
if [ "$CI" == "true" ]; then
cleanup_dir "$DOWNLOAD_DIR"
fi
if aws s3 ls "$DESTINATION_S3" --region us-east-2; then
echo "deleting destination s3 location: $DESTINATION_S3"

View File

@ -0,0 +1,150 @@
#!/usr/bin/env python
import json
import time
import click
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel
def get_client(uri: str) -> MongoClient:
client = MongoClient(uri)
client.admin.command("ping")
print("Successfully connected to MongoDB")
return client
@click.group(name="mongo-ingest")
@click.option("--uri", type=str, required=True)
@click.option("--database", type=str, required=True)
@click.option("--collection", type=str, required=True)
@click.pass_context
def cli(ctx, uri: str, database: str, collection: str):
# ensure that ctx.obj exists and is a dict (in case `cli()` is called
# by means other than the `if` block below)
ctx.ensure_object(dict)
ctx.obj["client"] = get_client(uri)
@cli.command()
@click.pass_context
def up(ctx):
client = ctx.obj["client"]
collection_name = ctx.parent.params["collection"]
db = client[ctx.parent.params["database"]]
print(f"creating collection {collection_name}")
collection = db.create_collection(name=collection_name)
print(f"successfully created collection: {collection_name}")
if "embeddings" in [c["name"] for c in collection.list_search_indexes()]:
print("search index already exists, skipping creation")
return
search_index_name = collection.create_search_index(
model=SearchIndexModel(
name="embeddings",
definition={
"mappings": {
"dynamic": True,
"fields": {
"embeddings": [
{"type": "knnVector", "dimensions": 384, "similarity": "euclidean"}
]
},
}
},
)
)
print(f"Added search index: {search_index_name}")
@cli.command()
@click.pass_context
def down(ctx):
collection_name = ctx.parent.params["collection"]
client = ctx.obj["client"]
db = client[ctx.parent.params["database"]]
if collection_name not in db.list_collection_names():
print(
"collection name {} does not exist amongst those in database: {}, "
"skipping deletion".format(collection_name, ", ".join(db.list_collection_names()))
)
return
print(f"deleting collection: {collection_name}")
collection = db[collection_name]
collection.drop()
print(f"successfully deleted collection: {collection}")
@cli.command()
@click.option("--expected-records", type=int, required=True)
@click.pass_context
def check(ctx, expected_records: int):
client = ctx.obj["client"]
db = client[ctx.parent.params["database"]]
collection = db[ctx.parent.params["collection"]]
count = collection.count_documents(filter={})
print(f"checking the count in the db ({count}) matches what's expected: {expected_records}")
assert (
count == expected_records
), f"expected count ({expected_records}) does not match how many records were found: {count}"
print("successfully checked that the expected number of records was found in the db!")
@cli.command()
@click.option("--output-json", type=click.File())
@click.pass_context
def check_vector(ctx, output_json):
"""
Checks the functionality of the vector search index by querying with the
exact embedding of one of the records. Makes sure that the search index
itself has finished indexing before running the query, then validates that
the first item in the returned sorted list has a score of 1.0 (since the
exact embedding is used as the match) and that all others score below 1.0.
"""
# Get the first embedding from the output file
json_content = json.load(output_json)
exact_embedding = json_content[0]["embeddings"]
client = ctx.obj["client"]
db = client[ctx.parent.params["database"]]
collection = db[ctx.parent.params["collection"]]
vector_index_name = "embeddings"
status = [ind for ind in collection.list_search_indexes() if ind["name"] == vector_index_name][
0
].get("status")
max_attempts = 30
attempts = 0
wait_seconds = 5
while status != "READY" and attempts < max_attempts:
print(
f"status of search index: {status}, waiting another {wait_seconds} "
f"seconds for it to be ready"
)
attempts += 1
time.sleep(wait_seconds)
status = [
ind for ind in collection.list_search_indexes() if ind["name"] == vector_index_name
][0].get("status")
print(f"search index is ready to go ({status}), checking vector content")
pipeline = [
{
"$vectorSearch": {
"index": "embeddings",
"path": "embeddings",
"queryVector": exact_embedding,
"numCandidates": 150,
"limit": 10,
},
},
{"$project": {"_id": 0, "text": 1, "score": {"$meta": "vectorSearchScore"}}},
]
result = list(collection.aggregate(pipeline=pipeline))
print(f"vector query result: {result}")
assert result[0]["score"] == 1.0, "score detected should be 1: {}".format(result[0]["score"])
for r in result[1:]:
assert r["score"] < 1.0, "score detected should be less than 1: {}".format(r["score"])
print("successfully validated vector content!")
if __name__ == "__main__":
cli()

View File

@ -15,6 +15,7 @@ all_tests=(
'delta-table.sh'
'dropbox.sh'
'gcs.sh'
'mongodb.sh'
's3.sh'
)

View File

@ -28,6 +28,7 @@ from .gitlab import get_base_src_cmd as gitlab_base_src_cmd
from .google_drive import get_base_src_cmd as google_drive_base_src_cmd
from .jira import get_base_src_cmd as jira_base_src_cmd
from .local import get_base_src_cmd as local_base_src_cmd
from .mongodb import get_base_dest_cmd as mongo_base_dest_cmd
from .notion import get_base_src_cmd as notion_base_src_cmd
from .onedrive import get_base_src_cmd as onedrive_base_src_cmd
from .outlook import get_base_src_cmd as outlook_base_src_cmd
@ -89,6 +90,7 @@ base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
s3_base_dest_cmd,
azure_cognitive_search_base_dest_cmd,
delta_table_dest_cmd,
mongo_base_dest_cmd,
]
# Make sure there are not overlapping names

View File

@ -0,0 +1,60 @@
import typing as t
from dataclasses import dataclass
import click
from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString, Dict
from unstructured.ingest.connector.mongodb import MongoDBWriteConfig, SimpleMongoDBStorageConfig
CMD_NAME = "mongodb"
@dataclass
class MongoDBCliConfig(SimpleMongoDBStorageConfig, CliConfig):
@staticmethod
def get_cli_options() -> t.List[click.Option]:
options = [
click.Option(
["--uri"],
help="URI to user when connecting",
),
click.Option(
["--host"],
type=DelimitedString(),
help="hostname or IP address or Unix domain socket path of a single mongod or "
"mongos instance to connect to, or a list of hostnames",
),
click.Option(["--port"], type=int, default=27017),
click.Option(
["--client-params"],
type=Dict(),
help="additional parameters to use when creating mongo client",
),
]
return options
@dataclass
class MongoDBCliWriteConfig(MongoDBWriteConfig, CliConfig):
@staticmethod
def get_cli_options() -> t.List[click.Option]:
options = [
click.Option(
["--database"], type=str, required=True, help="database name to connect to"
),
click.Option(
["--collection"], required=True, type=str, help="collection name to connect to"
),
]
return options
def get_base_dest_cmd():
from unstructured.ingest.cli.base.dest import BaseDestCmd
cmd_cls = BaseDestCmd(
cmd_name=CMD_NAME,
cli_config=MongoDBCliConfig,
additional_cli_options=[MongoDBCliWriteConfig],
)
return cmd_cls

View File

@ -0,0 +1,111 @@
import json
import typing as t
from dataclasses import dataclass, field
from unstructured.ingest.error import DestinationConnectionError, WriteError
from unstructured.ingest.interfaces import (
BaseConnectorConfig,
BaseDestinationConnector,
BaseIngestDoc,
WriteConfig,
)
from unstructured.ingest.logger import logger
from unstructured.utils import requires_dependencies
if t.TYPE_CHECKING:
from pymongo import MongoClient
SERVER_API_VERSION = "1"
@dataclass
class SimpleMongoDBStorageConfig(BaseConnectorConfig):
uri: t.Optional[str] = None
host: t.Optional[str] = None
port: int = 27017
client_params: t.Dict[str, t.Any] = field(default_factory=dict)
@dataclass
class MongoDBWriteConfig(WriteConfig):
database: str
collection: str
@dataclass
class MongoDBDestinationConnector(BaseDestinationConnector):
write_config: MongoDBWriteConfig
connector_config: SimpleMongoDBStorageConfig
_client: t.Optional["MongoClient"] = field(init=False, default=None)
@requires_dependencies(["pymongo"], extras="mongodb")
def generate_client(self) -> "MongoClient":
from pymongo import MongoClient
from pymongo.server_api import ServerApi
if self.connector_config.uri:
return MongoClient(
self.connector_config.uri,
server_api=ServerApi(version=SERVER_API_VERSION),
**self.connector_config.client_params,
)
else:
return MongoClient(
host=self.connector_config.host,
port=self.connector_config.port,
server_api=ServerApi(version=SERVER_API_VERSION),
**self.connector_config.client_params,
)
@property
def client(self) -> "MongoClient":
if self._client is None:
self._client = self.generate_client()
return self._client
@requires_dependencies(["pymongo"], extras="mongodb")
def check_connection(self):
try:
self.client.admin.command("ping")
except Exception as e:
logger.error(f"failed to validate connection: {e}", exc_info=True)
raise DestinationConnectionError(f"failed to validate connection: {e}")
def initialize(self):
_ = self.client
def conform_dict(self, data: dict) -> None:
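        # No-op hook: elements are written to the collection as-is; override
        # this to adjust each record before it is inserted.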
pass
def get_collection(self):
database = self.client[self.write_config.database]
return database.get_collection(name=self.write_config.collection)
@requires_dependencies(["pymongo"], extras="mongodb")
def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
logger.info(
f"writing {len(elements_dict)} documents to destination "
f"database {self.write_config.database}, at collection {self.write_config.collection}",
)
collection = self.get_collection()
try:
collection.insert_many(elements_dict)
except Exception as e:
logger.error(f"failed to write records: {e}", exc_info=True)
raise WriteError(f"failed to write records: {e}")
def write(self, docs: t.List[BaseIngestDoc]) -> None:
json_list: t.List[t.Dict[str, t.Any]] = []
for doc in docs:
local_path = doc._output_filename
with open(local_path) as json_file:
json_content = json.load(json_file)
for content in json_content:
self.conform_dict(data=content)
logger.info(
f"appending {len(json_content)} json elements from content in {local_path}",
)
json_list.extend(json_content)
self.write_dict(elements_dict=json_list)

View File

@ -6,6 +6,7 @@ from .box import box_writer
from .delta_table import delta_table_writer
from .dropbox import dropbox_writer
from .gcs import gcs_writer
from .mongodb import mongodb_writer
from .s3 import s3_writer
writer_map: t.Dict[str, t.Callable] = {
@ -15,6 +16,7 @@ writer_map: t.Dict[str, t.Callable] = {
"delta_table": delta_table_writer,
"dropbox": dropbox_writer,
"gcs": gcs_writer,
"mongodb": mongodb_writer,
"s3": s3_writer,
}

View File

@ -0,0 +1,29 @@
import typing as t
from unstructured.ingest.interfaces import BaseDestinationConnector
def mongodb_writer(
database: str,
collection: str,
upsert: bool = False,
uri: t.Optional[str] = None,
host: t.Optional[str] = None,
port: int = 27017,
client_params: t.Optional[t.Dict[str, t.Any]] = None,
verbose: bool = False,
**kwargs,
) -> BaseDestinationConnector:
client_params = client_params if client_params else {}
from unstructured.ingest.connector.mongodb import (
MongoDBDestinationConnector,
MongoDBWriteConfig,
SimpleMongoDBStorageConfig,
)
return MongoDBDestinationConnector(
write_config=MongoDBWriteConfig(database=database, collection=collection),
connector_config=SimpleMongoDBStorageConfig(
uri=uri, host=host, port=port, client_params=client_params
),
)
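
For reference, a minimal sketch of how the writer registered above is resolved and used; the module path is inferred from the relative imports shown, and the database, collection, and URI values are placeholders:

```python
from unstructured.ingest.runner.writers import writer_map

# Look up the writer registered under "mongodb" and build the destination connector.
writer = writer_map["mongodb"]
connector = writer(
    database="example-db",            # placeholder
    collection="example-collection",  # placeholder
    uri="<MONGODB_URI>",              # placeholder
)
connector.initialize()  # builds the lazily-created MongoClient
```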