mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-17 21:29:05 +00:00
rfctr: implement mongodb v2 destination connector (#3313)
This PR provides support for V2 mongodb destination connector.
This commit is contained in:
parent
c28deffbc4
commit
da29242dbd
@ -1,4 +1,4 @@
|
|||||||
## 0.14.10-dev5
|
## 0.14.10-dev6
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
# shellcheck disable=SC2012
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
@ -67,9 +68,10 @@ python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
|||||||
--collection "$DESTINATION_MONGO_COLLECTION" \
|
--collection "$DESTINATION_MONGO_COLLECTION" \
|
||||||
check --expected-records 5
|
check --expected-records 5
|
||||||
|
|
||||||
|
stage_file=$(ls -1 "$WORK_DIR"/upload_stage | head -n 1)
|
||||||
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
||||||
--uri "$MONGODB_URI" \
|
--uri "$MONGODB_URI" \
|
||||||
--database "$MONGODB_DATABASE_NAME" \
|
--database "$MONGODB_DATABASE_NAME" \
|
||||||
--collection "$DESTINATION_MONGO_COLLECTION" \
|
--collection "$DESTINATION_MONGO_COLLECTION" \
|
||||||
check-vector \
|
check-vector \
|
||||||
--output-json "$OUTPUT_ROOT"/structured-output/$OUTPUT_FOLDER_NAME/fake-memo.pdf.json
|
--output-json "$WORK_DIR"/upload_stage/"$stage_file"
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.14.10-dev5" # pragma: no cover
|
__version__ = "0.14.10-dev6" # pragma: no cover
|
||||||
|
@ -13,6 +13,7 @@ from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
|
|||||||
from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
|
from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
|
||||||
from .google_drive import google_drive_src_cmd
|
from .google_drive import google_drive_src_cmd
|
||||||
from .local import local_dest_cmd, local_src_cmd
|
from .local import local_dest_cmd, local_src_cmd
|
||||||
|
from .mongodb import mongodb_dest_cmd
|
||||||
from .onedrive import onedrive_drive_src_cmd
|
from .onedrive import onedrive_drive_src_cmd
|
||||||
from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
|
from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
|
||||||
from .pinecone import pinecone_dest_cmd
|
from .pinecone import pinecone_dest_cmd
|
||||||
@ -55,6 +56,7 @@ dest_cmds = [
|
|||||||
s3_dest_cmd,
|
s3_dest_cmd,
|
||||||
sftp_dest_cmd,
|
sftp_dest_cmd,
|
||||||
weaviate_dest_cmd,
|
weaviate_dest_cmd,
|
||||||
|
mongodb_dest_cmd,
|
||||||
]
|
]
|
||||||
|
|
||||||
duplicate_dest_names = [
|
duplicate_dest_names = [
|
||||||
|
62
unstructured/ingest/v2/cli/cmds/mongodb.py
Normal file
62
unstructured/ingest/v2/cli/cmds/mongodb.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from unstructured.ingest.v2.cli.base import DestCmd
|
||||||
|
from unstructured.ingest.v2.cli.interfaces import CliConfig
|
||||||
|
from unstructured.ingest.v2.processes.connectors.mongodb import CONNECTOR_TYPE
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBCliConnectionConfig(CliConfig):
|
||||||
|
@staticmethod
|
||||||
|
def get_cli_options() -> list[click.Option]:
|
||||||
|
options = [
|
||||||
|
click.Option(
|
||||||
|
["--uri"],
|
||||||
|
help="URI to user when connecting",
|
||||||
|
),
|
||||||
|
click.Option(
|
||||||
|
["--host"],
|
||||||
|
help="hostname or IP address or Unix domain socket path of a single mongod or "
|
||||||
|
"mongos instance to connect to, or a list of hostnames",
|
||||||
|
),
|
||||||
|
click.Option(["--port"], type=int, default=27017),
|
||||||
|
click.Option(
|
||||||
|
["--database"], type=str, required=True, help="database name to connect to"
|
||||||
|
),
|
||||||
|
click.Option(
|
||||||
|
["--collection"], required=True, type=str, help="collection name to connect to"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
return options
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBCliUploaderConfig(CliConfig):
|
||||||
|
@staticmethod
|
||||||
|
def get_cli_options() -> list[click.Option]:
|
||||||
|
options = [
|
||||||
|
click.Option(
|
||||||
|
["--batch-size"],
|
||||||
|
default=100,
|
||||||
|
type=int,
|
||||||
|
help="Number of records per batch",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
return options
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBCliUploadStagerConfig(CliConfig):
|
||||||
|
@staticmethod
|
||||||
|
def get_cli_options() -> list[click.Option]:
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
mongodb_dest_cmd = DestCmd(
|
||||||
|
cmd_name=CONNECTOR_TYPE,
|
||||||
|
connection_config=MongoDBCliConnectionConfig,
|
||||||
|
uploader_config=MongoDBCliUploaderConfig,
|
||||||
|
upload_stager_config=MongoDBCliUploadStagerConfig,
|
||||||
|
)
|
52
unstructured/ingest/v2/examples/example_mongodb.py
Normal file
52
unstructured/ingest/v2/examples/example_mongodb.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from unstructured.ingest.v2.interfaces import ProcessorConfig
|
||||||
|
from unstructured.ingest.v2.logger import logger
|
||||||
|
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
|
||||||
|
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
|
||||||
|
from unstructured.ingest.v2.processes.connectors.local import (
|
||||||
|
LocalConnectionConfig,
|
||||||
|
LocalDownloaderConfig,
|
||||||
|
LocalIndexerConfig,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.processes.connectors.mongodb import (
|
||||||
|
MongoDBAccessConfig,
|
||||||
|
MongoDBConnectionConfig,
|
||||||
|
MongoDBUploaderConfig,
|
||||||
|
MongoDBUploadStagerConfig,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
|
||||||
|
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
|
||||||
|
|
||||||
|
base_path = Path(__file__).parent.parent.parent.parent.parent
|
||||||
|
docs_path = base_path / "example-docs"
|
||||||
|
work_dir = base_path / "tmp_ingest"
|
||||||
|
output_path = work_dir / "output"
|
||||||
|
download_path = work_dir / "download"
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
logger.info(f"Writing all content in: {work_dir.resolve()}")
|
||||||
|
Pipeline.from_configs(
|
||||||
|
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
|
||||||
|
indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
|
||||||
|
downloader_config=LocalDownloaderConfig(download_dir=download_path),
|
||||||
|
source_connection_config=LocalConnectionConfig(),
|
||||||
|
partitioner_config=PartitionerConfig(strategy="fast"),
|
||||||
|
chunker_config=ChunkerConfig(
|
||||||
|
chunking_strategy="by_title",
|
||||||
|
chunk_include_orig_elements=False,
|
||||||
|
chunk_max_characters=1500,
|
||||||
|
chunk_multipage_sections=True,
|
||||||
|
),
|
||||||
|
embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
|
||||||
|
destination_connection_config=MongoDBConnectionConfig(
|
||||||
|
access_config=MongoDBAccessConfig(uri=None),
|
||||||
|
host="localhost",
|
||||||
|
port=27017,
|
||||||
|
collection=f"test-collection-{random.randint(1000,9999)}",
|
||||||
|
database="testDatabase",
|
||||||
|
),
|
||||||
|
stager_config=MongoDBUploadStagerConfig(),
|
||||||
|
uploader_config=MongoDBUploaderConfig(batch_size=10),
|
||||||
|
).run()
|
141
unstructured/ingest/v2/processes/connectors/mongodb.py
Normal file
141
unstructured/ingest/v2/processes/connectors/mongodb.py
Normal file
@ -0,0 +1,141 @@
|
|||||||
|
import json
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING, Any, Optional
|
||||||
|
|
||||||
|
from unstructured.__version__ import __version__ as unstructured_version
|
||||||
|
from unstructured.ingest.enhanced_dataclass import enhanced_field
|
||||||
|
from unstructured.ingest.utils.data_prep import chunk_generator
|
||||||
|
from unstructured.ingest.v2.interfaces import (
|
||||||
|
AccessConfig,
|
||||||
|
ConnectionConfig,
|
||||||
|
FileData,
|
||||||
|
UploadContent,
|
||||||
|
Uploader,
|
||||||
|
UploaderConfig,
|
||||||
|
UploadStager,
|
||||||
|
UploadStagerConfig,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.logger import logger
|
||||||
|
from unstructured.ingest.v2.processes.connector_registry import (
|
||||||
|
DestinationRegistryEntry,
|
||||||
|
add_destination_entry,
|
||||||
|
)
|
||||||
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pymongo import MongoClient
|
||||||
|
|
||||||
|
CONNECTOR_TYPE = "mongodb"
|
||||||
|
SERVER_API_VERSION = "1"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBAccessConfig(AccessConfig):
|
||||||
|
uri: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBConnectionConfig(ConnectionConfig):
|
||||||
|
access_config: MongoDBAccessConfig = enhanced_field(
|
||||||
|
sensitive=True, default_factory=MongoDBAccessConfig
|
||||||
|
)
|
||||||
|
host: Optional[str] = None
|
||||||
|
database: Optional[str] = None
|
||||||
|
collection: Optional[str] = None
|
||||||
|
port: int = 27017
|
||||||
|
batch_size: int = 100
|
||||||
|
connector_type: str = CONNECTOR_TYPE
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBUploadStagerConfig(UploadStagerConfig):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBUploadStager(UploadStager):
|
||||||
|
upload_stager_config: MongoDBUploadStagerConfig = field(
|
||||||
|
default_factory=lambda: MongoDBUploadStagerConfig()
|
||||||
|
)
|
||||||
|
|
||||||
|
def run(
|
||||||
|
self,
|
||||||
|
elements_filepath: Path,
|
||||||
|
file_data: FileData,
|
||||||
|
output_dir: Path,
|
||||||
|
output_filename: str,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> Path:
|
||||||
|
with open(elements_filepath) as elements_file:
|
||||||
|
elements_contents = json.load(elements_file)
|
||||||
|
|
||||||
|
output_path = Path(output_dir) / Path(f"{output_filename}.json")
|
||||||
|
with open(output_path, "w") as output_file:
|
||||||
|
json.dump(elements_contents, output_file)
|
||||||
|
return output_path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBUploaderConfig(UploaderConfig):
|
||||||
|
batch_size: int = 100
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MongoDBUploader(Uploader):
|
||||||
|
upload_config: MongoDBUploaderConfig
|
||||||
|
connection_config: MongoDBConnectionConfig
|
||||||
|
client: Optional["MongoClient"] = field(init=False)
|
||||||
|
connector_type: str = CONNECTOR_TYPE
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
self.client = self.create_client()
|
||||||
|
|
||||||
|
@requires_dependencies(["pymongo"], extras="mongodb")
|
||||||
|
def create_client(self) -> "MongoClient":
|
||||||
|
from pymongo import MongoClient
|
||||||
|
from pymongo.driver_info import DriverInfo
|
||||||
|
from pymongo.server_api import ServerApi
|
||||||
|
|
||||||
|
if self.connection_config.access_config.uri:
|
||||||
|
return MongoClient(
|
||||||
|
self.connection_config.access_config.uri,
|
||||||
|
server_api=ServerApi(version=SERVER_API_VERSION),
|
||||||
|
driver=DriverInfo(name="unstructured", version=unstructured_version),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return MongoClient(
|
||||||
|
host=self.connection_config.host,
|
||||||
|
port=self.connection_config.port,
|
||||||
|
server_api=ServerApi(version=SERVER_API_VERSION),
|
||||||
|
)
|
||||||
|
|
||||||
|
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
||||||
|
elements_dict = []
|
||||||
|
for content in contents:
|
||||||
|
with open(content.path) as elements_file:
|
||||||
|
elements = json.load(elements_file)
|
||||||
|
elements_dict.extend(elements)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"writing {len(elements_dict)} objects to destination "
|
||||||
|
f"db, {self.connection_config.database}, "
|
||||||
|
f"collection {self.connection_config.collection} "
|
||||||
|
f"at {self.connection_config.host}",
|
||||||
|
)
|
||||||
|
db = self.client[self.connection_config.database]
|
||||||
|
collection = db[self.connection_config.collection]
|
||||||
|
for chunk in chunk_generator(elements_dict, self.upload_config.batch_size):
|
||||||
|
collection.insert_many(chunk)
|
||||||
|
|
||||||
|
|
||||||
|
add_destination_entry(
|
||||||
|
destination_type=CONNECTOR_TYPE,
|
||||||
|
entry=DestinationRegistryEntry(
|
||||||
|
connection_config=MongoDBConnectionConfig,
|
||||||
|
uploader=MongoDBUploader,
|
||||||
|
uploader_config=MongoDBUploaderConfig,
|
||||||
|
upload_stager=MongoDBUploadStager,
|
||||||
|
upload_stager_config=MongoDBUploadStagerConfig,
|
||||||
|
),
|
||||||
|
)
|
Loading…
x
Reference in New Issue
Block a user