Roman Isecke db8af4f5de
Roman/notion tests (#1072)
### Description
* Add ingest test for Notion docs
* Update default cache dir for connectors to include connector name.
Makes debugging the cached content easier.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
2023-08-21 15:16:50 -04:00

39 lines
1.2 KiB
Python

import logging
from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig
from unstructured.ingest.logger import ingest_log_streaming_init, logger
from unstructured.ingest.processor import process_documents
from unstructured.ingest.runner.utils import update_download_dir_remote_url
def s3(
    verbose: bool,
    connector_config: StandardConnectorConfig,
    processor_config: ProcessorConfigs,
    remote_url: str,
    recursive: bool,
    anonymous: bool,
    **kwargs,
):
    """Run document processing against an S3 location.

    Args:
        verbose: When true, ingest log streaming is initialised at DEBUG
            level; otherwise at INFO.
        connector_config: Standard connector settings. Its ``download_dir``
            attribute is overwritten in place with the value returned by
            ``update_download_dir_remote_url``.
        processor_config: Settings forwarded unchanged to
            ``process_documents``.
        remote_url: Passed as ``path`` to ``SimpleS3Config`` and also handed
            to ``update_download_dir_remote_url``.
        recursive: Passed as ``recursive`` to ``SimpleS3Config``.
        anonymous: Forwarded as ``access_kwargs={"anon": anonymous}``.
        **kwargs: Accepted but not used by this function.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    ingest_log_streaming_init(log_level)

    # Replace the download directory with the one derived for this
    # connector name and remote URL.
    connector_config.download_dir = update_download_dir_remote_url(
        connector_name="s3",
        connector_config=connector_config,
        remote_url=remote_url,
        logger=logger,
    )

    # NOTE(review): imported at call time rather than module level —
    # presumably so the S3 connector's dependencies are only needed when
    # this runner is actually invoked; confirm before hoisting.
    from unstructured.ingest.connector.s3 import S3Connector, SimpleS3Config

    s3_settings = SimpleS3Config(
        path=remote_url,
        recursive=recursive,
        access_kwargs={"anon": anonymous},
    )
    doc_connector = S3Connector(  # type: ignore
        standard_config=connector_config,
        config=s3_settings,
    )

    process_documents(doc_connector=doc_connector, processor_config=processor_config)