mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-17 02:03:34 +00:00
### Description * Add ingest test for Notion docs * Update default cache dir for connectors to include connector name. Makes debugging the cached content easier. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
import logging
|
|
|
|
from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig
|
|
from unstructured.ingest.logger import ingest_log_streaming_init, logger
|
|
from unstructured.ingest.processor import process_documents
|
|
from unstructured.ingest.runner.utils import update_download_dir_remote_url
|
|
|
|
|
|
def s3(
    verbose: bool,
    connector_config: StandardConnectorConfig,
    processor_config: ProcessorConfigs,
    remote_url: str,
    recursive: bool,
    anonymous: bool,
    **kwargs,
):
    """Run the ingest process against an S3 location.

    Steps, in order:
      1. Initialise ingest log streaming at DEBUG when ``verbose`` is true,
         otherwise at INFO.
      2. Overwrite ``connector_config.download_dir`` with the value returned by
         ``update_download_dir_remote_url`` for the ``"s3"`` connector name.
      3. Build an ``S3Connector`` around a ``SimpleS3Config`` and pass it to
         ``process_documents``.

    Args:
        verbose: Selects DEBUG (true) or INFO (false) log level.
        connector_config: Standard connector settings; its ``download_dir``
            attribute is reassigned in place by this function.
        processor_config: Forwarded unchanged to ``process_documents``.
        remote_url: Used both as the ``path`` of the S3 config and as input to
            the download-dir helper.
        recursive: Forwarded as ``SimpleS3Config.recursive``.
        anonymous: Forwarded as ``access_kwargs={"anon": anonymous}``.
        **kwargs: Accepted but not used by this function.
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    ingest_log_streaming_init(log_level)

    resolved_download_dir = update_download_dir_remote_url(
        connector_name="s3",
        connector_config=connector_config,
        remote_url=remote_url,
        logger=logger,
    )
    connector_config.download_dir = resolved_download_dir

    # Imported at call time rather than module import time.
    # NOTE(review): presumably so the S3-specific dependencies are only needed
    # when this runner is actually invoked — confirm.
    from unstructured.ingest.connector.s3 import S3Connector, SimpleS3Config

    s3_config = SimpleS3Config(
        path=remote_url,
        recursive=recursive,
        access_kwargs={"anon": anonymous},
    )
    doc_connector = S3Connector(  # type: ignore
        standard_config=connector_config,
        config=s3_config,
    )

    process_documents(doc_connector=doc_connector, processor_config=processor_config)
|