Roman Isecke db8af4f5de
Roman/notion tests (#1072)
### Description
* Add ingest test for Notion docs
* Update default cache dir for connectors to include connector name.
Makes debugging the cached content easier.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
2023-08-21 15:16:50 -04:00

47 lines
1.4 KiB
Python

from __future__ import annotations
import hashlib
import logging
from pathlib import Path
from unstructured.ingest.interfaces import (
StandardConnectorConfig,
)
def update_download_dir_remote_url(
    connector_name: str,
    connector_config: StandardConnectorConfig,
    remote_url: str,
    logger: logging.Logger,
) -> str:
    """Resolve a connector's download directory, keyed by its remote URL.

    The URL is UTF-8 encoded and fed into a SHA-256 hash object; that hash
    object (not its digest string) is handed to ``update_download_dir_hash``
    together with the remaining arguments, and that function's result is
    returned unchanged.

    Args:
        connector_name: Name of the connector, forwarded as-is.
        connector_config: Connector configuration, forwarded as-is.
        remote_url: URL whose hash identifies this source.
        logger: Logger forwarded to ``update_download_dir_hash``.

    Returns:
        Whatever ``update_download_dir_hash`` returns for these inputs.
    """
    encoded_url = remote_url.encode("utf-8")
    url_hash = hashlib.sha256()
    url_hash.update(encoded_url)
    return update_download_dir_hash(
        connector_name=connector_name,
        connector_config=connector_config,
        hashed_dir_name=url_hash,
        logger=logger,
    )
def update_download_dir_hash(
    connector_name: str,
    connector_config: StandardConnectorConfig,
    hashed_dir_name: hashlib._Hash,
    logger: logging.Logger,
) -> str:
    """Return the download directory a connector should use.

    If ``connector_config.download_dir`` is set (truthy) it is returned
    untouched and nothing is logged. Otherwise a default location is derived:

        ~/.cache/unstructured/ingest/<connector_name>/<first 10 hex chars of hash>

    Only the shared cache root (``~/.cache/unstructured/ingest``) is created
    here; the per-connector subdirectory itself is not created by this
    function.

    Args:
        connector_name: Used as a path component under the cache root.
        connector_config: Object exposing ``download_dir`` and
            ``preserve_downloads``.
        hashed_dir_name: Hash object; the first 10 characters of its
            ``hexdigest()`` become the final path component.
        logger: Receives a warning when ``preserve_downloads`` is set without
            an explicit ``download_dir``, and a debug message with the
            derived directory.

    Returns:
        The configured download directory, or the derived default as a string.
    """
    # Guard clause: an explicitly configured directory always wins.
    if connector_config.download_dir:
        return connector_config.download_dir

    cache_root = Path.home() / ".cache" / "unstructured" / "ingest"
    # exist_ok=True already makes this idempotent, so a separate exists()
    # pre-check would only add a check-then-act race.
    cache_root.mkdir(parents=True, exist_ok=True)

    download_dir = cache_root / connector_name / hashed_dir_name.hexdigest()[:10]
    if connector_config.preserve_downloads:
        # The user asked to keep the files but did not say where; tell them
        # where the files will end up.
        logger.warning(
            "Preserving downloaded files but download_dir is not specified,"
            " using %s",
            download_dir,
        )

    new_download_dir = str(download_dir)
    logger.debug("updating download directory to: %s", new_download_dir)
    return new_download_dir