feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters (#3014)

This PR introduces GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR
controlling where temporary files are stored during partition flow, via
tempfile.tempdir.

#### Edit:
Renamed prefixes from STORAGE_ to UNSTRUCTURED_CACHE_

#### Edit 2:
Renamed prefixes from UNSTRUCTURED_CACHE to GLOBAL_WORKING_DIR_
This commit is contained in:
amadeusz-ds 2024-05-17 21:16:10 +02:00 committed by GitHub
parent ec987dcbb2
commit 1c8b2b23eb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 122 additions and 7 deletions

View File

@ -1,4 +1,4 @@
## 0.14.0-dev14
## 0.14.0-dev15
### BREAKING CHANGES
@ -9,6 +9,7 @@
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
* **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.
### Features
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the groundwork for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.

View File

@ -143,7 +143,9 @@ def test_save_elements(
assert not el.metadata.image_mime_type
def test_save_elements_with_output_dir_path_none():
@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
with (
patch("PIL.Image.open"),
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
@ -161,7 +163,12 @@ def test_save_elements_with_output_dir_path_none():
)
# Verify that the images are saved in the expected directory
expected_output_dir = os.path.join(tmpdir, "figures")
if storage_enabled:
from unstructured.partition.utils.config import env_config
expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
else:
expected_output_dir = os.path.join(tmpdir, "figures")
assert os.path.exists(expected_output_dir)
assert os.path.isdir(expected_output_dir)
os.chdir(original_cwd)

View File

@ -1,3 +1,10 @@
import shutil
import tempfile
from pathlib import Path
import pytest
def test_default_config():
from unstructured.partition.utils.config import env_config
@ -9,3 +16,43 @@ def test_env_override(monkeypatch):
from unstructured.partition.utils.config import env_config
assert env_config.IMAGE_CROP_PAD == 1
@pytest.fixture()
def _setup_tmpdir():
    """Isolate a test from any pre-existing working process directory.

    Setup: remember the current `tempfile.tempdir`, move an existing
    GLOBAL_WORKING_PROCESS_DIR aside to a sibling `<dir>_bak`, and reset
    `tempfile.tempdir` to None so `tempfile` re-resolves its default.

    Teardown: if a backup was made, drop whatever the test created at the original
    location and move the backup back; then restore the saved `tempfile.tempdir`.
    """
    from unstructured.partition.utils.config import env_config

    saved_tempdir = tempfile.tempdir
    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    backup_dir = f"{process_dir}_bak"

    # Park the real directory out of the way so the test starts from a clean slate.
    if Path(process_dir).is_dir():
        shutil.move(process_dir, backup_dir)

    tempfile.tempdir = None

    yield

    # Put the parked directory back, replacing anything the test left behind.
    if Path(backup_dir).is_dir():
        if Path(process_dir).is_dir():
            shutil.rmtree(process_dir)
        shutil.move(backup_dir, process_dir)

    tempfile.tempdir = saved_tempdir
@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_disabled(monkeypatch):
    """With the feature switched off, no process dir is created and `tempfile` is untouched."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false")
    from unstructured.partition.utils.config import env_config

    expected_working_dir = str(Path.home() / ".cache/unstructured")

    assert not env_config.GLOBAL_WORKING_DIR_ENABLED
    assert expected_working_dir == env_config.GLOBAL_WORKING_DIR
    # The path is still computable while disabled, but nothing must exist on disk.
    process_dir = Path(env_config.GLOBAL_WORKING_PROCESS_DIR)
    assert not process_dir.is_dir()
    assert tempfile.gettempdir() != env_config.GLOBAL_WORKING_PROCESS_DIR
@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_enabled(monkeypatch):
    """With the feature switched on, the process dir exists and backs `tempfile`."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true")
    from unstructured.partition.utils.config import env_config

    expected_working_dir = str(Path.home() / ".cache/unstructured")

    assert env_config.GLOBAL_WORKING_DIR_ENABLED
    assert expected_working_dir == env_config.GLOBAL_WORKING_DIR
    # Reading GLOBAL_WORKING_PROCESS_DIR while enabled creates the directory and points
    # tempfile.tempdir at it, so this access has to happen before the gettempdir() check.
    process_dir = Path(env_config.GLOBAL_WORKING_PROCESS_DIR)
    assert process_dir.is_dir()
    assert tempfile.gettempdir() == env_config.GLOBAL_WORKING_PROCESS_DIR

View File

@ -0,0 +1,4 @@
from .partition.utils.config import env_config
# Importing `env_config` instantiates `ENVConfig` at package import time; its
# `__post_init__` redirects `tempfile.tempdir` to GLOBAL_WORKING_PROCESS_DIR when
# GLOBAL_WORKING_DIR_ENABLED is set. The bare expression below only references the name
# (presumably so the import is not reported as unused -- TODO confirm).
env_config

View File

@ -1 +1 @@
__version__ = "0.14.0-dev14" # pragma: no cover
__version__ = "0.14.0-dev15" # pragma: no cover

View File

@ -160,7 +160,6 @@ class BaseMetricsCalculator(ABC):
@abstractmethod
def _process_document(self, doc: Path) -> list:
"""Should return all metadata and metrics for a single document."""
pass
@dataclass

View File

@ -6,6 +6,7 @@ import io
import os
import re
import warnings
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast
import numpy as np
@ -438,6 +439,14 @@ def _partition_pdf_or_image_local(
)
if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,

View File

@ -4,7 +4,7 @@ import re
import tempfile
from copy import deepcopy
from io import BytesIO
from pathlib import PurePath
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
import cv2
@ -131,7 +131,10 @@ def save_elements(
"""
if not output_dir_path:
output_dir_path = os.path.join(os.getcwd(), "figures")
if env_config.GLOBAL_WORKING_DIR_ENABLED:
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
else:
output_dir_path = str(Path.cwd() / "figures")
os.makedirs(output_dir_path, exist_ok=True)
with tempfile.TemporaryDirectory() as temp_dir:

View File

@ -7,15 +7,28 @@ in bytes). Constants should go into `./constants.py`
"""
import os
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT
@lru_cache(maxsize=1)
def get_tempdir(dir: str) -> str:
    """Return the default working temp-directory path located under `dir`.

    The path has the form ``{dir}/tmp/{id}``, where ``id`` is the process-group id of the
    current process. ``os.getpgid`` is only available on Unix, so on platforms that lack
    it (e.g. Windows) the plain process id is used instead of failing with
    ``AttributeError``.

    Only the path string is computed here; the directory itself is not created. The
    result is memoized for the most recently requested `dir`.

    Args:
        dir: Base directory under which the ``tmp/{id}`` path is built.

    Returns:
        The path as a string.
    """
    # Look the function up dynamically: referencing `os.getpgid` directly raises on
    # platforms where the attribute does not exist.
    getpgid = getattr(os, "getpgid", None)
    group_id = getpgid(0) if getpgid is not None else os.getpid()
    return str(Path(dir) / "tmp" / str(group_id))
@dataclass
class ENVConfig:
"""class for configuring enviorment parameters"""
def __post_init__(self):
    """Point `tempfile` at the working process directory right after construction.

    Nothing happens unless GLOBAL_WORKING_DIR_ENABLED evaluates truthy.
    """
    if not self.GLOBAL_WORKING_DIR_ENABLED:
        return
    self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR)
def _get_string(self, var: str, default_value: str = "") -> str:
"""attempt to get the value of var from the os environment; if not present return the
default_value"""
@ -31,6 +44,15 @@ class ENVConfig:
return float(value)
return default_value
def _get_bool(self, var: str, default_value: bool) -> bool:
    """Read `var` via `_get_string` and interpret it as a boolean flag.

    An empty/missing value yields `default_value`. Otherwise the value is true only when,
    compared case-insensitively, it is one of "true", "1" or "t"; anything else is false.
    """
    raw = self._get_string(var)
    if not raw:
        return default_value
    return raw.lower() in ("true", "1", "t")
def _setup_tmpdir(self, tmpdir: str) -> None:
    """Create `tmpdir` (with any missing parents) and make it the `tempfile` default.

    An already existing directory is accepted. Assigning `tempfile.tempdir` is a
    module-level change, so it applies to the whole process, not just this object.
    """
    target = Path(tmpdir)
    target.mkdir(parents=True, exist_ok=True)
    tempfile.tempdir = tmpdir
@property
def IMAGE_CROP_PAD(self) -> int:
"""extra image content to add around an identified element region; measured in pixels"""
@ -117,5 +139,28 @@ class ENVConfig:
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
@property
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:
    """Whether GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR are put to use.

    Read through `_get_bool` from the variable of the same name; off by default.
    """
    enabled = self._get_bool("GLOBAL_WORKING_DIR_ENABLED", default_value=False)
    return enabled
@property
def GLOBAL_WORKING_DIR(self) -> str:
    """Base directory for Unstructured's working files.

    Read through `_get_string` from the variable of the same name; falls back to
    `.cache/unstructured` inside the current user's home directory.
    """
    fallback = Path.home() / ".cache/unstructured"
    return self._get_string("GLOBAL_WORKING_DIR", str(fallback))
@property
def GLOBAL_WORKING_PROCESS_DIR(self) -> str:
    """Temp directory used for the `tempfile` default while the feature is enabled.

    Taken from the GLOBAL_WORKING_PROCESS_DIR variable; when that is unset or empty the
    value of `get_tempdir(dir=GLOBAL_WORKING_DIR)` is used, i.e.
    '{GLOBAL_WORKING_DIR}/tmp/{os.getpgid(0)}'.

    Side effect: when GLOBAL_WORKING_DIR_ENABLED is true, every read of this property
    creates the directory if needed and assigns it to `tempfile.tempdir`, which takes
    precedence over TMPDIR, TEMP and TMP.
    """
    fallback = get_tempdir(dir=self.GLOBAL_WORKING_DIR)
    process_dir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", fallback)
    # An explicitly empty override is treated the same as no override at all.
    if process_dir == "":
        process_dir = fallback

    if self.GLOBAL_WORKING_DIR_ENABLED:
        self._setup_tmpdir(process_dir)

    return process_dir
env_config = ENVConfig()