feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters (#3014)

This PR introduces GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR
controlling where temporary files are stored during partition flow, via
tempfile.tempdir.

#### Edit:
Renamed prefixes from STORAGE_ to UNSTRUCTURED_CACHE_

#### Edit 2:
Renamed prefixes from UNSTRUCTURED_CACHE to GLOBAL_WORKING_DIR_
This commit is contained in:
amadeusz-ds 2024-05-17 21:16:10 +02:00 committed by GitHub
parent ec987dcbb2
commit 1c8b2b23eb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 122 additions and 7 deletions

View File

@ -1,4 +1,4 @@
## 0.14.0-dev14 ## 0.14.0-dev15
### BREAKING CHANGES ### BREAKING CHANGES
@ -9,6 +9,7 @@
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted. * **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
* **Faster evaluation** Support for concurrent processing of documents during evaluation * **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.
### Features ### Features
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`. * **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.

View File

@ -143,7 +143,9 @@ def test_save_elements(
assert not el.metadata.image_mime_type assert not el.metadata.image_mime_type
def test_save_elements_with_output_dir_path_none(): @pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
with ( with (
patch("PIL.Image.open"), patch("PIL.Image.open"),
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"), patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
@ -161,6 +163,11 @@ def test_save_elements_with_output_dir_path_none():
) )
# Verify that the images are saved in the expected directory # Verify that the images are saved in the expected directory
if storage_enabled:
from unstructured.partition.utils.config import env_config
expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
else:
expected_output_dir = os.path.join(tmpdir, "figures") expected_output_dir = os.path.join(tmpdir, "figures")
assert os.path.exists(expected_output_dir) assert os.path.exists(expected_output_dir)
assert os.path.isdir(expected_output_dir) assert os.path.isdir(expected_output_dir)

View File

@ -1,3 +1,10 @@
import shutil
import tempfile
from pathlib import Path
import pytest
def test_default_config(): def test_default_config():
from unstructured.partition.utils.config import env_config from unstructured.partition.utils.config import env_config
@ -9,3 +16,43 @@ def test_env_override(monkeypatch):
from unstructured.partition.utils.config import env_config from unstructured.partition.utils.config import env_config
assert env_config.IMAGE_CROP_PAD == 1 assert env_config.IMAGE_CROP_PAD == 1
@pytest.fixture()
def _setup_tmpdir():
    """Isolate a test from any pre-existing working process directory.

    Before the test: remember the current `tempfile.tempdir`, move an existing
    `GLOBAL_WORKING_PROCESS_DIR` aside to a `<dir>_bak` sibling, and reset
    `tempfile.tempdir` to None so `tempfile` re-resolves its default.

    After the test: if a backup was made, drop whatever the test created at the
    original location and move the backup back; then restore `tempfile.tempdir`.
    """
    from unstructured.partition.utils.config import env_config

    # Captured before reading the config property on the next line.
    original_tempdir = tempfile.tempdir
    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    backup_dir = f"{process_dir}_bak"

    if Path(process_dir).is_dir():
        shutil.move(process_dir, backup_dir)
    tempfile.tempdir = None

    yield

    backup_exists = Path(backup_dir).is_dir()
    if backup_exists and Path(process_dir).is_dir():
        # The test re-created the directory; discard it in favour of the backup.
        shutil.rmtree(process_dir)
    if backup_exists:
        shutil.move(backup_dir, process_dir)
    tempfile.tempdir = original_tempdir
@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_disabled(monkeypatch):
    """With the feature off, the process dir is not created and tempfile is left alone."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false")
    from unstructured.partition.utils.config import env_config

    expected_working_dir = str(Path.home() / ".cache/unstructured")

    assert not env_config.GLOBAL_WORKING_DIR_ENABLED
    assert env_config.GLOBAL_WORKING_DIR == expected_working_dir

    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    assert not Path(process_dir).is_dir()
    assert tempfile.gettempdir() != process_dir
@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_enabled(monkeypatch):
    """With the feature on, reading the process dir creates it and redirects tempfile to it."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true")
    from unstructured.partition.utils.config import env_config

    expected_working_dir = str(Path.home() / ".cache/unstructured")

    assert env_config.GLOBAL_WORKING_DIR_ENABLED
    assert env_config.GLOBAL_WORKING_DIR == expected_working_dir

    # Reading the property is what creates the directory and sets tempfile.tempdir.
    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    assert Path(process_dir).is_dir()
    assert tempfile.gettempdir() == process_dir

View File

@ -0,0 +1,4 @@
from .partition.utils.config import env_config

# Importing the config module instantiates `ENVConfig` at module level; its
# `__post_init__` sets up the working temp directory when
# GLOBAL_WORKING_DIR_ENABLED is set. The bare reference below is a no-op
# expression -- presumably kept so the import is not reported as unused.
env_config

View File

@ -1 +1 @@
__version__ = "0.14.0-dev14" # pragma: no cover __version__ = "0.14.0-dev15" # pragma: no cover

View File

@ -160,7 +160,6 @@ class BaseMetricsCalculator(ABC):
@abstractmethod @abstractmethod
def _process_document(self, doc: Path) -> list: def _process_document(self, doc: Path) -> list:
"""Should return all metadata and metrics for a single document.""" """Should return all metadata and metrics for a single document."""
pass
@dataclass @dataclass

View File

@ -6,6 +6,7 @@ import io
import os import os
import re import re
import warnings import warnings
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast
import numpy as np import numpy as np
@ -438,6 +439,14 @@ def _partition_pdf_or_image_local(
) )
if analysis: if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
annotate_layout_elements( annotate_layout_elements(
inferred_document_layout=inferred_document_layout, inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout, extracted_layout=extracted_layout,

View File

@ -4,7 +4,7 @@ import re
import tempfile import tempfile
from copy import deepcopy from copy import deepcopy
from io import BytesIO from io import BytesIO
from pathlib import PurePath from pathlib import Path, PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
import cv2 import cv2
@ -131,7 +131,10 @@ def save_elements(
""" """
if not output_dir_path: if not output_dir_path:
output_dir_path = os.path.join(os.getcwd(), "figures") if env_config.GLOBAL_WORKING_DIR_ENABLED:
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
else:
output_dir_path = str(Path.cwd() / "figures")
os.makedirs(output_dir_path, exist_ok=True) os.makedirs(output_dir_path, exist_ok=True)
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:

View File

@ -7,15 +7,28 @@ in bytes). Constants should go into `./constants.py`
""" """
import os import os
import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT
@lru_cache(maxsize=1)
def get_tempdir(dir: str) -> str:
    """Return the default per-process-group temp directory path under `dir`.

    The result is `<dir>/tmp/<id>`, where `<id>` is the calling process's
    process-group id. The path is only computed here; nothing is created on
    disk.

    Args:
        dir: Base working directory. The name shadows the builtin but is kept
            because callers pass it by keyword.

    Returns:
        The temp directory path as a string.
    """
    # `os.getpgid` is only available on Unix; fall back to the process id on
    # platforms that lack it instead of raising AttributeError.
    get_pgid = getattr(os, "getpgid", None)
    group_id = get_pgid(0) if get_pgid is not None else os.getpid()
    return str(Path(dir) / "tmp" / str(group_id))
@dataclass @dataclass
class ENVConfig: class ENVConfig:
"""class for configuring enviorment parameters""" """class for configuring enviorment parameters"""
def __post_init__(self):
if self.GLOBAL_WORKING_DIR_ENABLED:
self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR)
def _get_string(self, var: str, default_value: str = "") -> str: def _get_string(self, var: str, default_value: str = "") -> str:
"""attempt to get the value of var from the os environment; if not present return the """attempt to get the value of var from the os environment; if not present return the
default_value""" default_value"""
@ -31,6 +44,15 @@ class ENVConfig:
return float(value) return float(value)
return default_value return default_value
def _get_bool(self, var: str, default_value: bool) -> bool:
if value := self._get_string(var):
return value.lower() in ("true", "1", "t")
return default_value
def _setup_tmpdir(self, tmpdir: str) -> None:
Path(tmpdir).mkdir(parents=True, exist_ok=True)
tempfile.tempdir = tmpdir
@property @property
def IMAGE_CROP_PAD(self) -> int: def IMAGE_CROP_PAD(self) -> int:
"""extra image content to add around an identified element region; measured in pixels""" """extra image content to add around an identified element region; measured in pixels"""
@ -117,5 +139,28 @@ class ENVConfig:
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9) return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
@property
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:
    """Whether GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR are in use (default: off)."""
    enabled = self._get_bool("GLOBAL_WORKING_DIR_ENABLED", False)
    return enabled
@property
def GLOBAL_WORKING_DIR(self) -> str:
    """Unstructured cache directory; defaults to `~/.cache/unstructured`."""
    default_dir = Path.home() / ".cache/unstructured"
    return self._get_string("GLOBAL_WORKING_DIR", str(default_dir))
@property
def GLOBAL_WORKING_PROCESS_DIR(self) -> str:
    """Path to Unstructured cache tempdir. Overrides TMPDIR, TEMP and TMP.

    Defaults to '{GLOBAL_WORKING_DIR}/tmp/{os.getpgid(0)}'. When
    GLOBAL_WORKING_DIR_ENABLED is set, reading this property also creates the
    directory and points `tempfile.tempdir` at it.
    """
    fallback = get_tempdir(dir=self.GLOBAL_WORKING_DIR)
    # `or` also covers the variable being set to an empty string.
    process_dir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", fallback) or fallback
    if self.GLOBAL_WORKING_DIR_ENABLED:
        self._setup_tmpdir(process_dir)
    return process_dir
env_config = ENVConfig() env_config = ENVConfig()