mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-03 03:23:25 +00:00
feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters (#3014)
This PR introduces GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR controlling where temporary files are stored during partition flow, via tempfile.tempdir. #### Edit: Renamed prefixes from STORAGE_ to UNSTRUCTURED_CACHE_ #### Edit 2: Renamed prefixes from UNSTRUCTURED_CACHE to GLOBAL_WORKING_DIR_
This commit is contained in:
parent
ec987dcbb2
commit
1c8b2b23eb
@ -1,4 +1,4 @@
|
||||
## 0.14.0-dev14
|
||||
## 0.14.0-dev15
|
||||
|
||||
### BREAKING CHANGES
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
|
||||
* **Faster evaluation** Support for concurrent processing of documents during evaluation
|
||||
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
|
||||
* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.
|
||||
|
||||
### Features
|
||||
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the groundwork for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.
|
||||
|
||||
@ -143,7 +143,9 @@ def test_save_elements(
|
||||
assert not el.metadata.image_mime_type
|
||||
|
||||
|
||||
def test_save_elements_with_output_dir_path_none():
|
||||
@pytest.mark.parametrize("storage_enabled", [False, True])
|
||||
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
|
||||
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
|
||||
with (
|
||||
patch("PIL.Image.open"),
|
||||
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
|
||||
@ -161,7 +163,12 @@ def test_save_elements_with_output_dir_path_none():
|
||||
)
|
||||
|
||||
# Verify that the images are saved in the expected directory
|
||||
expected_output_dir = os.path.join(tmpdir, "figures")
|
||||
if storage_enabled:
|
||||
from unstructured.partition.utils.config import env_config
|
||||
|
||||
expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
|
||||
else:
|
||||
expected_output_dir = os.path.join(tmpdir, "figures")
|
||||
assert os.path.exists(expected_output_dir)
|
||||
assert os.path.isdir(expected_output_dir)
|
||||
os.chdir(original_cwd)
|
||||
|
||||
@ -1,3 +1,10 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_default_config():
|
||||
from unstructured.partition.utils.config import env_config
|
||||
|
||||
@ -9,3 +16,43 @@ def test_env_override(monkeypatch):
|
||||
from unstructured.partition.utils.config import env_config
|
||||
|
||||
assert env_config.IMAGE_CROP_PAD == 1
|
||||
|
||||
|
||||
@pytest.fixture()
def _setup_tmpdir():
    """Park any existing working-process directory and reset `tempfile.tempdir` for one test.

    Setup: remember the current `tempfile.tempdir`, move an existing
    `GLOBAL_WORKING_PROCESS_DIR` to a sibling `<dir>_bak`, then clear `tempfile.tempdir`.

    Teardown: when a `<dir>_bak` directory is present, delete whatever the test created at
    the original location and move the backup back; finally restore `tempfile.tempdir`.
    """
    from unstructured.partition.utils.config import env_config

    # Capture the tempdir before touching the config property below.
    original_tempdir = tempfile.tempdir
    process_dir = env_config.GLOBAL_WORKING_PROCESS_DIR
    backup_dir = f"{env_config.GLOBAL_WORKING_PROCESS_DIR}_bak"

    if Path(process_dir).is_dir():
        shutil.move(process_dir, backup_dir)
    tempfile.tempdir = None

    yield

    backup_present = Path(backup_dir).is_dir()
    if backup_present and Path(process_dir).is_dir():
        # The test re-created the directory; drop it so the backup can take its place.
        shutil.rmtree(process_dir)
    if backup_present:
        shutil.move(backup_dir, process_dir)
    tempfile.tempdir = original_tempdir
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_disabled(monkeypatch):
    """With the feature switched off, the process dir is neither created nor used as tempdir."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false")
    from unstructured.partition.utils.config import env_config

    expected_working_dir = str(Path.home() / ".cache/unstructured")

    assert not env_config.GLOBAL_WORKING_DIR_ENABLED
    assert env_config.GLOBAL_WORKING_DIR == expected_working_dir

    process_dir = Path(env_config.GLOBAL_WORKING_PROCESS_DIR)
    assert not process_dir.is_dir()

    active_tempdir = tempfile.gettempdir()
    assert active_tempdir != env_config.GLOBAL_WORKING_PROCESS_DIR
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("_setup_tmpdir")
def test_env_storage_enabled(monkeypatch):
    """With the feature switched on, the process dir exists and becomes the active tempdir."""
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true")
    from unstructured.partition.utils.config import env_config

    expected_working_dir = str(Path.home() / ".cache/unstructured")

    assert env_config.GLOBAL_WORKING_DIR_ENABLED
    assert env_config.GLOBAL_WORKING_DIR == expected_working_dir

    # Reading the property is what creates the directory and points tempfile at it.
    process_dir = Path(env_config.GLOBAL_WORKING_PROCESS_DIR)
    assert process_dir.is_dir()

    active_tempdir = tempfile.gettempdir()
    assert active_tempdir == env_config.GLOBAL_WORKING_PROCESS_DIR
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
from .partition.utils.config import env_config
|
||||
|
||||
# init env_config
|
||||
env_config
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.14.0-dev14" # pragma: no cover
|
||||
__version__ = "0.14.0-dev15" # pragma: no cover
|
||||
|
||||
@ -160,7 +160,6 @@ class BaseMetricsCalculator(ABC):
|
||||
@abstractmethod
|
||||
def _process_document(self, doc: Path) -> list:
|
||||
"""Should return all metadata and metrics for a single document."""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -6,6 +6,7 @@ import io
|
||||
import os
|
||||
import re
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast
|
||||
|
||||
import numpy as np
|
||||
@ -438,6 +439,14 @@ def _partition_pdf_or_image_local(
|
||||
)
|
||||
|
||||
if analysis:
|
||||
if not analyzed_image_output_dir_path:
|
||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||
analyzed_image_output_dir_path = str(
|
||||
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||
)
|
||||
else:
|
||||
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
|
||||
annotate_layout_elements(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
|
||||
@ -4,7 +4,7 @@ import re
|
||||
import tempfile
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import PurePath
|
||||
from pathlib import Path, PurePath
|
||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
|
||||
|
||||
import cv2
|
||||
@ -131,7 +131,10 @@ def save_elements(
|
||||
"""
|
||||
|
||||
if not output_dir_path:
|
||||
output_dir_path = os.path.join(os.getcwd(), "figures")
|
||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
|
||||
else:
|
||||
output_dir_path = str(Path.cwd() / "figures")
|
||||
os.makedirs(output_dir_path, exist_ok=True)
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
|
||||
@ -7,15 +7,28 @@ in bytes). Constants should go into `./constants.py`
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_tempdir(dir: str) -> str:
    """Return the temp-directory path for the current process group under `dir`.

    The path has the form ``{dir}/tmp/{id}``, where ``id`` is the process-group id of the
    calling process. ``os.getpgid`` is only available on Unix; on platforms that lack it the
    process id is used instead, so the function no longer raises ``AttributeError`` there.
    The directory itself is not created here.

    Args:
        dir: Base directory the ``tmp/{id}`` suffix is appended to. The name shadows the
            builtin but is kept because callers pass it by keyword.

    Returns:
        The path as a string. Only the most recent `dir` is memoized (``maxsize=1``).
    """
    get_group_id = getattr(os, "getpgid", None)
    process_key = get_group_id(0) if get_group_id is not None else os.getpid()
    return str(Path(dir) / "tmp" / str(process_key))
|
||||
|
||||
|
||||
@dataclass
|
||||
class ENVConfig:
|
||||
"""class for configuring enviorment parameters"""
|
||||
|
||||
def __post_init__(self):
    """Set up the working-process temp directory at construction time when enabled."""
    if not self.GLOBAL_WORKING_DIR_ENABLED:
        return
    self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR)
|
||||
|
||||
def _get_string(self, var: str, default_value: str = "") -> str:
|
||||
"""attempt to get the value of var from the os environment; if not present return the
|
||||
default_value"""
|
||||
@ -31,6 +44,15 @@ class ENVConfig:
|
||||
return float(value)
|
||||
return default_value
|
||||
|
||||
def _get_bool(self, var: str, default_value: bool) -> bool:
    """Read `var` via `_get_string` and interpret it as a boolean.

    A non-empty value is true only when, ignoring case and surrounding whitespace, it is one
    of ``"true"``, ``"1"`` or ``"t"``; any other non-empty value is false. An empty result
    from `_get_string` yields `default_value`.

    Surrounding whitespace is stripped before comparison so that values such as ``"true "``
    (common in .env files and shell exports) are not silently read as false.
    """
    if value := self._get_string(var):
        return value.strip().lower() in ("true", "1", "t")
    return default_value
|
||||
|
||||
def _setup_tmpdir(self, tmpdir: str) -> None:
    """Create `tmpdir` (including missing parents) and make it the `tempfile` default."""
    os.makedirs(tmpdir, exist_ok=True)
    # Assigning the module global redirects later tempfile.* calls to this directory.
    tempfile.tempdir = tmpdir
|
||||
|
||||
@property
|
||||
def IMAGE_CROP_PAD(self) -> int:
|
||||
"""extra image content to add around an identified element region; measured in pixels"""
|
||||
@ -117,5 +139,28 @@ class ENVConfig:
|
||||
|
||||
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
|
||||
|
||||
@property
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:
    """Whether GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR are used; off by default."""
    enabled = self._get_bool("GLOBAL_WORKING_DIR_ENABLED", False)
    return enabled
|
||||
|
||||
@property
def GLOBAL_WORKING_DIR(self) -> str:
    """Path to the Unstructured cache directory; defaults to `~/.cache/unstructured`."""
    default_dir = str(Path.home() / ".cache/unstructured")
    return self._get_string("GLOBAL_WORKING_DIR", default_dir)
|
||||
|
||||
@property
def GLOBAL_WORKING_PROCESS_DIR(self) -> str:
    """Path to the Unstructured cache tempdir.

    Defaults to the value of `get_tempdir(dir=GLOBAL_WORKING_DIR)`; an empty configured
    value is treated as unset. When GLOBAL_WORKING_DIR_ENABLED is true, reading this
    property also creates the directory and installs it as `tempfile.tempdir`.
    """
    fallback = get_tempdir(dir=self.GLOBAL_WORKING_DIR)
    process_dir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", fallback) or fallback
    if self.GLOBAL_WORKING_DIR_ENABLED:
        self._setup_tmpdir(process_dir)
    return process_dir
|
||||
|
||||
|
||||
env_config = ENVConfig()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user