mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-11 07:57:21 +00:00
feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters (#3014)
This PR introduces GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR, which control where temporary files are stored during the partition flow, via tempfile.tempdir. #### Edit: Renamed prefixes from STORAGE_ to UNSTRUCTURED_CACHE_ #### Edit 2: Renamed prefixes from UNSTRUCTURED_CACHE_ to GLOBAL_WORKING_DIR_
This commit is contained in:
parent
ec987dcbb2
commit
1c8b2b23eb
@ -1,4 +1,4 @@
|
|||||||
## 0.14.0-dev14
|
## 0.14.0-dev15
|
||||||
|
|
||||||
### BREAKING CHANGES
|
### BREAKING CHANGES
|
||||||
|
|
||||||
@ -9,6 +9,7 @@
|
|||||||
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
|
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
|
||||||
* **Faster evaluation** Support for concurrent processing of documents during evaluation
|
* **Faster evaluation** Support for concurrent processing of documents during evaluation
|
||||||
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
|
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
|
||||||
|
* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.
|
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.
|
||||||
|
|||||||
@ -143,7 +143,9 @@ def test_save_elements(
|
|||||||
assert not el.metadata.image_mime_type
|
assert not el.metadata.image_mime_type
|
||||||
|
|
||||||
|
|
||||||
def test_save_elements_with_output_dir_path_none():
|
@pytest.mark.parametrize("storage_enabled", [False, True])
|
||||||
|
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
|
||||||
|
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
|
||||||
with (
|
with (
|
||||||
patch("PIL.Image.open"),
|
patch("PIL.Image.open"),
|
||||||
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
|
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
|
||||||
@ -161,6 +163,11 @@ def test_save_elements_with_output_dir_path_none():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Verify that the images are saved in the expected directory
|
# Verify that the images are saved in the expected directory
|
||||||
|
if storage_enabled:
|
||||||
|
from unstructured.partition.utils.config import env_config
|
||||||
|
|
||||||
|
expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
|
||||||
|
else:
|
||||||
expected_output_dir = os.path.join(tmpdir, "figures")
|
expected_output_dir = os.path.join(tmpdir, "figures")
|
||||||
assert os.path.exists(expected_output_dir)
|
assert os.path.exists(expected_output_dir)
|
||||||
assert os.path.isdir(expected_output_dir)
|
assert os.path.isdir(expected_output_dir)
|
||||||
|
|||||||
@ -1,3 +1,10 @@
|
|||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_default_config():
|
def test_default_config():
|
||||||
from unstructured.partition.utils.config import env_config
|
from unstructured.partition.utils.config import env_config
|
||||||
|
|
||||||
@ -9,3 +16,43 @@ def test_env_override(monkeypatch):
|
|||||||
from unstructured.partition.utils.config import env_config
|
from unstructured.partition.utils.config import env_config
|
||||||
|
|
||||||
assert env_config.IMAGE_CROP_PAD == 1
|
assert env_config.IMAGE_CROP_PAD == 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def _setup_tmpdir():
    """Isolate GLOBAL_WORKING_PROCESS_DIR and `tempfile.tempdir` for a single test.

    Setup moves any pre-existing process working directory aside to `<dir>_bak` and resets
    `tempfile.tempdir` to None. Teardown removes whatever directory the test created, moves
    the backup (if any) back into place, and restores the original `tempfile.tempdir`.
    """
    from unstructured.partition.utils.config import env_config

    _tmpdir = tempfile.tempdir
    _storage_tmpdir = env_config.GLOBAL_WORKING_PROCESS_DIR
    _storage_tmpdir_bak = f"{_storage_tmpdir}_bak"
    if Path(_storage_tmpdir).is_dir():
        shutil.move(_storage_tmpdir, _storage_tmpdir_bak)
    tempfile.tempdir = None
    try:
        yield
    finally:
        # -- always drop the directory the test may have created, even when there was nothing
        # -- to back up, so a test run leaves no working directory behind --
        if Path(_storage_tmpdir).is_dir():
            shutil.rmtree(_storage_tmpdir)
        if Path(_storage_tmpdir_bak).is_dir():
            shutil.move(_storage_tmpdir_bak, _storage_tmpdir)
        tempfile.tempdir = _tmpdir
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("_setup_tmpdir")
|
||||||
|
def test_env_storage_disabled(monkeypatch):
|
||||||
|
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false")
|
||||||
|
from unstructured.partition.utils.config import env_config
|
||||||
|
|
||||||
|
assert not env_config.GLOBAL_WORKING_DIR_ENABLED
|
||||||
|
assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR
|
||||||
|
assert not Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir()
|
||||||
|
assert tempfile.gettempdir() != env_config.GLOBAL_WORKING_PROCESS_DIR
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("_setup_tmpdir")
|
||||||
|
def test_env_storage_enabled(monkeypatch):
|
||||||
|
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true")
|
||||||
|
from unstructured.partition.utils.config import env_config
|
||||||
|
|
||||||
|
assert env_config.GLOBAL_WORKING_DIR_ENABLED
|
||||||
|
assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR
|
||||||
|
assert Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir()
|
||||||
|
assert tempfile.gettempdir() == env_config.GLOBAL_WORKING_PROCESS_DIR
|
||||||
|
|||||||
@ -0,0 +1,4 @@
|
|||||||
|
from .partition.utils.config import env_config
|
||||||
|
|
||||||
|
# init env_config
|
||||||
|
env_config
|
||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.14.0-dev14" # pragma: no cover
|
__version__ = "0.14.0-dev15" # pragma: no cover
|
||||||
|
|||||||
@ -160,7 +160,6 @@ class BaseMetricsCalculator(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _process_document(self, doc: Path) -> list:
|
def _process_document(self, doc: Path) -> list:
|
||||||
"""Should return all metadata and metrics for a single document."""
|
"""Should return all metadata and metrics for a single document."""
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import io
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast
|
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -438,6 +439,14 @@ def _partition_pdf_or_image_local(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if analysis:
|
if analysis:
|
||||||
|
if not analyzed_image_output_dir_path:
|
||||||
|
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||||
|
analyzed_image_output_dir_path = str(
|
||||||
|
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||||
|
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
|
||||||
annotate_layout_elements(
|
annotate_layout_elements(
|
||||||
inferred_document_layout=inferred_document_layout,
|
inferred_document_layout=inferred_document_layout,
|
||||||
extracted_layout=extracted_layout,
|
extracted_layout=extracted_layout,
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import re
|
|||||||
import tempfile
|
import tempfile
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
|
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
@ -131,7 +131,10 @@ def save_elements(
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
if not output_dir_path:
|
if not output_dir_path:
|
||||||
output_dir_path = os.path.join(os.getcwd(), "figures")
|
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||||
|
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
|
||||||
|
else:
|
||||||
|
output_dir_path = str(Path.cwd() / "figures")
|
||||||
os.makedirs(output_dir_path, exist_ok=True)
|
os.makedirs(output_dir_path, exist_ok=True)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
|||||||
@ -7,15 +7,28 @@ in bytes). Constants should go into `./constants.py`
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT
|
from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_tempdir(dir: str) -> str:
    """Return the per-process temporary directory path located under `dir`.

    The result is `<dir>/tmp/<id>`, where `<id>` is the process-group id of the calling
    process. `os.getpgid` is only available on Unix, so on platforms that lack it (such as
    Windows) the process id is used instead. The directory itself is not created here.
    """
    # -- `os.getpgid` is missing on non-Unix platforms; avoid an AttributeError there --
    getpgid = getattr(os, "getpgid", None)
    process_id = getpgid(0) if getpgid is not None else os.getpid()
    tempdir = Path(dir) / "tmp" / str(process_id)
    return str(tempdir)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ENVConfig:
|
class ENVConfig:
|
||||||
"""class for configuring enviorment parameters"""
|
"""class for configuring enviorment parameters"""
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if self.GLOBAL_WORKING_DIR_ENABLED:
|
||||||
|
self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR)
|
||||||
|
|
||||||
def _get_string(self, var: str, default_value: str = "") -> str:
|
def _get_string(self, var: str, default_value: str = "") -> str:
|
||||||
"""attempt to get the value of var from the os environment; if not present return the
|
"""attempt to get the value of var from the os environment; if not present return the
|
||||||
default_value"""
|
default_value"""
|
||||||
@ -31,6 +44,15 @@ class ENVConfig:
|
|||||||
return float(value)
|
return float(value)
|
||||||
return default_value
|
return default_value
|
||||||
|
|
||||||
|
def _get_bool(self, var: str, default_value: bool) -> bool:
|
||||||
|
if value := self._get_string(var):
|
||||||
|
return value.lower() in ("true", "1", "t")
|
||||||
|
return default_value
|
||||||
|
|
||||||
|
def _setup_tmpdir(self, tmpdir: str) -> None:
|
||||||
|
Path(tmpdir).mkdir(parents=True, exist_ok=True)
|
||||||
|
tempfile.tempdir = tmpdir
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def IMAGE_CROP_PAD(self) -> int:
|
def IMAGE_CROP_PAD(self) -> int:
|
||||||
"""extra image content to add around an identified element region; measured in pixels"""
|
"""extra image content to add around an identified element region; measured in pixels"""
|
||||||
@ -117,5 +139,28 @@ class ENVConfig:
|
|||||||
|
|
||||||
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
|
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:
|
||||||
|
"""Enable usage of GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR."""
|
||||||
|
return self._get_bool("GLOBAL_WORKING_DIR_ENABLED", False)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def GLOBAL_WORKING_DIR(self) -> str:
|
||||||
|
"""Path to Unstructured cache directory."""
|
||||||
|
return self._get_string("GLOBAL_WORKING_DIR", str(Path.home() / ".cache/unstructured"))
|
||||||
|
|
||||||
|
@property
|
||||||
|
def GLOBAL_WORKING_PROCESS_DIR(self) -> str:
|
||||||
|
"""Path to Unstructured cache tempdir. Overrides TMPDIR, TEMP and TMP.
|
||||||
|
Defaults to '{GLOBAL_WORKING_DIR}/tmp/{os.getpgid(0)}'.
|
||||||
|
"""
|
||||||
|
default_tmpdir = get_tempdir(dir=self.GLOBAL_WORKING_DIR)
|
||||||
|
tmpdir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", default_tmpdir)
|
||||||
|
if tmpdir == "":
|
||||||
|
tmpdir = default_tmpdir
|
||||||
|
if self.GLOBAL_WORKING_DIR_ENABLED:
|
||||||
|
self._setup_tmpdir(tmpdir)
|
||||||
|
return tmpdir
|
||||||
|
|
||||||
|
|
||||||
env_config = ENVConfig()
|
env_config = ENVConfig()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user