diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c20934af..1c8534143 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.13.4-dev4 ### Enhancements + * **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning function are now deterministic and unique at the document level by default. Before, hashes were based only on text; however, they now also take into account the element's sequence number on a @@ -12,6 +13,7 @@ ### Features +* **Add a `PDF_ANNOTATION_THRESHOLD` environment variable to control the capture of embedded links in `partition_pdf()` for the `fast` strategy**. * **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API. ### Fixes diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 5b4ae8c55..01b0c9804 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -90,6 +90,7 @@ from unstructured.partition.pdf_image.pdfminer_utils import ( ) from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy from unstructured.partition.text import element_from_text +from unstructured.partition.utils.config import env_config from unstructured.partition.utils.constants import ( SORT_MODE_BASIC, SORT_MODE_DONT, @@ -705,7 +706,7 @@ def _process_pdfminer_pages( languages: List[str], metadata_last_modified: Optional[str], sort_mode: str = SORT_MODE_XY_CUT, - annotation_threshold: Optional[float] = 0.9, + annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD, starting_page_number: int = 1, **kwargs, ): diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 93f3f9590..28eade91d 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -109,5 +109,13 @@ class ENVConfig: """threshold to consider the bounding boxes of two embedded images as the same 
region""" return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6) + @property + def PDF_ANNOTATION_THRESHOLD(self) -> float: + """The threshold value (between 0.0 and 1.0) that determines the minimum overlap required + for an annotation to be considered within the element. + """ + + return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9) + env_config = ENVConfig()