feat: partition_pdf() add an environment variable to control the capture of embedded links (#2934)

This PR adds an environment variable to control the capture of
embedded links in `partition_pdf()` for the `fast` strategy.

Related PR: https://github.com/Unstructured-IO/unstructured/pull/2537
This commit is contained in:
Christine Straub 2024-04-25 14:00:21 -07:00 committed by GitHub
parent 00f544f100
commit fcdfbabe8f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 12 additions and 1 deletions

View File

@ -1,6 +1,7 @@
## 0.13.4-dev4
### Enhancements
* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
function are now deterministic and unique at the document level by default. Before, hashes were
based only on text; however, they now also take into account the element's sequence number on a
@ -12,6 +13,7 @@
### Features
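* **Add a `PDF_ANNOTATION_THRESHOLD` environment variable to control the capture of embedded links in `partition_pdf()` for the `fast` strategy**. The value (between 0.0 and 1.0, default 0.9) is the minimum overlap required for an annotation to be considered within an element.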
* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.
### Fixes

View File

@ -90,6 +90,7 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
)
from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
from unstructured.partition.text import element_from_text
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import (
SORT_MODE_BASIC,
SORT_MODE_DONT,
@ -705,7 +706,7 @@ def _process_pdfminer_pages(
languages: List[str],
metadata_last_modified: Optional[str],
sort_mode: str = SORT_MODE_XY_CUT,
annotation_threshold: Optional[float] = 0.9,
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1,
**kwargs,
):

View File

@ -109,5 +109,13 @@ class ENVConfig:
"""threshold to consider the bounding boxes of two embedded images as the same region"""
return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6)
@property
def PDF_ANNOTATION_THRESHOLD(self) -> float:
    """Minimum overlap for an annotation to be considered within an element.

    The value is looked up under the name ``PDF_ANNOTATION_THRESHOLD`` through
    ``self._get_float`` and is expected to lie between 0.0 and 1.0; when the
    lookup yields nothing, the fallback of 0.9 is handed to ``_get_float``.
    """
    fallback_overlap = 0.9
    return self._get_float("PDF_ANNOTATION_THRESHOLD", fallback_overlap)
# Module-level ENVConfig instance, created once when this module is imported.
env_config = ENVConfig()