feat: partition_pdf() add an environment variable to control the capture of embedded links (#2934)

This PR adds a `PDF_ANNOTATION_THRESHOLD` environment variable to control the
capture of embedded links in `partition_pdf()` for the `fast` strategy.

Related PR: https://github.com/Unstructured-IO/unstructured/pull/2537
This commit is contained in:
Christine Straub 2024-04-25 14:00:21 -07:00 committed by GitHub
parent 00f544f100
commit fcdfbabe8f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 12 additions and 1 deletions

View File

@ -1,6 +1,7 @@
## 0.13.4-dev4 ## 0.13.4-dev4
### Enhancements ### Enhancements
* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning * **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
function are now deterministic and unique at the document level by default. Before, hashes were function are now deterministic and unique at the document level by default. Before, hashes were
based only on text; however, they now also take into account the element's sequence number on a based only on text; however, they now also take into account the element's sequence number on a
@ -12,6 +13,7 @@
### Features ### Features
* **Add a `PDF_ANNOTATION_THRESHOLD` environment variable to control the capture of embedded links in `partition_pdf()` for the `fast` strategy**.
* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API. * **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.
### Fixes ### Fixes

View File

@ -90,6 +90,7 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
) )
from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
from unstructured.partition.text import element_from_text from unstructured.partition.text import element_from_text
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import ( from unstructured.partition.utils.constants import (
SORT_MODE_BASIC, SORT_MODE_BASIC,
SORT_MODE_DONT, SORT_MODE_DONT,
@ -705,7 +706,7 @@ def _process_pdfminer_pages(
languages: List[str], languages: List[str],
metadata_last_modified: Optional[str], metadata_last_modified: Optional[str],
sort_mode: str = SORT_MODE_XY_CUT, sort_mode: str = SORT_MODE_XY_CUT,
annotation_threshold: Optional[float] = 0.9, annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
starting_page_number: int = 1, starting_page_number: int = 1,
**kwargs, **kwargs,
): ):

View File

@ -109,5 +109,13 @@ class ENVConfig:
"""threshold to consider the bounding boxes of two embedded images as the same region""" """threshold to consider the bounding boxes of two embedded images as the same region"""
return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6) return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6)
@property
def PDF_ANNOTATION_THRESHOLD(self) -> float:
"""The threshold value (between 0.0 and 1.0) that determines the minimum overlap required
for an annotation to be considered within the element.
"""
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
env_config = ENVConfig() env_config = ENVConfig()