mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-21 22:40:43 +00:00
feat: `partition_pdf()` - add an environment variable to control the capture of embedded links (#2934)
This PR adds the `PDF_ANNOTATION_THRESHOLD` environment variable, which controls the capture of embedded links in `partition_pdf()` for the `fast` strategy. Related PR: https://github.com/Unstructured-IO/unstructured/pull/2537
This commit is contained in:
parent
00f544f100
commit
fcdfbabe8f
@ -1,6 +1,7 @@
|
||||
## 0.13.4-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
|
||||
function are now deterministic and unique at the document level by default. Before, hashes were
|
||||
based only on text; however, they now also take into account the element's sequence number on a
|
||||
@ -12,6 +13,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
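* **Add a `PDF_ANNOTATION_THRESHOLD` environment variable to control the capture of embedded links in `partition_pdf()` for the `fast` strategy**. The value (between 0.0 and 1.0, default 0.9) is the minimum overlap required for an annotation to be considered within an element.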
|
||||
* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.
|
||||
|
||||
### Fixes
|
||||
|
@ -90,6 +90,7 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
|
||||
)
|
||||
from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
|
||||
from unstructured.partition.text import element_from_text
|
||||
from unstructured.partition.utils.config import env_config
|
||||
from unstructured.partition.utils.constants import (
|
||||
SORT_MODE_BASIC,
|
||||
SORT_MODE_DONT,
|
||||
@ -705,7 +706,7 @@ def _process_pdfminer_pages(
|
||||
languages: List[str],
|
||||
metadata_last_modified: Optional[str],
|
||||
sort_mode: str = SORT_MODE_XY_CUT,
|
||||
annotation_threshold: Optional[float] = 0.9,
|
||||
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
|
||||
starting_page_number: int = 1,
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -109,5 +109,13 @@ class ENVConfig:
|
||||
"""threshold to consider the bounding boxes of two embedded images as the same region"""
|
||||
return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6)
|
||||
|
||||
@property
def PDF_ANNOTATION_THRESHOLD(self) -> float:
    """Minimum overlap an annotation needs with an element to count as inside it.

    The value is expected to lie between 0.0 and 1.0. It is looked up through
    `_get_float` under the name `PDF_ANNOTATION_THRESHOLD`, with 0.9 passed as
    the fallback.
    """
    fallback_threshold = 0.9
    return self._get_float("PDF_ANNOTATION_THRESHOLD", fallback_threshold)
|
||||
|
||||
|
||||
# Module-level ENVConfig instance; settings such as PDF_ANNOTATION_THRESHOLD are
# read as attributes of this object (e.g. `env_config.PDF_ANNOTATION_THRESHOLD`).
env_config = ENVConfig()
|
||||
|
Loading…
x
Reference in New Issue
Block a user