mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-22 14:59:46 +00:00
feat: partition_pdf()
add an environment variable to control the capture of embedded links (#2934)
This PR aims to add an environment variable to control the capture of embedded links in `partition_pdf()` for the `fast` strategy. Related PR: https://github.com/Unstructured-IO/unstructured/pull/2537
This commit is contained in:
parent
00f544f100
commit
fcdfbabe8f
@ -1,6 +1,7 @@
|
|||||||
## 0.13.4-dev4
|
## 0.13.4-dev4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
|
* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
|
||||||
function are now deterministic and unique at the document level by default. Before, hashes were
|
function are now deterministic and unique at the document level by default. Before, hashes were
|
||||||
based only on text; however, they now also take into account the element's sequence number on a
|
based only on text; however, they now also take into account the element's sequence number on a
|
||||||
@ -12,6 +13,7 @@
|
|||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
* **Add a `PDF_ANNOTATION_THRESHOLD` environment variable to control the capture of embedded links in `partition_pdf()` for the `fast` strategy**.
|
||||||
* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.
|
* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
@ -90,6 +90,7 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
|
|||||||
)
|
)
|
||||||
from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
|
from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
|
||||||
from unstructured.partition.text import element_from_text
|
from unstructured.partition.text import element_from_text
|
||||||
|
from unstructured.partition.utils.config import env_config
|
||||||
from unstructured.partition.utils.constants import (
|
from unstructured.partition.utils.constants import (
|
||||||
SORT_MODE_BASIC,
|
SORT_MODE_BASIC,
|
||||||
SORT_MODE_DONT,
|
SORT_MODE_DONT,
|
||||||
@ -705,7 +706,7 @@ def _process_pdfminer_pages(
|
|||||||
languages: List[str],
|
languages: List[str],
|
||||||
metadata_last_modified: Optional[str],
|
metadata_last_modified: Optional[str],
|
||||||
sort_mode: str = SORT_MODE_XY_CUT,
|
sort_mode: str = SORT_MODE_XY_CUT,
|
||||||
annotation_threshold: Optional[float] = 0.9,
|
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
|
||||||
starting_page_number: int = 1,
|
starting_page_number: int = 1,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
|
@ -109,5 +109,13 @@ class ENVConfig:
|
|||||||
"""threshold to consider the bounding boxes of two embedded images as the same region"""
|
"""threshold to consider the bounding boxes of two embedded images as the same region"""
|
||||||
return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6)
|
return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def PDF_ANNOTATION_THRESHOLD(self) -> float:
|
||||||
|
"""The threshold value (between 0.0 and 1.0) that determines the minimum overlap required
|
||||||
|
for an annotation to be considered within the element.
|
||||||
|
"""
|
||||||
|
|
||||||
|
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
|
||||||
|
|
||||||
|
|
||||||
env_config = ENVConfig()
|
env_config = ENVConfig()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user