From 79552ff70b4c60e0a85eb330d030fad1ce629d8a Mon Sep 17 00:00:00 2001 From: MiXiBo <34360898+MiXiBo@users.noreply.github.com> Date: Thu, 7 Mar 2024 00:08:49 +0100 Subject: [PATCH] Refactor threshold to annotation_threshold and make it an optional parameter (#2537) This change makes the annotation threshold for links configurable through an optional parameter. The reason for the change is that we ran into issues extracting simple text links from PDF documents that were created with MS Word. The sample PDF from unstructured worked with a default value of 0.9, and the PDF generated with Word resulted in a threshold of approx. 0.67. We use unstructured together with langchain within an automated container deployment, and being able to access the setting 'annotation_threshold' (refactored from 'threshold') can be very helpful. --------- Co-authored-by: Michael Niestroj Co-authored-by: christinestraub --- CHANGELOG.md | 1 + unstructured/partition/pdf.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cf186d8c..8045627ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ### Enhancements +* **Improve ability to capture embedded links in `partition_pdf()` for `fast` strategy** Previously, a threshold value that affects the capture of embedded links was set to a fixed value by default. This allows users to specify the threshold value for better capturing. * **Refactor `add_chunking_strategy` decorator to dispatch by name.** Add `chunk()` function to be used by the `add_chunking_strategy` decorator to dispatch chunking call based on a chunking-strategy name (that can be dynamic at runtime). This decouples chunking dispatch from only those chunkers known at "compile" time and enables runtime registration of custom chunkers. 
### Features diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f1c7f9eff..4be6c1c9d 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -680,6 +680,7 @@ def _process_pdfminer_pages( languages: List[str], metadata_last_modified: Optional[str], sort_mode: str = SORT_MODE_XY_CUT, + annotation_threshold: Optional[float] = 0.9, **kwargs, ): """Uses PDFMiner to split a document into pages and process them.""" @@ -710,6 +711,7 @@ def _process_pdfminer_pages( annotation_list, bbox, i + 1, + annotation_threshold, ) _, words = get_word_bounding_box_from_element(obj, height) for annot in annotations_within_element: @@ -1177,7 +1179,7 @@ def check_annotations_within_element( annotation_list: List[Dict[str, Any]], element_bbox: Tuple[float, float, float, float], page_number: int, - threshold: float = 0.9, + annotation_threshold: float, ) -> List[Dict[str, Any]]: """ Filter annotations that are within or highly overlap with a specified element on a page. @@ -1188,9 +1190,9 @@ def check_annotations_within_element( element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the specified element in the bbox format (x1, y1, x2, y2). page_number (int): The page number to which the annotations and element belong. - threshold (float, optional): The threshold value (between 0.0 and 1.0) that determines - the minimum overlap required for an annotation to be considered within the element. - Default is 0.9. + annotation_threshold (float, optional): The threshold value (between 0.0 and 1.0) + that determines the minimum overlap required for an annotation to be considered + within the element. Default is 0.9. 
Returns: List[Dict[str,Any]]: A list of dictionaries containing information about annotations @@ -1203,7 +1205,7 @@ def check_annotations_within_element( annotation_bbox_size = calculate_bbox_area(annotation["bbox"]) if annotation_bbox_size and ( calculate_intersection_area(element_bbox, annotation["bbox"]) / annotation_bbox_size - > threshold + > annotation_threshold ): annotations_within_element.append(annotation) return annotations_within_element