pdf rfctr (#2260)

Refactor `_process_pdfminer_pages` by extracting logic into helper functions. --------- Co-authored-by: christinestraub <christinemstraub@gmail.com>
2025-12-27 15:13:35 +00:00 · 2023-12-14 02:16:38 -06:00 · 2023-12-14 02:16:38 -06:00 · 7895d4e0a7
commit 7895d4e0a7
parent 5f5ff6319f
3 changed files with 101 additions and 78 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.11.4-dev12
+## 0.11.4-dev13

 ### Enhancements

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.11.4-dev12"  # pragma: no cover
+__version__ = "0.11.4-dev13"  # pragma: no cover
--- a/unstructured/partition/pdf_image/pdf.py
+++ b/unstructured/partition/pdf_image/pdf.py
@ -9,6 +9,7 @@ from typing import (
    TYPE_CHECKING,
    Any,
    BinaryIO,
+    Dict,
    Iterator,
    List,
    Optional,
@ -624,7 +625,7 @@ def _process_pdfminer_pages(
    for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
        width, height = page_layout.width, page_layout.height

-        page_elements = []
+        page_elements: List[Element] = []
        annotation_list = []

        coordinate_system = PixelSpace(
@ -638,7 +639,7 @@ def _process_pdfminer_pages(
            x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
            bbox = (x1, y1, x2, y2)

-            urls_metadata = []
+            urls_metadata: List[Dict[str, Any]] = []

            if len(annotation_list) > 0 and isinstance(obj, LTTextBox):
                annotations_within_element = check_annotations_within_element(
@ -651,7 +652,7 @@ def _process_pdfminer_pages(
                    urls_metadata.append(map_bbox_and_index(words, annot))

            if hasattr(obj, "get_text"):
-                _text_snippets = [obj.get_text()]
+                _text_snippets: List[str | Any] = [obj.get_text()]  # type: ignore
            else:
                _text = _extract_text(obj)
                _text_snippets = re.split(PARAGRAPH_PATTERN, _text)
@ -669,20 +670,8 @@ def _process_pdfminer_pages(
                        points=points,
                        system=coordinate_system,
                    )
+                    links = _get_links_from_urls_metadata(urls_metadata, moved_indices)

-                    links: List[Link] = []
-                    for url in urls_metadata:
-                        with contextlib.suppress(IndexError):
-                            links.append(
-                                {
-                                    "text": url["text"],
-                                    "url": url["uri"],
-                                    "start_index": index_adjustment_after_clean_extra_whitespace(
-                                        url["start_index"],
-                                        moved_indices,
-                                    ),
-                                },
-                            )
                    element.metadata = ElementMetadata(
                        filename=filename,
                        page_number=i + 1,
@ -693,50 +682,8 @@ def _process_pdfminer_pages(
                    )
                    element.metadata.detection_origin = "pdfminer"
                    page_elements.append(element)
-        list_item = 0
-        updated_page_elements = []  # type: ignore
-        coordinate_system = PixelSpace(width=width, height=height)
-        for page_element in page_elements:
-            if isinstance(page_element, ListItem):
-                list_item += 1
-                list_page_element = page_element
-                list_item_text = page_element.text
-                list_item_coords = page_element.metadata.coordinates
-            elif list_item > 0 and check_coords_within_boundary(
-                page_element.metadata.coordinates,
-                list_item_coords,
-            ):
-                text = page_element.text  # type: ignore
-                list_item_text = list_item_text + " " + text
-                x1 = min(
-                    list_page_element.metadata.coordinates.points[0][0],
-                    page_element.metadata.coordinates.points[0][0],
-                )
-                x2 = max(
-                    list_page_element.metadata.coordinates.points[2][0],
-                    page_element.metadata.coordinates.points[2][0],
-                )
-                y1 = min(
-                    list_page_element.metadata.coordinates.points[0][1],
-                    page_element.metadata.coordinates.points[0][1],
-                )
-                y2 = max(
-                    list_page_element.metadata.coordinates.points[1][1],
-                    page_element.metadata.coordinates.points[1][1],
-                )
-                points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
-                list_page_element.text = list_item_text
-                list_page_element.metadata.coordinates = CoordinatesMetadata(
-                    points=points,
-                    system=coordinate_system,
-                )
-                page_element = list_page_element
-                updated_page_elements.pop()

-            updated_page_elements.append(page_element)
-
-        page_elements = updated_page_elements
-        del updated_page_elements
+        page_elements = _combine_list_elements(page_elements, coordinate_system)

        # NOTE(crag, christine): always do the basic sort first for deterministic order across
        # python versions.
@ -752,6 +699,82 @@ def _process_pdfminer_pages(
    return elements


+def _combine_list_elements(
+    elements: List[Element], coordinate_system: Union[PixelSpace, PointSpace]
+) -> List[Element]:
+    """Merge each ListItem with the following elements that fall inside its boundary.
+
+    Absorbed elements are dropped from the result; the ListItem itself is mutated in place
+    (text extended, coordinates widened) and keeps its position in the returned list.
+    """
+    list_item = None
+    updated_elements: List[Element] = []
+    for element in elements:
+        if isinstance(element, ListItem):
+            list_item = element
+            # The boundary stays fixed at the ListItem's original coordinates.
+            boundary = element.metadata.coordinates
+        elif list_item and check_coords_within_boundary(
+            coordinates=element.metadata.coordinates,
+            boundary=boundary,
+        ):
+            # Extend the accumulated text so that every absorbed fragment is kept.
+            list_item.text = f"{list_item.text} {element.text}"
+            _combine_coordinates_into_element1(list_item, element, coordinate_system)
+            # `list_item` is already in `updated_elements`; do not append it again.
+            continue
+        updated_elements.append(element)
+    return updated_elements
+
+
+def _get_links_from_urls_metadata(
+    urls_metadata: List[Dict[str, Any]], moved_indices: np.ndarray
+) -> List[Link]:
+    """Build one link dict per URL-metadata entry, skipping entries that raise `IndexError`."""
+    collected: List[Link] = []
+    for entry in urls_metadata:
+        try:
+            link: Link = {
+                "text": entry["text"],
+                "url": entry["uri"],
+                "start_index": index_adjustment_after_clean_extra_whitespace(
+                    entry["start_index"], moved_indices
+                ),
+            }
+        except IndexError:
+            continue
+        collected.append(link)
+    return collected
+
+
+def _combine_coordinates_into_element1(
+    element1: Element, element2: Element, coordinate_system: Union[PixelSpace, PointSpace]
+) -> Element:
+    """Widen `element1`'s coordinates to also cover `element2`.
+
+    Args:
+        element1: Element whose `metadata.coordinates` is replaced in place.
+        element2: Element whose corner points are merged in; it is not modified.
+        coordinate_system: System stored on the new `CoordinatesMetadata`.
+
+    Returns:
+        `element1`, the same object that was passed in.
+    """
+    pts1 = element1.metadata.coordinates.points
+    pts2 = element2.metadata.coordinates.points
+
+    # Corner layout, as written back below: ((x1, y1), (x1, y2), (x2, y2), (x2, y1)).
+    min_x = min(pts1[0][0], pts2[0][0])
+    max_x = max(pts1[2][0], pts2[2][0])
+    min_y = min(pts1[0][1], pts2[0][1])
+    max_y = max(pts1[1][1], pts2[1][1])
+    element1.metadata.coordinates = CoordinatesMetadata(
+        points=((min_x, min_y), (min_x, max_y), (max_x, max_y), (max_x, min_y)),
+        system=coordinate_system,
+    )
+    return element1
+
+
 def convert_pdf_to_images(
    filename: str = "",
    file: Optional[Union[bytes, IO[bytes]]] = None,
@ -933,7 +956,7 @@ def get_uris(
    height: float,
    coordinate_system: Union[PixelSpace, PointSpace],
    page_number: int,
-) -> List[dict]:
+) -> List[Dict[str, Any]]:
    """
    Extracts URI annotations from a single or a list of PDF object references on a specific page.
    The type of annots (list or not) depends on the pdf formatting. The function detects the type
@ -964,7 +987,7 @@ def get_uris_from_annots(
    height: Union[int, float],
    coordinate_system: Union[PixelSpace, PointSpace],
    page_number: int,
-) -> List[dict]:
+) -> List[Dict[str, Any]]:
    """
    Extracts URI annotations from a list of PDF object references.

@ -1092,16 +1115,16 @@ def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float:


 def check_annotations_within_element(
-    annotation_list: List[dict],
+    annotation_list: List[Dict[str, Any]],
    element_bbox: Tuple[float, float, float, float],
    page_number: int,
    threshold: float = 0.9,
-) -> List[dict]:
+) -> List[Dict[str, Any]]:
    """
    Filter annotations that are within or highly overlap with a specified element on a page.

    Args:
-        annotation_list (List[dict]): A list of dictionaries, each containing information
+        annotation_list (List[Dict[str,Any]]): A list of dictionaries, each containing information
            about an annotation.
        element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the
            specified element in the bbox format (x1, y1, x2, y2).
@ -1111,9 +1134,9 @@ def check_annotations_within_element(
            Default is 0.9.

    Returns:
-        List[dict]: A list of dictionaries containing information about annotations that are
-        within or highly overlap with the specified element on the given page, based on the
-        specified threshold.
+        List[Dict[str,Any]]: A list of dictionaries containing information about annotations
+        that are within or highly overlap with the specified element on the given page, based on
+        the specified threshold.
    """
    annotations_within_element = []
    for annotation in annotation_list:
@ -1130,7 +1153,7 @@ def check_annotations_within_element(
 def get_word_bounding_box_from_element(
    obj: LTTextBox,
    height: float,
-) -> Tuple[List[LTChar], List[dict]]:
+) -> Tuple[List[LTChar], List[Dict[str, Any]]]:
    """
    Extracts characters and word bounding boxes from a PDF text element.

@ -1139,10 +1162,10 @@ def get_word_bounding_box_from_element(
        height (float): The height of the page in the specified coordinate system.

    Returns:
-        Tuple[List[LTChar], List[dict]]: A tuple containing two lists:
+        Tuple[List[LTChar], List[Dict[str,Any]]]: A tuple containing two lists:
            - List[LTChar]: A list of LTChar objects representing individual characters.
-            - List[dict]: A list of dictionaries, each containing information about a word,
-              including its text, bounding box, and start index in the element's text.
+            - List[Dict[str,Any]]: A list of dictionaries, each containing information about
+                a word, including its text, bounding box, and start index in the element's text.
    """
    characters = []
    words = []
@ -1190,15 +1213,15 @@ def get_word_bounding_box_from_element(
    return characters, words


-def map_bbox_and_index(words: List[dict], annot: dict):
+def map_bbox_and_index(words: List[Dict[str, Any]], annot: Dict[str, Any]):
    """
    Maps a bounding box annotation to the corresponding text and start index within a list of words.

    Args:
-        words (List[dict]): A list of dictionaries, each containing information about a word,
-            including its text, bounding box, and start index.
-        annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and
-            "start_index" fields.
+        words (List[Dict[str,Any]]): A list of dictionaries, each containing information about
+            a word, including its text, bounding box, and start index.
+        annot (Dict[str,Any]): The annotation dictionary to be mapped, which will be updated with
+            "text" and "start_index" fields.

    Returns:
        dict: The updated annotation dictionary with "text" representing the mapped text and