fix: isalnum referenced before assignment (#1586)

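**Executive Summary** Fix a bug in the `get_word_bounding_box_from_element` function that prevented `partition_pdf` from running. **Technical Details** - The function originally assigned `isalnum` only when the character index was 0, so it could be referenced before assignment. It is now assigned whenever no word is in progress (`if not word`), and the word-boundary comparison runs only when a word exists.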
2025-12-24 21:55:33 +00:00 · 2023-10-03 11:25:20 -04:00 · 2023-10-03 11:25:20 -04:00 · d6efd52b4b
commit d6efd52b4b
parent b2e997635f
4 changed files with 146 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,12 @@
 * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images.
 * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself.

+### Features 
+
+### Fixes
+
+* **Fixes partition_pdf isalnum reference bug** Problem: When `partition_pdf` attempted to get word bounding boxes from an element, it raised a "referenced before assignment" error if the first object was not text extractable. Fix: `isalnum` is now assigned whenever no word is in progress, instead of only at the first index. Importance: Crucial for being able to partition PDFs.
+
 ## 0.10.17-dev3

 ### Enhancements
@ -20,7 +26,6 @@
  Fix: Updated code to deal with these cases.
  Importance: This will ensure the correctness when partitioning HTML and Markdown documents.

-
 ## 0.10.18

 ### Enhancements
--- a/example-docs/interface-config-guide-p93.pdf
+++ b/example-docs/interface-config-guide-p93.pdf
--- a/test_unstructured/partition/pdf-image/test_pdf.py
+++ b/test_unstructured/partition/pdf-image/test_pdf.py
@ -907,7 +907,7 @@ def test_combine_numbered_list(filename):
    "filename",
    ["example-docs/layout-parser-paper-fast.pdf"],
 )
-def test_hyperlinks(filename):
+def test_partition_pdf_hyperlinks(filename):
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    links = [
        {
@ -933,7 +933,7 @@ def test_hyperlinks(filename):
    "filename",
    ["example-docs/embedded-link.pdf"],
 )
-def test_hyperlinks_multiple_lines(filename):
+def test_partition_pdf_hyperlinks_multiple_lines(filename):
    elements = pdf.partition_pdf(filename=filename, strategy="auto")
    assert elements[-1].metadata.links[-1]["text"] == "capturing"
    assert len(elements[-1].metadata.links) == 2
@ -953,3 +953,13 @@ def test_partition_pdf_uses_model_name():
        mockpartition.assert_called_once()
        assert "model_name" in mockpartition.call_args.kwargs
        assert mockpartition.call_args.kwargs["model_name"]
+
+
+def test_partition_pdf_word_bbox_not_char(
+    filename="example-docs/interface-config-guide-p93.pdf",
+):
+    """Regression test for #1586: partitioning this document must succeed and yield 17 elements."""
+    try:
+        elements = pdf.partition_pdf(filename=filename)
+    except Exception as e:
+        # A bare string cannot be raised (TypeError); raise a real exception and keep the cause.
+        raise AssertionError("Partitioning fail: %s" % e) from e
+    assert len(elements) == 17
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -868,6 +868,23 @@ def get_uris(
    coordinate_system: Union[PixelSpace, PointSpace],
    page_number: int,
 ) -> List[dict]:
+    """
+    Extracts URI annotations from a single or a list of PDF object references on a specific page.
+    The type of annots (list or not) depends on the pdf formatting. The function detects the type
+    of annots and then passes it on to the get_uris_from_annots function as a list.
+
+    Args:
+        annots (Union[PDFObjRef, List[PDFObjRef]]): A single or a list of PDF object references
+            representing annotations on the page.
+        height (float): The height of the page in the specified coordinate system.
+        coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
+            the annotations' coordinates.
+        page_number (int): The page number from which to extract annotations.
+
+    Returns:
+        List[dict]: A list of dictionaries, each containing information about a URI annotation,
+        including its coordinates, bounding box, type, URI link, and page number.
+    """
    if isinstance(annots, List):
        return get_uris_from_annots(annots, height, coordinate_system, page_number)
    return get_uris_from_annots(annots.resolve(), height, coordinate_system, page_number)
@ -879,6 +896,21 @@ def get_uris_from_annots(
    coordinate_system: Union[PixelSpace, PointSpace],
    page_number: int,
 ) -> List[dict]:
+    """
+    Extracts URI annotations from a list of PDF object references.
+
+    Args:
+        annots (List[PDFObjRef]): A list of PDF object references representing annotations on
+            a page.
+        height (Union[int, float]): The height of the page in the specified coordinate system.
+        coordinate_system (Union[PixelSpace, PointSpace]): The coordinate system used to represent
+            the annotations' coordinates.
+        page_number (int): The page number from which to extract annotations.
+
+    Returns:
+        List[dict]: A list of dictionaries, each containing information about a URI annotation,
+        including its coordinates, bounding box, type, URI link, and page number.
+    """
    annotation_list = []
    for annotation in annots:
        annotation_dict = try_resolve(annotation)
@ -916,6 +948,10 @@ def get_uris_from_annots(


 def try_resolve(annot: PDFObjRef):
+    """
+    Attempt to resolve a PDF object reference. If successful, returns the resolved object;
+    otherwise, returns the original reference.
+    """
    try:
        return annot.resolve()
    except Exception:
@ -926,6 +962,19 @@ def rect_to_bbox(
    rect: Tuple[float, float, float, float],
    height: float,
 ) -> Tuple[float, float, float, float]:
+    """
+    Converts PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
+    coordinate system where the vertical axis is measured from the top of the page.
+
+    Args:
+        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
+            coordinates (x1, y1, x2, y2).
+        height (float): The height of the page in the specified coordinate system.
+
+    Returns:
+        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
+        (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
+    """
    x1, y2, x2, y1 = rect
    y1 = height - y1
    y2 = height - y2
@ -936,6 +985,19 @@ def calculate_intersection_area(
    bbox1: Tuple[float, float, float, float],
    bbox2: Tuple[float, float, float, float],
 ) -> float:
+    """
+    Calculate the area of intersection between two bounding boxes.
+
+    Args:
+        bbox1 (Tuple[float, float, float, float]): The coordinates of the first bounding box
+            in the format (x1, y1, x2, y2).
+        bbox2 (Tuple[float, float, float, float]): The coordinates of the second bounding box
+            in the format (x1, y1, x2, y2).
+
+    Returns:
+        float: The area of intersection between the two bounding boxes. If there is no
+        intersection, the function returns 0.0.
+    """
    x1_1, y1_1, x2_1, y2_1 = bbox1
    x1_2, y1_2, x2_2, y2_2 = bbox2

@ -954,6 +1016,16 @@ def calculate_intersection_area(


 def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float:
+    """
+    Calculate the area of a bounding box.
+
+    Args:
+        bbox (Tuple[float, float, float, float]): The coordinates of the bounding box
+            in the format (x1, y1, x2, y2).
+
+    Returns:
+        float: The area of the bounding box, computed as the product of its width and height.
+    """
    x1, y1, x2, y2 = bbox
    area = (x2 - x1) * (y2 - y1)
    return area
@ -965,6 +1037,24 @@ def check_annotations_within_element(
    page_number: int,
    threshold: float = 0.9,
 ) -> List[dict]:
+    """
+    Filter annotations that are within or highly overlap with a specified element on a page.
+
+    Args:
+        annotation_list (List[dict]): A list of dictionaries, each containing information
+            about an annotation.
+        element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the
+            specified element in the bbox format (x1, y1, x2, y2).
+        page_number (int): The page number to which the annotations and element belong.
+        threshold (float, optional): The threshold value (between 0.0 and 1.0) that determines
+            the minimum overlap required for an annotation to be considered within the element.
+            Default is 0.9.
+
+    Returns:
+        List[dict]: A list of dictionaries containing information about annotations that are
+        within or highly overlap with the specified element on the given page, based on the
+        specified threshold.
+    """
    annotations_within_element = []
    for annotation in annotation_list:
        if annotation["page_number"] == page_number and (
@ -980,6 +1070,19 @@ def get_word_bounding_box_from_element(
    obj: LTTextBox,
    height: float,
 ) -> Tuple[List[LTChar], List[dict]]:
+    """
+    Extracts characters and word bounding boxes from a PDF text element.
+
+    Args:
+        obj (LTTextBox): The PDF text element from which to extract characters and words.
+        height (float): The height of the page in the specified coordinate system.
+
+    Returns:
+        Tuple[List[LTChar], List[dict]]: A tuple containing two lists:
+            - List[LTChar]: A list of LTChar objects representing individual characters.
+            - List[dict]: A list of dictionaries, each containing information about a word,
+              including its text, bounding box, and start index in the element's text.
+    """
    characters = []
    words = []
    text_len = 0
@ -1002,10 +1105,9 @@ def get_word_bounding_box_from_element(

                # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
                # will need to switch to some pattern matching once we support more languages
-                if index == 0:
+                if not word:
                    isalnum = char.isalnum()
-
-                if char.isalnum() != isalnum:
+                if word and char.isalnum() != isalnum:
                    isalnum = char.isalnum()
                    words.append(
                        {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
@ -1028,6 +1130,19 @@ def get_word_bounding_box_from_element(


 def map_bbox_and_index(words: List[dict], annot: dict):
+    """
+    Maps a bounding box annotation to the corresponding text and start index within a list of words.
+
+    Args:
+        words (List[dict]): A list of dictionaries, each containing information about a word,
+            including its text, bounding box, and start index.
+        annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and
+            "start_index" fields.
+
+    Returns:
+        dict: The updated annotation dictionary with "text" representing the mapped text and
+            "start_index" representing the start index of the mapped text in the list of words.
+    """
    if len(words) == 0:
        annot["text"] = ""
        annot["start_index"] = -1
@ -1059,6 +1174,16 @@ def map_bbox_and_index(words: List[dict], annot: dict):


 def try_argmin(array: np.ndarray) -> int:
+    """
+    Attempt to find the index of the minimum value in a NumPy array.
+
+    Args:
+        array (np.ndarray): The NumPy array in which to find the minimum value's index.
+
+    Returns:
+        int: The index of the minimum value in the array. If the array is empty or an
+        IndexError occurs, it returns -1.
+    """
    try:
        return int(np.argmin(array))
    except IndexError: