Fix/fix table id checking logic (#3898)

- There is a bug in deciding whether a page has tables before performing table extraction. The logic checks whether the id associated with the Table element type is truthy; however, it should check whether the id is `None`, because the id can legitimately be 0 (the first type of element in the page), which is falsy. - The fix updates the logic to compare against `None`. - Adds a unit test for this specific case.
2025-12-25 06:04:53 +00:00 · 2025-01-31 12:19:14 -06:00 · 2025-01-31 12:19:14 -06:00 · 9d58b34ab4
commit 9d58b34ab4
parent a368aac4a3
4 changed files with 29 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,7 @@
 ### Features

 ### Fixes
+- **Fix a bug where table extraction is skipped when it shouldn't be**. Pages whose only content is a table, or that start with a table, missed table extraction. The routing logic is now fixed.

 ## 0.16.18-dev1

--- a/example-docs/pdf/single_table.pdf
+++ b/example-docs/pdf/single_table.pdf
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@ -16,9 +16,13 @@ from unstructured_inference.inference.layoutelement import (
    LayoutElements,
 )

+from test_unstructured.unit_utils import example_doc_path
 from unstructured.documents.elements import ElementType
 from unstructured.partition.pdf_image import ocr
-from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes
+from unstructured.partition.pdf_image.pdf_image_utils import (
+    convert_pdf_to_images,
+    pad_element_bboxes,
+)
 from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import (
    Source,
@ -436,6 +440,28 @@ def mock_ocr_layout():
    )


+def test_supplement_element_with_table_extraction():
+    from unstructured_inference.models import tables
+
+    tables.load_agent()
+
+    image = next(convert_pdf_to_images(example_doc_path("pdf/single_table.pdf")))
+    elements = LayoutElements(
+        element_coords=np.array([[215.00109863, 731.89996338, 1470.07739258, 972.83129883]]),
+        texts=np.array(["foo"]),
+        sources=np.array(["yolox_sg"]),
+        element_class_ids=np.array([0]),
+        element_class_id_map={0: "Table"},
+    )
+    supplemented = ocr.supplement_element_with_table_extraction(
+        elements=elements,
+        image=image,
+        tables_agent=tables.tables_agent,
+        ocr_agent=ocr.OCRAgent.get_agent(language="eng"),
+    )
+    assert supplemented.text_as_html[0].startswith("<table>")
+
+
 def test_get_table_tokens(mock_ocr_layout):
    with patch.object(OCRAgentTesseract, "get_layout_from_image", return_value=mock_ocr_layout):
        ocr_agent = OCRAgent.get_agent(language="eng")
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@ -276,7 +276,7 @@ def supplement_element_with_table_extraction(
    from unstructured_inference.models.tables import cells_to_html

    table_id = {v: k for k, v in elements.element_class_id_map.items()}.get(ElementType.TABLE)
-    if not table_id:
+    if table_id is None:
        # no table found in this page
        return elements