diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e288dac0..74511a5e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features ### Fixes +- **Fix a bug where table extraction is skipped when it shouldn't be**. Pages whose only content is a single table, or that start with a table, were missing table extraction. The routing logic is now fixed. ## 0.16.18-dev1 diff --git a/example-docs/pdf/single_table.pdf b/example-docs/pdf/single_table.pdf new file mode 100644 index 000000000..55fdf336a Binary files /dev/null and b/example-docs/pdf/single_table.pdf differ diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index 28fa0493c..76d0e94a3 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -16,9 +16,13 @@ from unstructured_inference.inference.layoutelement import ( LayoutElements, ) +from test_unstructured.unit_utils import example_doc_path from unstructured.documents.elements import ElementType from unstructured.partition.pdf_image import ocr -from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes +from unstructured.partition.pdf_image.pdf_image_utils import ( + convert_pdf_to_images, + pad_element_bboxes, +) from unstructured.partition.utils.config import env_config from unstructured.partition.utils.constants import ( Source, @@ -436,6 +440,28 @@ def mock_ocr_layout(): ) +def test_supplement_element_with_table_extraction(): + from unstructured_inference.models import tables + + tables.load_agent() + + image = next(convert_pdf_to_images(example_doc_path("pdf/single_table.pdf"))) + elements = LayoutElements( + element_coords=np.array([[215.00109863, 731.89996338, 1470.07739258, 972.83129883]]), + texts=np.array(["foo"]), + sources=np.array(["yolox_sg"]), + element_class_ids=np.array([0]), + element_class_id_map={0: "Table"}, + ) + supplemented = ocr.supplement_element_with_table_extraction( + elements=elements, 
image=image, + tables_agent=tables.tables_agent, + ocr_agent=ocr.OCRAgent.get_agent(language="eng"), + ) + assert supplemented.text_as_html[0].startswith("