Check chipper version and prevent running pdfminer with chipper (#2347)

We have added a new version of Chipper (Chipperv3), which requires unstructured to work effectively with all the current Chipper versions. This implies resizing images to the appropriate resolution and making sure that Chipper elements are not sorted by unstructured. In addition, it seems that PDFMiner is being called when calling Chipper, which adds repeated elements from Chipper and PDFMiner. To evaluate this PR, you can test the code below with the attached PDF. The code writes a JSON file with the generated elements. The output can be examined with `cat out.un.json | python -m json.tool`. There are three things to check: 1. The size of the image passed to Chipper, which can be identified in the layout_height and layout_width attributes and should have the values 3301 and 2550, as shown in the example below: ``` [ { "element_id": "c0493a7872f227e4172c4192c5f48a06", "metadata": { "coordinates": { "layout_height": 3301, "layout_width": 2550, ``` 2. There should be no repeated elements. 3. The order should be closer to reading order. The script to run Chipper from unstructured is: ``` from unstructured import __version__ print(__version__.__version__) import json from unstructured.partition.auto import partition from unstructured.staging.base import elements_to_json elements = json.loads(elements_to_json(partition("Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf", strategy="hi_res", model_name="chipperv3"))) with open('out.un.json', 'w') as w: json.dump(elements, w) ``` [Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf](https://github.com/Unstructured-IO/unstructured/files/13817273/Huang_Improving_Table_Structure_Recognition_With_Visual-Alignment_Sequential_Coordinate_Modeling_CVPR_2023_paper-p6.pdf) --------- Co-authored-by: Antonio Jimeno Yepes <antonio@unstructured.io>
2025-11-03 19:43:24 +00:00 · 2024-01-25 13:33:32 +11:00 · 2024-01-25 13:33:32 +11:00 · d8b3bdb919
commit d8b3bdb919
parent 4613e52e11
4 changed files with 56 additions and 40 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.12.3-dev4
+## 0.12.3-dev5

 ### Enhancements

@ -10,6 +10,7 @@
 * **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI.  Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service.

 ### Fixes
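+* **Fix support for different Chipper versions and prevent running PDFMiner with Chipper.** PDFMiner is no longer run when a Chipper model is used, which avoids repeated elements in the output.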
 * **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those
  files as text.
 * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector.
--- a/test_unstructured/partition/pdf_image/test_chipper.py
+++ b/test_unstructured/partition/pdf_image/test_chipper.py
@ -30,3 +30,13 @@ def test_chipper_not_losing_parents(chipper_results, chipper_children):
        [el for el in chipper_results if el.id == child.metadata.parent_id]
        for child in chipper_children
    )
+
+
+def chipper_test_pdfminer_repeated(chipper_results):
+    """
+    Check that no element text repeats, i.e. PDFMiner was not run together with Chipper.
+    """
+    elements = chipper_results  # NOTE(review): name lacks `test_` prefix; confirm pytest runs it
+    assert len([element.text for element in elements]) == len(
+        {element.text for element in elements}
+    )
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.12.3-dev4"  # pragma: no cover
+__version__ = "0.12.3-dev5"  # pragma: no cover
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -298,8 +298,8 @@ def _partition_pdf_or_image_local(
        hi_res_model_name or model_name or default_hi_res_model(infer_table_structure)
    )
    if pdf_image_dpi is None:
-        pdf_image_dpi = 300 if hi_res_model_name == "chipper" else 200
-    if (pdf_image_dpi < 300) and (hi_res_model_name == "chipper"):
+        pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
+    if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
        logger.warning(
            "The Chipper model performs better when images are rendered with DPI >= 300 "
            f"(currently {pdf_image_dpi}).",
@ -313,32 +313,33 @@ def _partition_pdf_or_image_local(
            pdf_image_dpi=pdf_image_dpi,
        )

-        extracted_layout = (
-            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
-            if pdf_text_extractable
-            else []
-        )
-
-        if analysis:
-            annotate_layout_elements(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                filename=filename,
-                output_dir_path=analyzed_image_output_dir_path,
-                pdf_image_dpi=pdf_image_dpi,
-                is_image=is_image,
-            )
-
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )
-
        if hi_res_model_name.startswith("chipper"):
            # NOTE(alan): We shouldn't do OCR with chipper
-            final_document_layout = merged_document_layout
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
+            final_document_layout = inferred_document_layout
        else:
+            extracted_layout = (
+                process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )
+
+            if analysis:
+                annotate_layout_elements(
+                    inferred_document_layout=inferred_document_layout,
+                    extracted_layout=extracted_layout,
+                    filename=filename,
+                    output_dir_path=analyzed_image_output_dir_path,
+                    pdf_image_dpi=pdf_image_dpi,
+                    is_image=is_image,
+                )
+
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )
+
            final_document_layout = process_file_with_ocr(
                filename,
                merged_document_layout,
@ -355,23 +356,27 @@ def _partition_pdf_or_image_local(
            model_name=hi_res_model_name,
            pdf_image_dpi=pdf_image_dpi,
        )
-        if hasattr(file, "seek"):
-            file.seek(0)
-
-        extracted_layout = (
-            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
-        )
-
-        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-        merged_document_layout = merge_inferred_with_extracted_layout(
-            inferred_document_layout=inferred_document_layout,
-            extracted_layout=extracted_layout,
-        )

        if hi_res_model_name.startswith("chipper"):
            # NOTE(alan): We shouldn't do OCR with chipper
+            # NOTE(antonio): We shouldn't do PDFMiner with chipper
            final_document_layout = merged_document_layout
        else:
+            if hasattr(file, "seek"):
+                file.seek(0)
+
+            extracted_layout = (
+                process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
+                if pdf_text_extractable
+                else []
+            )
+
+            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+            merged_document_layout = merge_inferred_with_extracted_layout(
+                inferred_document_layout=inferred_document_layout,
+                extracted_layout=extracted_layout,
+            )
+
            if hasattr(file, "seek"):
                file.seek(0)
            final_document_layout = process_data_with_ocr(
@ -385,7 +390,7 @@ def _partition_pdf_or_image_local(
            )

    # NOTE(alan): starting with v2, chipper sorts the elements itself.
-    if hi_res_model_name == "chipper":
+    if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
        kwargs["sort_mode"] = SORT_MODE_DONT

    final_document_layout = clean_pdfminer_inner_elements(final_document_layout)