Remove unsupported chipper model (#3728)

The chipper model is no longer supported.
2025-11-02 11:03:38 +00:00 · 2024-10-17 13:40:45 -04:00 · 2024-10-17 13:40:45 -04:00 · b092d45816
commit b092d45816
parent 1eceac26c8
9 changed files with 88 additions and 202 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -153,41 +153,6 @@ jobs:
        make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
        make check-coverage

-  test_chipper:
-    strategy:
-      matrix:
-        python-version: ["3.10"]
-    runs-on: ubuntu-latest
-    env:
-      UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
-    needs: [setup, lint]
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Setup virtual environment
-      uses: ./.github/actions/base-cache
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Test
-      env:
-        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
-        PYTHON: python${{ matrix.python-version }}
-        NLTK_DATA: ${{ github.workspace }}/nltk_data
-      run: |
-        source .venv/bin/activate
-        sudo apt-get update
-        sudo apt-get install -y poppler-utils
-        make install-pandoc install-test
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get update
-        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
-        tesseract --version
-        make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
-
  test_unit_no_extras:
    strategy:
      matrix:
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.16.1-dev0
+## 0.16.1-dev1

 ### Enhancements

@ -6,6 +6,7 @@

 ### Fixes

+* **Remove unsupported chipper model**
 * **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.

 ## 0.16.0
--- a/Makefile
+++ b/Makefile
@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
 .PHONY: test
 test:
 	PYTHONPATH=. CI=$(CI) \
-	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
-
-.PHONY: test-chipper
-test-chipper:
-	PYTHONPATH=. CI=$(CI) \
-	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
+	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40

 .PHONY: test-unstructured-api-unit
 test-unstructured-api-unit:
@ -309,7 +304,7 @@ docker-test:
 	$(DOCKER_IMAGE) \
 	bash -c "CI=$(CI) \
 	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
-	pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
+	pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"

 .PHONY: docker-smoke-test
 docker-smoke-test:
--- a/setup.cfg
+++ b/setup.cfg
@ -15,8 +15,6 @@ filterwarnings =
    ignore::DeprecationWarning
 python_classes = Test Describe
 python_functions = test_ it_ they_ but_ and_
-markers =
-    chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
 testpaths =
    test_unstructured
    test_unstructured_ingest
--- a/test_unstructured/partition/pdf_image/test_chipper.py
+++ b/test_unstructured/partition/pdf_image/test_chipper.py
@ -1,43 +0,0 @@
-import pytest
-
-from test_unstructured.unit_utils import example_doc_path
-from unstructured.partition import pdf
-from unstructured.partition.utils.constants import PartitionStrategy
-
-
-@pytest.fixture(scope="session")
-def chipper_results():
-    elements = pdf.partition_pdf(
-        filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
-        strategy=PartitionStrategy.HI_RES,
-        model_name="chipper",
-    )
-    return elements
-
-
-@pytest.fixture(scope="session")
-def chipper_children(chipper_results):
-    return [el for el in chipper_results if el.metadata.parent_id is not None]
-
-
-@pytest.mark.chipper()
-def test_chipper_has_hierarchy(chipper_children):
-    assert chipper_children
-
-
-@pytest.mark.chipper()
-def test_chipper_not_losing_parents(chipper_results, chipper_children):
-    assert all(
-        [el for el in chipper_results if el.id == child.metadata.parent_id]
-        for child in chipper_children
-    )
-
-
-def chipper_test_pdfminer_repeated(chipper_results):
-    """
-    Test to verify that PDFMiner has not been run together with Chipper
-    """
-    elements = chipper_results
-    assert len([element.text for element in elements]) == len(
-        {element.text for element in elements}
-    )
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var(
        assert mock_process.call_args[1]["model_name"] == "checkbox"


-@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
+@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
 def test_partition_pdf_with_model_name(
    monkeypatch,
    model_name,
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.1-dev0"  # pragma: no cover
+__version__ = "0.16.1-dev1"  # pragma: no cover
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(

    hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
    if pdf_image_dpi is None:
-        pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
-    if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
-        logger.warning(
-            "The Chipper model performs better when images are rendered with DPI >= 300 "
-            f"(currently {pdf_image_dpi}).",
-        )
+        pdf_image_dpi = 200

    od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
    extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
            pdf_image_dpi=pdf_image_dpi,
        )

-        if hi_res_model_name.startswith("chipper"):
-            # NOTE(alan): We shouldn't do OCR with chipper
-            # NOTE(antonio): We shouldn't do PDFMiner with chipper
-            final_document_layout = inferred_document_layout
-        else:
-            extracted_layout = (
-                process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
-                if pdf_text_extractable
-                else []
-            )
+        extracted_layout = (
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            if pdf_text_extractable
+            else []
+        )

-            if analysis:
-                if not analyzed_image_output_dir_path:
-                    if env_config.GLOBAL_WORKING_DIR_ENABLED:
-                        analyzed_image_output_dir_path = str(
-                            Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
-                        )
-                    else:
-                        analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
-                os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
-                if not skip_analysis_dump:
-                    od_model_layout_dumper = ObjectDetectionLayoutDumper(
-                        layout=inferred_document_layout,
-                        model_name=hi_res_model_name,
+        if analysis:
+            if not analyzed_image_output_dir_path:
+                if env_config.GLOBAL_WORKING_DIR_ENABLED:
+                    analyzed_image_output_dir_path = str(
+                        Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
                    )
-                    extracted_layout_dumper = ExtractedLayoutDumper(
-                        layout=extracted_layout,
-                    )
-                    ocr_layout_dumper = OCRLayoutDumper()
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = merge_inferred_with_extracted_layout(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                hi_res_model_name=hi_res_model_name,
-            )
+                else:
+                    analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
+            os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
+            if not skip_analysis_dump:
+                od_model_layout_dumper = ObjectDetectionLayoutDumper(
+                    layout=inferred_document_layout,
+                    model_name=hi_res_model_name,
+                )
+                extracted_layout_dumper = ExtractedLayoutDumper(
+                    layout=extracted_layout,
+                )
+                ocr_layout_dumper = OCRLayoutDumper()
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+            hi_res_model_name=hi_res_model_name,
+        )

-            final_document_layout = process_file_with_ocr(
-                filename,
-                merged_document_layout,
-                extracted_layout=extracted_layout,
-                is_image=is_image,
-                infer_table_structure=infer_table_structure,
-                ocr_languages=ocr_languages,
-                ocr_mode=ocr_mode,
-                pdf_image_dpi=pdf_image_dpi,
-                ocr_layout_dumper=ocr_layout_dumper,
-            )
+        final_document_layout = process_file_with_ocr(
+            filename,
+            merged_document_layout,
+            extracted_layout=extracted_layout,
+            is_image=is_image,
+            infer_table_structure=infer_table_structure,
+            ocr_languages=ocr_languages,
+            ocr_mode=ocr_mode,
+            pdf_image_dpi=pdf_image_dpi,
+            ocr_layout_dumper=ocr_layout_dumper,
+        )
    else:
        inferred_document_layout = process_data_with_model(
            file,
@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
            pdf_image_dpi=pdf_image_dpi,
        )

-        if hi_res_model_name.startswith("chipper"):
-            # NOTE(alan): We shouldn't do OCR with chipper
-            # NOTE(antonio): We shouldn't do PDFMiner with chipper
-            final_document_layout = inferred_document_layout
-        else:
-            if hasattr(file, "seek"):
-                file.seek(0)
+        if hasattr(file, "seek"):
+            file.seek(0)

-            extracted_layout = (
-                process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
-                if pdf_text_extractable
-                else []
-            )
+        extracted_layout = (
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
+        )

-            if analysis:
-                if not analyzed_image_output_dir_path:
-                    if env_config.GLOBAL_WORKING_DIR_ENABLED:
-                        analyzed_image_output_dir_path = str(
-                            Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
-                        )
-                    else:
-                        analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
-                if not skip_analysis_dump:
-                    od_model_layout_dumper = ObjectDetectionLayoutDumper(
-                        layout=inferred_document_layout,
-                        model_name=hi_res_model_name,
+        if analysis:
+            if not analyzed_image_output_dir_path:
+                if env_config.GLOBAL_WORKING_DIR_ENABLED:
+                    analyzed_image_output_dir_path = str(
+                        Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
                    )
-                    extracted_layout_dumper = ExtractedLayoutDumper(
-                        layout=extracted_layout,
-                    )
-                    ocr_layout_dumper = OCRLayoutDumper()
+                else:
+                    analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
+            if not skip_analysis_dump:
+                od_model_layout_dumper = ObjectDetectionLayoutDumper(
+                    layout=inferred_document_layout,
+                    model_name=hi_res_model_name,
+                )
+                extracted_layout_dumper = ExtractedLayoutDumper(
+                    layout=extracted_layout,
+                )
+                ocr_layout_dumper = OCRLayoutDumper()

-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = merge_inferred_with_extracted_layout(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                hi_res_model_name=hi_res_model_name,
-            )
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+            hi_res_model_name=hi_res_model_name,
+        )

-            if hasattr(file, "seek"):
-                file.seek(0)
-            final_document_layout = process_data_with_ocr(
-                file,
-                merged_document_layout,
-                extracted_layout=extracted_layout,
-                is_image=is_image,
-                infer_table_structure=infer_table_structure,
-                ocr_languages=ocr_languages,
-                ocr_mode=ocr_mode,
-                pdf_image_dpi=pdf_image_dpi,
-                ocr_layout_dumper=ocr_layout_dumper,
-            )
-
-    # NOTE(alan): starting with v2, chipper sorts the elements itself.
-    if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
-        kwargs["sort_mode"] = SORT_MODE_DONT
+        if hasattr(file, "seek"):
+            file.seek(0)
+        final_document_layout = process_data_with_ocr(
+            file,
+            merged_document_layout,
+            extracted_layout=extracted_layout,
+            is_image=is_image,
+            infer_table_structure=infer_table_structure,
+            ocr_languages=ocr_languages,
+            ocr_mode=ocr_mode,
+            pdf_image_dpi=pdf_image_dpi,
+            ocr_layout_dumper=ocr_layout_dumper,
+        )

    final_document_layout = clean_pdfminer_inner_elements(final_document_layout)

@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
                " ",
                el.text or "",
            ).strip()
-            # NOTE(alan): with chipper there are parent elements with no text we don't want to
-            # filter those out and leave the children orphaned.
-            if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
+            if el.text or isinstance(el, PageBreak):
                out_elements.append(cast(Element, el))

    if extract_forms:
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout(
    )
    from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel

-    # If the model is a chipper model, we don't want to order the
-    # elements, as they are already ordered
-    order_elements = not hi_res_model_name.startswith("chipper")
-
    inferred_pages = inferred_document_layout.pages
    for i, (inferred_page, extracted_page_layout) in enumerate(
        zip(inferred_pages, extracted_layout)
@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout(
            **threshold_kwargs,
        )

-        if order_elements:
-            merged_layout = sort_text_regions(
-                cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
-            )
+        merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC)

        elements = []
        for layout_el in merged_layout: