diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 12c261ffb..81afe54c5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,41 +153,6 @@ jobs: make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true make check-coverage - test_chipper: - strategy: - matrix: - python-version: ["3.10"] - runs-on: ubuntu-latest - env: - UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} - NLTK_DATA: ${{ github.workspace }}/nltk_data - needs: [setup, lint] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Setup virtual environment - uses: ./.github/actions/base-cache - with: - python-version: ${{ matrix.python-version }} - - name: Test - env: - UNS_API_KEY: ${{ secrets.UNS_API_KEY }} - PYTHON: python${{ matrix.python-version }} - NLTK_DATA: ${{ github.workspace }}/nltk_data - run: | - source .venv/bin/activate - sudo apt-get update - sudo apt-get install -y poppler-utils - make install-pandoc install-test - sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 - sudo apt-get update - sudo apt-get install -y tesseract-ocr tesseract-ocr-kor - tesseract --version - make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true - test_unit_no_extras: strategy: matrix: diff --git a/CHANGELOG.md b/CHANGELOG.md index 46862fbdf..1d180ec7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.1-dev0 +## 0.16.1-dev1 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* **Remove unsupported chipper model** * **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process. ## 0.16.0 diff --git a/Makefile b/Makefile index 714992a83..c8bdc25c0 100644 --- a/Makefile +++ b/Makefile @@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false .PHONY: test test: PYTHONPATH=. CI=$(CI) \ - UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 - -.PHONY: test-chipper -test-chipper: - PYTHONPATH=. CI=$(CI) \ - UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 + UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40 .PHONY: test-unstructured-api-unit test-unstructured-api-unit: @@ -309,7 +304,7 @@ docker-test: $(DOCKER_IMAGE) \ bash -c "CI=$(CI) \ UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \ - pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)" + pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)" .PHONY: docker-smoke-test docker-smoke-test: diff --git a/setup.cfg b/setup.cfg index 5b619b7a9..ced1baaa6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,8 +15,6 @@ filterwarnings = ignore::DeprecationWarning python_classes = Test Describe python_functions = test_ it_ they_ but_ and_ -markers = - chipper: mark a test as running chipper, which tends to be slow and compute-heavy. testpaths = test_unstructured test_unstructured_ingest diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py deleted file mode 100644 index 03cc610ea..000000000 --- a/test_unstructured/partition/pdf_image/test_chipper.py +++ /dev/null @@ -1,43 +0,0 @@ -import pytest - -from test_unstructured.unit_utils import example_doc_path -from unstructured.partition import pdf -from unstructured.partition.utils.constants import PartitionStrategy - - -@pytest.fixture(scope="session") -def chipper_results(): - elements = pdf.partition_pdf( - filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), - strategy=PartitionStrategy.HI_RES, - model_name="chipper", - ) - return elements - - -@pytest.fixture(scope="session") -def chipper_children(chipper_results): - return [el for el in chipper_results if el.metadata.parent_id is not None] - - -@pytest.mark.chipper() -def test_chipper_has_hierarchy(chipper_children): - assert chipper_children - - -@pytest.mark.chipper() -def test_chipper_not_losing_parents(chipper_results, chipper_children): - assert all( - [el for el in chipper_results if el.id == child.metadata.parent_id] - for child in chipper_children - ) - - -def chipper_test_pdfminer_repeated(chipper_results): - """ - Test to verify that PDFMiner has not been run together with Chipper - """ - elements = chipper_results - assert len([element.text for element in elements]) == len( - {element.text for element in elements} - ) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index afab53cb4..4362f06bb 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var( assert mock_process.call_args[1]["model_name"] == "checkbox" -@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"]) +@pytest.mark.parametrize("model_name", ["checkbox", "yolox"]) def test_partition_pdf_with_model_name( monkeypatch, model_name, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d0af64910..16e6dacb9 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.1-dev0" # pragma: no cover +__version__ = "0.16.1-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 1f8300ce3..e7b5b516f 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -566,12 +566,7 @@ def _partition_pdf_or_image_local( hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model() if pdf_image_dpi is None: - pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200 - if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")): - logger.warning( - "The Chipper model performs better when images are rendered with DPI >= 300 " - f"(currently {pdf_image_dpi}).", - ) + pdf_image_dpi = 200 od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None @@ -588,53 +583,48 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) - if hi_res_model_name.startswith("chipper"): - # NOTE(alan): We shouldn't do OCR with chipper - # NOTE(antonio): We shouldn't do PDFMiner with chipper - final_document_layout = inferred_document_layout - else: - extracted_layout = ( - process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) - if pdf_text_extractable - else [] - ) + extracted_layout = ( + process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) + if pdf_text_extractable + else [] + ) - if analysis: - if not analyzed_image_output_dir_path: - if env_config.GLOBAL_WORKING_DIR_ENABLED: - analyzed_image_output_dir_path = str( - Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated" - ) - else: - analyzed_image_output_dir_path = str(Path.cwd() / "annotated") - os.makedirs(analyzed_image_output_dir_path, exist_ok=True) - if not skip_analysis_dump: - od_model_layout_dumper = ObjectDetectionLayoutDumper( - layout=inferred_document_layout, - model_name=hi_res_model_name, + if analysis: + if not analyzed_image_output_dir_path: + if env_config.GLOBAL_WORKING_DIR_ENABLED: + analyzed_image_output_dir_path = str( + Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated" ) - extracted_layout_dumper = ExtractedLayoutDumper( - layout=extracted_layout, - ) - ocr_layout_dumper = OCRLayoutDumper() - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = merge_inferred_with_extracted_layout( - inferred_document_layout=inferred_document_layout, - extracted_layout=extracted_layout, - hi_res_model_name=hi_res_model_name, - ) + else: + analyzed_image_output_dir_path = str(Path.cwd() / "annotated") + os.makedirs(analyzed_image_output_dir_path, exist_ok=True) + if not skip_analysis_dump: + od_model_layout_dumper = ObjectDetectionLayoutDumper( + layout=inferred_document_layout, + model_name=hi_res_model_name, + ) + extracted_layout_dumper = ExtractedLayoutDumper( + layout=extracted_layout, + ) + ocr_layout_dumper = OCRLayoutDumper() + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + hi_res_model_name=hi_res_model_name, + ) - final_document_layout = process_file_with_ocr( - filename, - merged_document_layout, - extracted_layout=extracted_layout, - is_image=is_image, - infer_table_structure=infer_table_structure, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ocr_layout_dumper=ocr_layout_dumper, - ) + final_document_layout = process_file_with_ocr( + filename, + merged_document_layout, + extracted_layout=extracted_layout, + is_image=is_image, + infer_table_structure=infer_table_structure, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ocr_layout_dumper=ocr_layout_dumper, + ) else: inferred_document_layout = process_data_with_model( file, @@ -643,62 +633,51 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) - if hi_res_model_name.startswith("chipper"): - # NOTE(alan): We shouldn't do OCR with chipper - # NOTE(antonio): We shouldn't do PDFMiner with chipper - final_document_layout = inferred_document_layout - else: - if hasattr(file, "seek"): - file.seek(0) + if hasattr(file, "seek"): + file.seek(0) - extracted_layout = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) - if pdf_text_extractable - else [] - ) + extracted_layout = ( + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else [] + ) - if analysis: - if not analyzed_image_output_dir_path: - if env_config.GLOBAL_WORKING_DIR_ENABLED: - analyzed_image_output_dir_path = str( - Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated" - ) - else: - analyzed_image_output_dir_path = str(Path.cwd() / "annotated") - if not skip_analysis_dump: - od_model_layout_dumper = ObjectDetectionLayoutDumper( - layout=inferred_document_layout, - model_name=hi_res_model_name, + if analysis: + if not analyzed_image_output_dir_path: + if env_config.GLOBAL_WORKING_DIR_ENABLED: + analyzed_image_output_dir_path = str( + Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated" ) - extracted_layout_dumper = ExtractedLayoutDumper( - layout=extracted_layout, - ) - ocr_layout_dumper = OCRLayoutDumper() + else: + analyzed_image_output_dir_path = str(Path.cwd() / "annotated") + if not skip_analysis_dump: + od_model_layout_dumper = ObjectDetectionLayoutDumper( + layout=inferred_document_layout, + model_name=hi_res_model_name, + ) + extracted_layout_dumper = ExtractedLayoutDumper( + layout=extracted_layout, + ) + ocr_layout_dumper = OCRLayoutDumper() - # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout - merged_document_layout = merge_inferred_with_extracted_layout( - inferred_document_layout=inferred_document_layout, - extracted_layout=extracted_layout, - hi_res_model_name=hi_res_model_name, - ) + # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout + merged_document_layout = merge_inferred_with_extracted_layout( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + hi_res_model_name=hi_res_model_name, + ) - if hasattr(file, "seek"): - file.seek(0) - final_document_layout = process_data_with_ocr( - file, - merged_document_layout, - extracted_layout=extracted_layout, - is_image=is_image, - infer_table_structure=infer_table_structure, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ocr_layout_dumper=ocr_layout_dumper, - ) - - # NOTE(alan): starting with v2, chipper sorts the elements itself. - if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1": - kwargs["sort_mode"] = SORT_MODE_DONT + if hasattr(file, "seek"): + file.seek(0) + final_document_layout = process_data_with_ocr( + file, + merged_document_layout, + extracted_layout=extracted_layout, + is_image=is_image, + infer_table_structure=infer_table_structure, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ocr_layout_dumper=ocr_layout_dumper, + ) final_document_layout = clean_pdfminer_inner_elements(final_document_layout) @@ -766,9 +745,7 @@ def _partition_pdf_or_image_local( " ", el.text or "", ).strip() - # NOTE(alan): with chipper there are parent elements with no text we don't want to - # filter those out and leave the children orphaned. - if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"): + if el.text or isinstance(el, PageBreak): out_elements.append(cast(Element, el)) if extract_forms: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 0836292c8..59ea351fc 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout( ) from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel - # If the model is a chipper model, we don't want to order the - # elements, as they are already ordered - order_elements = not hi_res_model_name.startswith("chipper") - inferred_pages = inferred_document_layout.pages for i, (inferred_page, extracted_page_layout) in enumerate( zip(inferred_pages, extracted_layout) @@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout( **threshold_kwargs, ) - if order_elements: - merged_layout = sort_text_regions( - cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC - ) + merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC) elements = [] for layout_el in merged_layout: