Remove unsupported chipper model (#3728)

The chipper model is no longer supported.
This commit is contained in:
Nathan Van Gheem 2024-10-17 13:40:45 -04:00 committed by GitHub
parent 1eceac26c8
commit b092d45816
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 88 additions and 202 deletions

View File

@ -153,41 +153,6 @@ jobs:
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
make check-coverage
test_chipper:
strategy:
matrix:
python-version: ["3.10"]
runs-on: ubuntu-latest
env:
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Setup virtual environment
uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
- name: Test
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
PYTHON: python${{ matrix.python-version }}
NLTK_DATA: ${{ github.workspace }}/nltk_data
run: |
source .venv/bin/activate
sudo apt-get update
sudo apt-get install -y poppler-utils
make install-pandoc install-test
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
test_unit_no_extras:
strategy:
matrix:

View File

@ -1,4 +1,4 @@
## 0.16.1-dev0
## 0.16.1-dev1
### Enhancements
@ -6,6 +6,7 @@
### Fixes
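* **Remove unsupported chipper model.** The chipper model is no longer supported, so its dedicated tests, CI job, and special-case code paths have been removed.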
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
## 0.16.0

View File

@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
.PHONY: test
test:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
.PHONY: test-chipper
test-chipper:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
@ -309,7 +304,7 @@ docker-test:
$(DOCKER_IMAGE) \
bash -c "CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
.PHONY: docker-smoke-test
docker-smoke-test:

View File

@ -15,8 +15,6 @@ filterwarnings =
ignore::DeprecationWarning
python_classes = Test Describe
python_functions = test_ it_ they_ but_ and_
markers =
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
testpaths =
test_unstructured
test_unstructured_ingest

View File

@ -1,43 +0,0 @@
import pytest
from test_unstructured.unit_utils import example_doc_path
from unstructured.partition import pdf
from unstructured.partition.utils.constants import PartitionStrategy
@pytest.fixture(scope="session")
def chipper_results():
    """Partition the sample layout-parser PDF with the chipper model.

    The fixture is session-scoped, so the partitioning call runs once and the
    resulting elements are reused by every test that requests it.
    """
    sample_pdf = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    return pdf.partition_pdf(
        filename=sample_pdf,
        strategy=PartitionStrategy.HI_RES,
        model_name="chipper",
    )
@pytest.fixture(scope="session")
def chipper_children(chipper_results):
    """Select the chipper elements whose metadata carries a `parent_id`."""
    children = []
    for element in chipper_results:
        # Elements without a parent reference are top-level and are skipped.
        if element.metadata.parent_id is not None:
            children.append(element)
    return children
@pytest.mark.chipper()
def test_chipper_has_hierarchy(chipper_children):
    """At least one chipper element must reference a parent element."""
    found_any_child = bool(chipper_children)
    assert found_any_child
@pytest.mark.chipper()
def test_chipper_not_losing_parents(chipper_results, chipper_children):
    """Every child's `parent_id` must match the `id` of some element in the results.

    Args:
        chipper_results: all elements produced by the chipper partitioning run.
        chipper_children: the subset of those elements that have a `parent_id`.
    """
    # Build the id lookup once instead of rescanning every result for each child.
    # Assumes element ids are hashable -- TODO confirm for non-string id types.
    known_ids = {el.id for el in chipper_results}
    # Collecting the orphans (rather than a bare `all(...)`) makes a failing
    # assertion show exactly which children lost their parent.
    orphans = [child for child in chipper_children if child.metadata.parent_id not in known_ids]
    assert not orphans
def chipper_test_pdfminer_repeated(chipper_results):
    """Check that no two chipper elements share the same text.

    Repeated text would indicate that PDFMiner has been run together with Chipper.
    """
    # NOTE(review): the name lacks a `test_` prefix, so pytest's discovery would
    # presumably not collect this function -- confirm whether that is intended.
    texts = [element.text for element in chipper_results]
    unique_texts = set(texts)
    assert len(texts) == len(unique_texts)

View File

@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var(
assert mock_process.call_args[1]["model_name"] == "checkbox"
@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
def test_partition_pdf_with_model_name(
monkeypatch,
model_name,

View File

@ -1 +1 @@
__version__ = "0.16.1-dev0" # pragma: no cover
__version__ = "0.16.1-dev1" # pragma: no cover

View File

@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(
hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
if pdf_image_dpi is None:
pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
logger.warning(
"The Chipper model performs better when images are rendered with DPI >= 300 "
f"(currently {pdf_image_dpi}).",
)
pdf_image_dpi = 200
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)
if hi_res_model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
# NOTE(antonio): We shouldn't do PDFMiner with chipper
final_document_layout = inferred_document_layout
else:
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)
final_document_layout = process_file_with_ocr(
filename,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
final_document_layout = process_file_with_ocr(
filename,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
else:
inferred_document_layout = process_data_with_model(
file,
@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)
if hi_res_model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
# NOTE(antonio): We shouldn't do PDFMiner with chipper
final_document_layout = inferred_document_layout
else:
if hasattr(file, "seek"):
file.seek(0)
if hasattr(file, "seek"):
file.seek(0)
extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
)
if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)
if hasattr(file, "seek"):
file.seek(0)
final_document_layout = process_data_with_ocr(
file,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
# NOTE(alan): starting with v2, chipper sorts the elements itself.
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
kwargs["sort_mode"] = SORT_MODE_DONT
if hasattr(file, "seek"):
file.seek(0)
final_document_layout = process_data_with_ocr(
file,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
" ",
el.text or "",
).strip()
# NOTE(alan): with chipper there are parent elements with no text we don't want to
# filter those out and leave the children orphaned.
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
if el.text or isinstance(el, PageBreak):
out_elements.append(cast(Element, el))
if extract_forms:

View File

@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout(
)
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
# If the model is a chipper model, we don't want to order the
# elements, as they are already ordered
order_elements = not hi_res_model_name.startswith("chipper")
inferred_pages = inferred_document_layout.pages
for i, (inferred_page, extracted_page_layout) in enumerate(
zip(inferred_pages, extracted_layout)
@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout(
**threshold_kwargs,
)
if order_elements:
merged_layout = sort_text_regions(
cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
)
merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC)
elements = []
for layout_el in merged_layout: