mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Remove unsupported chipper model (#3728)
The chipper model is no longer supported.
This commit is contained in:
parent
1eceac26c8
commit
b092d45816
35
.github/workflows/ci.yml
vendored
35
.github/workflows/ci.yml
vendored
@ -153,41 +153,6 @@ jobs:
|
||||
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||
make check-coverage
|
||||
|
||||
test_chipper:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.10"]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
needs: [setup, lint]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Setup virtual environment
|
||||
uses: ./.github/actions/base-cache
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Test
|
||||
env:
|
||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
||||
PYTHON: python${{ matrix.python-version }}
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y poppler-utils
|
||||
make install-pandoc install-test
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||
|
||||
test_unit_no_extras:
|
||||
strategy:
|
||||
matrix:
|
||||
|
@ -1,4 +1,4 @@
|
||||
## 0.16.1-dev0
|
||||
## 0.16.1-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Remove unsupported chipper model**
|
||||
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
|
||||
|
||||
## 0.16.0
|
||||
|
9
Makefile
9
Makefile
@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
|
||||
.PHONY: test
|
||||
test:
|
||||
PYTHONPATH=. CI=$(CI) \
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||
|
||||
.PHONY: test-chipper
|
||||
test-chipper:
|
||||
PYTHONPATH=. CI=$(CI) \
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||
|
||||
.PHONY: test-unstructured-api-unit
|
||||
test-unstructured-api-unit:
|
||||
@ -309,7 +304,7 @@ docker-test:
|
||||
$(DOCKER_IMAGE) \
|
||||
bash -c "CI=$(CI) \
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
|
||||
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
||||
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
||||
|
||||
.PHONY: docker-smoke-test
|
||||
docker-smoke-test:
|
||||
|
@ -15,8 +15,6 @@ filterwarnings =
|
||||
ignore::DeprecationWarning
|
||||
python_classes = Test Describe
|
||||
python_functions = test_ it_ they_ but_ and_
|
||||
markers =
|
||||
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
|
||||
testpaths =
|
||||
test_unstructured
|
||||
test_unstructured_ingest
|
||||
|
@ -1,43 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from test_unstructured.unit_utils import example_doc_path
|
||||
from unstructured.partition import pdf
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def chipper_results():
    """Elements produced by partitioning the sample PDF with the chipper model.

    The fixture is session-scoped, so the partitioning runs once and the result is shared by
    every test that requests it.
    """
    sample_pdf = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    return pdf.partition_pdf(
        filename=sample_pdf,
        strategy=PartitionStrategy.HI_RES,
        model_name="chipper",
    )
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def chipper_children(chipper_results):
    """Subset of the chipper elements whose metadata names a parent element."""
    return [
        element
        for element in chipper_results
        if element.metadata.parent_id is not None
    ]
|
||||
|
||||
|
||||
@pytest.mark.chipper()
def test_chipper_has_hierarchy(chipper_children):
    """The chipper output must contain at least one element that has a parent."""
    has_children = bool(chipper_children)
    assert has_children
|
||||
|
||||
|
||||
@pytest.mark.chipper()
def test_chipper_not_losing_parents(chipper_results, chipper_children):
    """Every child's ``parent_id`` must equal the ``id`` of some element in the results."""

    def parent_is_present(child):
        # A child is orphaned when no element in the full result set carries its parent id.
        return any(candidate.id == child.metadata.parent_id for candidate in chipper_results)

    assert all(parent_is_present(child) for child in chipper_children)
|
||||
|
||||
|
||||
def chipper_test_pdfminer_repeated(chipper_results):
    """Check that no two chipper elements carry the same text.

    Repeated texts would indicate that PDFMiner has been run together with Chipper.

    NOTE(review): the name does not start with ``test_``, so pytest presumably never collects
    this function -- confirm whether that is intentional before renaming it.
    """
    texts = [element.text for element in chipper_results]
    # Collapsing to a set drops duplicates; equal sizes mean every text is unique.
    assert len(texts) == len(set(texts))
|
@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var(
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
|
||||
@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
|
||||
def test_partition_pdf_with_model_name(
|
||||
monkeypatch,
|
||||
model_name,
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.16.1-dev0" # pragma: no cover
|
||||
__version__ = "0.16.1-dev1" # pragma: no cover
|
||||
|
@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(
|
||||
|
||||
hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
|
||||
if pdf_image_dpi is None:
|
||||
pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
|
||||
if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
|
||||
logger.warning(
|
||||
"The Chipper model performs better when images are rendered with DPI >= 300 "
|
||||
f"(currently {pdf_image_dpi}).",
|
||||
)
|
||||
pdf_image_dpi = 200
|
||||
|
||||
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
|
||||
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
|
||||
@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
)
|
||||
|
||||
if hi_res_model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
# NOTE(antonio): We shouldn't do PDFMiner with chipper
|
||||
final_document_layout = inferred_document_layout
|
||||
else:
|
||||
extracted_layout = (
|
||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
||||
if pdf_text_extractable
|
||||
else []
|
||||
)
|
||||
extracted_layout = (
|
||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
||||
if pdf_text_extractable
|
||||
else []
|
||||
)
|
||||
|
||||
if analysis:
|
||||
if not analyzed_image_output_dir_path:
|
||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||
analyzed_image_output_dir_path = str(
|
||||
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||
)
|
||||
else:
|
||||
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
|
||||
if not skip_analysis_dump:
|
||||
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
||||
layout=inferred_document_layout,
|
||||
model_name=hi_res_model_name,
|
||||
if analysis:
|
||||
if not analyzed_image_output_dir_path:
|
||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||
analyzed_image_output_dir_path = str(
|
||||
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||
)
|
||||
extracted_layout_dumper = ExtractedLayoutDumper(
|
||||
layout=extracted_layout,
|
||||
)
|
||||
ocr_layout_dumper = OCRLayoutDumper()
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
)
|
||||
else:
|
||||
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
|
||||
if not skip_analysis_dump:
|
||||
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
||||
layout=inferred_document_layout,
|
||||
model_name=hi_res_model_name,
|
||||
)
|
||||
extracted_layout_dumper = ExtractedLayoutDumper(
|
||||
layout=extracted_layout,
|
||||
)
|
||||
ocr_layout_dumper = OCRLayoutDumper()
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
)
|
||||
|
||||
final_document_layout = process_file_with_ocr(
|
||||
filename,
|
||||
merged_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
is_image=is_image,
|
||||
infer_table_structure=infer_table_structure,
|
||||
ocr_languages=ocr_languages,
|
||||
ocr_mode=ocr_mode,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
ocr_layout_dumper=ocr_layout_dumper,
|
||||
)
|
||||
final_document_layout = process_file_with_ocr(
|
||||
filename,
|
||||
merged_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
is_image=is_image,
|
||||
infer_table_structure=infer_table_structure,
|
||||
ocr_languages=ocr_languages,
|
||||
ocr_mode=ocr_mode,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
ocr_layout_dumper=ocr_layout_dumper,
|
||||
)
|
||||
else:
|
||||
inferred_document_layout = process_data_with_model(
|
||||
file,
|
||||
@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
)
|
||||
|
||||
if hi_res_model_name.startswith("chipper"):
|
||||
# NOTE(alan): We shouldn't do OCR with chipper
|
||||
# NOTE(antonio): We shouldn't do PDFMiner with chipper
|
||||
final_document_layout = inferred_document_layout
|
||||
else:
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
|
||||
extracted_layout = (
|
||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
|
||||
if pdf_text_extractable
|
||||
else []
|
||||
)
|
||||
extracted_layout = (
|
||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
|
||||
)
|
||||
|
||||
if analysis:
|
||||
if not analyzed_image_output_dir_path:
|
||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||
analyzed_image_output_dir_path = str(
|
||||
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||
)
|
||||
else:
|
||||
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||
if not skip_analysis_dump:
|
||||
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
||||
layout=inferred_document_layout,
|
||||
model_name=hi_res_model_name,
|
||||
if analysis:
|
||||
if not analyzed_image_output_dir_path:
|
||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||
analyzed_image_output_dir_path = str(
|
||||
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||
)
|
||||
extracted_layout_dumper = ExtractedLayoutDumper(
|
||||
layout=extracted_layout,
|
||||
)
|
||||
ocr_layout_dumper = OCRLayoutDumper()
|
||||
else:
|
||||
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||
if not skip_analysis_dump:
|
||||
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
||||
layout=inferred_document_layout,
|
||||
model_name=hi_res_model_name,
|
||||
)
|
||||
extracted_layout_dumper = ExtractedLayoutDumper(
|
||||
layout=extracted_layout,
|
||||
)
|
||||
ocr_layout_dumper = OCRLayoutDumper()
|
||||
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
)
|
||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||
inferred_document_layout=inferred_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
)
|
||||
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
final_document_layout = process_data_with_ocr(
|
||||
file,
|
||||
merged_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
is_image=is_image,
|
||||
infer_table_structure=infer_table_structure,
|
||||
ocr_languages=ocr_languages,
|
||||
ocr_mode=ocr_mode,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
ocr_layout_dumper=ocr_layout_dumper,
|
||||
)
|
||||
|
||||
# NOTE(alan): starting with v2, chipper sorts the elements itself.
|
||||
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
|
||||
kwargs["sort_mode"] = SORT_MODE_DONT
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
final_document_layout = process_data_with_ocr(
|
||||
file,
|
||||
merged_document_layout,
|
||||
extracted_layout=extracted_layout,
|
||||
is_image=is_image,
|
||||
infer_table_structure=infer_table_structure,
|
||||
ocr_languages=ocr_languages,
|
||||
ocr_mode=ocr_mode,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
ocr_layout_dumper=ocr_layout_dumper,
|
||||
)
|
||||
|
||||
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
|
||||
|
||||
@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
|
||||
" ",
|
||||
el.text or "",
|
||||
).strip()
|
||||
# NOTE(alan): with chipper there are parent elements with no text we don't want to
|
||||
# filter those out and leave the children orphaned.
|
||||
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
|
||||
if el.text or isinstance(el, PageBreak):
|
||||
out_elements.append(cast(Element, el))
|
||||
|
||||
if extract_forms:
|
||||
|
@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout(
|
||||
)
|
||||
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
|
||||
|
||||
# If the model is a chipper model, we don't want to order the
|
||||
# elements, as they are already ordered
|
||||
order_elements = not hi_res_model_name.startswith("chipper")
|
||||
|
||||
inferred_pages = inferred_document_layout.pages
|
||||
for i, (inferred_page, extracted_page_layout) in enumerate(
|
||||
zip(inferred_pages, extracted_layout)
|
||||
@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout(
|
||||
**threshold_kwargs,
|
||||
)
|
||||
|
||||
if order_elements:
|
||||
merged_layout = sort_text_regions(
|
||||
cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
|
||||
)
|
||||
merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC)
|
||||
|
||||
elements = []
|
||||
for layout_el in merged_layout:
|
||||
|
Loading…
x
Reference in New Issue
Block a user