mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-17 19:07:20 +00:00
Remove unsupported chipper model (#3728)
The chipper model is no longer supported.
This commit is contained in:
parent
1eceac26c8
commit
b092d45816
35
.github/workflows/ci.yml
vendored
35
.github/workflows/ci.yml
vendored
@ -153,41 +153,6 @@ jobs:
|
|||||||
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
||||||
make check-coverage
|
make check-coverage
|
||||||
|
|
||||||
test_chipper:
|
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
python-version: ["3.10"]
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
env:
|
|
||||||
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
|
||||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
|
||||||
needs: [setup, lint]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
- name: Setup virtual environment
|
|
||||||
uses: ./.github/actions/base-cache
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
- name: Test
|
|
||||||
env:
|
|
||||||
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
|
|
||||||
PYTHON: python${{ matrix.python-version }}
|
|
||||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
|
||||||
run: |
|
|
||||||
source .venv/bin/activate
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y poppler-utils
|
|
||||||
make install-pandoc install-test
|
|
||||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
|
||||||
tesseract --version
|
|
||||||
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
|
|
||||||
|
|
||||||
test_unit_no_extras:
|
test_unit_no_extras:
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
## 0.16.1-dev0
|
## 0.16.1-dev1
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
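* **Remove unsupported chipper model.** The chipper model is no longer supported, so the chipper-specific code paths, tests, and CI job have been removed.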
|
||||||
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
|
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
|
||||||
|
|
||||||
## 0.16.0
|
## 0.16.0
|
||||||
|
|||||||
9
Makefile
9
Makefile
@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
|
|||||||
.PHONY: test
|
.PHONY: test
|
||||||
test:
|
test:
|
||||||
PYTHONPATH=. CI=$(CI) \
|
PYTHONPATH=. CI=$(CI) \
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
||||||
|
|
||||||
.PHONY: test-chipper
|
|
||||||
test-chipper:
|
|
||||||
PYTHONPATH=. CI=$(CI) \
|
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
|
|
||||||
|
|
||||||
.PHONY: test-unstructured-api-unit
|
.PHONY: test-unstructured-api-unit
|
||||||
test-unstructured-api-unit:
|
test-unstructured-api-unit:
|
||||||
@ -309,7 +304,7 @@ docker-test:
|
|||||||
$(DOCKER_IMAGE) \
|
$(DOCKER_IMAGE) \
|
||||||
bash -c "CI=$(CI) \
|
bash -c "CI=$(CI) \
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
|
||||||
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
|
||||||
|
|
||||||
.PHONY: docker-smoke-test
|
.PHONY: docker-smoke-test
|
||||||
docker-smoke-test:
|
docker-smoke-test:
|
||||||
|
|||||||
@ -15,8 +15,6 @@ filterwarnings =
|
|||||||
ignore::DeprecationWarning
|
ignore::DeprecationWarning
|
||||||
python_classes = Test Describe
|
python_classes = Test Describe
|
||||||
python_functions = test_ it_ they_ but_ and_
|
python_functions = test_ it_ they_ but_ and_
|
||||||
markers =
|
|
||||||
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
|
|
||||||
testpaths =
|
testpaths =
|
||||||
test_unstructured
|
test_unstructured
|
||||||
test_unstructured_ingest
|
test_unstructured_ingest
|
||||||
|
|||||||
@ -1,43 +0,0 @@
|
|||||||
import pytest
|
|
||||||
|
|
||||||
from test_unstructured.unit_utils import example_doc_path
|
|
||||||
from unstructured.partition import pdf
|
|
||||||
from unstructured.partition.utils.constants import PartitionStrategy
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def chipper_results():
|
|
||||||
elements = pdf.partition_pdf(
|
|
||||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
|
||||||
strategy=PartitionStrategy.HI_RES,
|
|
||||||
model_name="chipper",
|
|
||||||
)
|
|
||||||
return elements
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def chipper_children(chipper_results):
|
|
||||||
return [el for el in chipper_results if el.metadata.parent_id is not None]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.chipper()
|
|
||||||
def test_chipper_has_hierarchy(chipper_children):
|
|
||||||
assert chipper_children
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.chipper()
|
|
||||||
def test_chipper_not_losing_parents(chipper_results, chipper_children):
|
|
||||||
assert all(
|
|
||||||
[el for el in chipper_results if el.id == child.metadata.parent_id]
|
|
||||||
for child in chipper_children
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def chipper_test_pdfminer_repeated(chipper_results):
|
|
||||||
"""
|
|
||||||
Test to verify that PDFMiner has not been run together with Chipper
|
|
||||||
"""
|
|
||||||
elements = chipper_results
|
|
||||||
assert len([element.text for element in elements]) == len(
|
|
||||||
{element.text for element in elements}
|
|
||||||
)
|
|
||||||
@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var(
|
|||||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
|
@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
|
||||||
def test_partition_pdf_with_model_name(
|
def test_partition_pdf_with_model_name(
|
||||||
monkeypatch,
|
monkeypatch,
|
||||||
model_name,
|
model_name,
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.16.1-dev0" # pragma: no cover
|
__version__ = "0.16.1-dev1" # pragma: no cover
|
||||||
|
|||||||
@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(
|
|||||||
|
|
||||||
hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
|
hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
|
||||||
if pdf_image_dpi is None:
|
if pdf_image_dpi is None:
|
||||||
pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
|
pdf_image_dpi = 200
|
||||||
if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
|
|
||||||
logger.warning(
|
|
||||||
"The Chipper model performs better when images are rendered with DPI >= 300 "
|
|
||||||
f"(currently {pdf_image_dpi}).",
|
|
||||||
)
|
|
||||||
|
|
||||||
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
|
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
|
||||||
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
|
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
|
||||||
@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
|
|||||||
pdf_image_dpi=pdf_image_dpi,
|
pdf_image_dpi=pdf_image_dpi,
|
||||||
)
|
)
|
||||||
|
|
||||||
if hi_res_model_name.startswith("chipper"):
|
extracted_layout = (
|
||||||
# NOTE(alan): We shouldn't do OCR with chipper
|
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
||||||
# NOTE(antonio): We shouldn't do PDFMiner with chipper
|
if pdf_text_extractable
|
||||||
final_document_layout = inferred_document_layout
|
else []
|
||||||
else:
|
)
|
||||||
extracted_layout = (
|
|
||||||
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
|
|
||||||
if pdf_text_extractable
|
|
||||||
else []
|
|
||||||
)
|
|
||||||
|
|
||||||
if analysis:
|
if analysis:
|
||||||
if not analyzed_image_output_dir_path:
|
if not analyzed_image_output_dir_path:
|
||||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||||
analyzed_image_output_dir_path = str(
|
analyzed_image_output_dir_path = str(
|
||||||
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||||
)
|
|
||||||
else:
|
|
||||||
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
|
||||||
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
|
|
||||||
if not skip_analysis_dump:
|
|
||||||
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
|
||||||
layout=inferred_document_layout,
|
|
||||||
model_name=hi_res_model_name,
|
|
||||||
)
|
)
|
||||||
extracted_layout_dumper = ExtractedLayoutDumper(
|
else:
|
||||||
layout=extracted_layout,
|
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||||
)
|
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
|
||||||
ocr_layout_dumper = OCRLayoutDumper()
|
if not skip_analysis_dump:
|
||||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
||||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
layout=inferred_document_layout,
|
||||||
inferred_document_layout=inferred_document_layout,
|
model_name=hi_res_model_name,
|
||||||
extracted_layout=extracted_layout,
|
)
|
||||||
hi_res_model_name=hi_res_model_name,
|
extracted_layout_dumper = ExtractedLayoutDumper(
|
||||||
)
|
layout=extracted_layout,
|
||||||
|
)
|
||||||
|
ocr_layout_dumper = OCRLayoutDumper()
|
||||||
|
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||||
|
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||||
|
inferred_document_layout=inferred_document_layout,
|
||||||
|
extracted_layout=extracted_layout,
|
||||||
|
hi_res_model_name=hi_res_model_name,
|
||||||
|
)
|
||||||
|
|
||||||
final_document_layout = process_file_with_ocr(
|
final_document_layout = process_file_with_ocr(
|
||||||
filename,
|
filename,
|
||||||
merged_document_layout,
|
merged_document_layout,
|
||||||
extracted_layout=extracted_layout,
|
extracted_layout=extracted_layout,
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
ocr_mode=ocr_mode,
|
ocr_mode=ocr_mode,
|
||||||
pdf_image_dpi=pdf_image_dpi,
|
pdf_image_dpi=pdf_image_dpi,
|
||||||
ocr_layout_dumper=ocr_layout_dumper,
|
ocr_layout_dumper=ocr_layout_dumper,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
inferred_document_layout = process_data_with_model(
|
inferred_document_layout = process_data_with_model(
|
||||||
file,
|
file,
|
||||||
@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
|
|||||||
pdf_image_dpi=pdf_image_dpi,
|
pdf_image_dpi=pdf_image_dpi,
|
||||||
)
|
)
|
||||||
|
|
||||||
if hi_res_model_name.startswith("chipper"):
|
if hasattr(file, "seek"):
|
||||||
# NOTE(alan): We shouldn't do OCR with chipper
|
file.seek(0)
|
||||||
# NOTE(antonio): We shouldn't do PDFMiner with chipper
|
|
||||||
final_document_layout = inferred_document_layout
|
|
||||||
else:
|
|
||||||
if hasattr(file, "seek"):
|
|
||||||
file.seek(0)
|
|
||||||
|
|
||||||
extracted_layout = (
|
extracted_layout = (
|
||||||
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
|
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
|
||||||
if pdf_text_extractable
|
)
|
||||||
else []
|
|
||||||
)
|
|
||||||
|
|
||||||
if analysis:
|
if analysis:
|
||||||
if not analyzed_image_output_dir_path:
|
if not analyzed_image_output_dir_path:
|
||||||
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
if env_config.GLOBAL_WORKING_DIR_ENABLED:
|
||||||
analyzed_image_output_dir_path = str(
|
analyzed_image_output_dir_path = str(
|
||||||
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
|
||||||
)
|
|
||||||
else:
|
|
||||||
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
|
||||||
if not skip_analysis_dump:
|
|
||||||
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
|
||||||
layout=inferred_document_layout,
|
|
||||||
model_name=hi_res_model_name,
|
|
||||||
)
|
)
|
||||||
extracted_layout_dumper = ExtractedLayoutDumper(
|
else:
|
||||||
layout=extracted_layout,
|
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
|
||||||
)
|
if not skip_analysis_dump:
|
||||||
ocr_layout_dumper = OCRLayoutDumper()
|
od_model_layout_dumper = ObjectDetectionLayoutDumper(
|
||||||
|
layout=inferred_document_layout,
|
||||||
|
model_name=hi_res_model_name,
|
||||||
|
)
|
||||||
|
extracted_layout_dumper = ExtractedLayoutDumper(
|
||||||
|
layout=extracted_layout,
|
||||||
|
)
|
||||||
|
ocr_layout_dumper = OCRLayoutDumper()
|
||||||
|
|
||||||
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
|
||||||
merged_document_layout = merge_inferred_with_extracted_layout(
|
merged_document_layout = merge_inferred_with_extracted_layout(
|
||||||
inferred_document_layout=inferred_document_layout,
|
inferred_document_layout=inferred_document_layout,
|
||||||
extracted_layout=extracted_layout,
|
extracted_layout=extracted_layout,
|
||||||
hi_res_model_name=hi_res_model_name,
|
hi_res_model_name=hi_res_model_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
if hasattr(file, "seek"):
|
if hasattr(file, "seek"):
|
||||||
file.seek(0)
|
file.seek(0)
|
||||||
final_document_layout = process_data_with_ocr(
|
final_document_layout = process_data_with_ocr(
|
||||||
file,
|
file,
|
||||||
merged_document_layout,
|
merged_document_layout,
|
||||||
extracted_layout=extracted_layout,
|
extracted_layout=extracted_layout,
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
ocr_mode=ocr_mode,
|
ocr_mode=ocr_mode,
|
||||||
pdf_image_dpi=pdf_image_dpi,
|
pdf_image_dpi=pdf_image_dpi,
|
||||||
ocr_layout_dumper=ocr_layout_dumper,
|
ocr_layout_dumper=ocr_layout_dumper,
|
||||||
)
|
)
|
||||||
|
|
||||||
# NOTE(alan): starting with v2, chipper sorts the elements itself.
|
|
||||||
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
|
|
||||||
kwargs["sort_mode"] = SORT_MODE_DONT
|
|
||||||
|
|
||||||
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
|
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
|
||||||
|
|
||||||
@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
|
|||||||
" ",
|
" ",
|
||||||
el.text or "",
|
el.text or "",
|
||||||
).strip()
|
).strip()
|
||||||
# NOTE(alan): with chipper there are parent elements with no text we don't want to
|
if el.text or isinstance(el, PageBreak):
|
||||||
# filter those out and leave the children orphaned.
|
|
||||||
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
|
|
||||||
out_elements.append(cast(Element, el))
|
out_elements.append(cast(Element, el))
|
||||||
|
|
||||||
if extract_forms:
|
if extract_forms:
|
||||||
|
|||||||
@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout(
|
|||||||
)
|
)
|
||||||
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
|
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
|
||||||
|
|
||||||
# If the model is a chipper model, we don't want to order the
|
|
||||||
# elements, as they are already ordered
|
|
||||||
order_elements = not hi_res_model_name.startswith("chipper")
|
|
||||||
|
|
||||||
inferred_pages = inferred_document_layout.pages
|
inferred_pages = inferred_document_layout.pages
|
||||||
for i, (inferred_page, extracted_page_layout) in enumerate(
|
for i, (inferred_page, extracted_page_layout) in enumerate(
|
||||||
zip(inferred_pages, extracted_layout)
|
zip(inferred_pages, extracted_layout)
|
||||||
@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout(
|
|||||||
**threshold_kwargs,
|
**threshold_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
if order_elements:
|
merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC)
|
||||||
merged_layout = sort_text_regions(
|
|
||||||
cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
|
|
||||||
)
|
|
||||||
|
|
||||||
elements = []
|
elements = []
|
||||||
for layout_el in merged_layout:
|
for layout_el in merged_layout:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user