mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00

* Upgraded Layout Postprocessing, sending old code back to ERZ Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Implement hierarchical cluster layout processing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested cluster processing through full pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested clusters through GLM as payload Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move to_docling_document from ds-glm to this repo Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * feat(Accelerator): Introduce options to control the num_threads and device from API, envvars, CLI. - Introduce the AcceleratorOptions, AcceleratorDevice and use them to set the device where the models run. - Introduce the accelerator_utils with function to decide the device and resolve the AUTO setting. - Refactor how the docling-ibm-models are called to match the new init signature of models. - Translate the accelerator options to the specific inputs for third-party models. - Extend the docling CLI with parameters to set the num_threads and device. - Add new unit tests. - Write a new example of how to use the accelerator options. * fix: Improve the pydantic objects in the pipeline_options and imports. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: TableStructureModel: Refactor the artifacts path to use the new structure for fast/accurate model Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Updated test ground-truth Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated test ground-truth (again), bugfix for empty layout Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Do proper check to set the device in EasyOCR, RapidOCR. 
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Rollback changes from main Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test gt Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Remove unused debug settings Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Review fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Nail the accelerator defaults for MPS Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
126 lines
4.0 KiB
Python
126 lines
4.0 KiB
Python
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
from docling.datamodel.document import ConversionResult
|
|
from docling.datamodel.pipeline_options import (
|
|
AcceleratorDevice,
|
|
AcceleratorOptions,
|
|
PdfPipelineOptions,
|
|
TableFormerMode,
|
|
)
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
|
|
|
@pytest.fixture
def test_doc_path():
    """Provide the repository-relative path of the sample PDF document."""
    sample_pdf = Path("./tests/data/2206.01062.pdf")
    return sample_pdf
|
|
|
|
|
|
def get_converters_with_table_options():
    """Yield one PDF converter per table-structure configuration.

    Covers every combination of cell matching (enabled first, then disabled)
    and TableFormer mode (FAST first, then ACCURATE). Each converter has OCR
    switched off, table structure switched on, and uses
    ``DoclingParseDocumentBackend`` for PDF input.
    """
    table_modes = (TableFormerMode.FAST, TableFormerMode.ACCURATE)
    combinations = [
        (match_cells, table_mode)
        for match_cells in (True, False)
        for table_mode in table_modes
    ]

    for match_cells, table_mode in combinations:
        options = PdfPipelineOptions()
        options.do_ocr = False
        options.do_table_structure = True
        options.table_structure_options.do_cell_matching = match_cells
        options.table_structure_options.mode = table_mode

        pdf_option = PdfFormatOption(
            pipeline_options=options,
            backend=DoclingParseDocumentBackend,
        )
        yield DocumentConverter(format_options={InputFormat.PDF: pdf_option})
|
|
|
|
|
|
def test_accelerator_options():
    """Verify AcceleratorOptions defaults, explicit arguments and envvar handling.

    The test edits ``OMP_NUM_THREADS``, ``DOCLING_NUM_THREADS`` and
    ``DOCLING_DEVICE``. Their incoming values are captured and cleared up
    front and restored in a ``finally`` block, so the default-value checks do
    not depend on the caller's environment and no setting (in particular the
    deliberately invalid ``OMP_NUM_THREADS``) leaks into later tests.
    """
    env_keys = ("OMP_NUM_THREADS", "DOCLING_NUM_THREADS", "DOCLING_DEVICE")
    saved_env = {key: os.environ.get(key) for key in env_keys}
    for key in env_keys:
        os.environ.pop(key, None)

    try:
        # Check the default options
        ao = AcceleratorOptions()
        assert ao.num_threads == 4, "Wrong default num_threads"
        assert ao.device == AcceleratorDevice.AUTO, "Wrong default device"

        # Use API
        ao2 = AcceleratorOptions(num_threads=2, device=AcceleratorDevice.MPS)
        ao3 = AcceleratorOptions(num_threads=3, device=AcceleratorDevice.CUDA)
        assert ao2.num_threads == 2
        assert ao2.device == AcceleratorDevice.MPS
        assert ao3.num_threads == 3
        assert ao3.device == AcceleratorDevice.CUDA

        # Use envvars (regular + alternative) and default values.
        # A fresh instance is built each time instead of re-running
        # __init__ on an existing object.
        os.environ["OMP_NUM_THREADS"] = "1"
        ao = AcceleratorOptions()
        assert ao.num_threads == 1
        assert ao.device == AcceleratorDevice.AUTO
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao = AcceleratorOptions()
        assert ao.device == AcceleratorDevice.CPU
        assert ao.num_threads == 1

        # Use envvars and override in init
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao4 = AcceleratorOptions(num_threads=5, device=AcceleratorDevice.MPS)
        assert ao4.num_threads == 5
        assert ao4.device == AcceleratorDevice.MPS

        # Use regular and alternative envvar
        os.environ["DOCLING_NUM_THREADS"] = "2"
        ao5 = AcceleratorOptions()
        assert ao5.num_threads == 2
        assert ao5.device == AcceleratorDevice.CPU

        # Use wrong values
        os.environ["DOCLING_DEVICE"] = "wrong"
        # NOTE(review): the concrete exception type is defined outside this
        # file, so the check stays as broad as the original `except Exception`.
        with pytest.raises(Exception) as exc_info:
            AcceleratorOptions()
        print(exc_info.value)

        # Use misformatted alternative envvar
        del os.environ["DOCLING_NUM_THREADS"]
        del os.environ["DOCLING_DEVICE"]
        os.environ["OMP_NUM_THREADS"] = "wrong"
        ao6 = AcceleratorOptions()
        assert ao6.num_threads == 4
        assert ao6.device == AcceleratorDevice.AUTO
    finally:
        # Put the environment back exactly as it was found.
        for key, value in saved_env.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
|
|
|
|
|
|
def test_e2e_conversions(test_doc_path):
    """Convert the sample PDF with each table-option converter and expect success."""
    converters = get_converters_with_table_options()
    for pdf_converter in converters:
        print(f"converting {test_doc_path}")

        result: ConversionResult = pdf_converter.convert(test_doc_path)

        assert result.status == ConversionStatus.SUCCESS
|
|
|
|
|
|
def test_ocr_coverage_threshold(test_doc_path):
    """Check that a bitmap-area threshold of 1.1 leaves the document without texts.

    NOTE: the value of the ``test_doc_path`` fixture is not used; the test
    converts a dedicated scanned PDF instead.
    """
    ocr_pipeline = PdfPipelineOptions()
    ocr_pipeline.do_ocr = True
    ocr_pipeline.ocr_options.bitmap_area_threshold = 1.1

    pdf_format = PdfFormatOption(pipeline_options=ocr_pipeline)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    scanned_pdf = Path("./tests/data_scanned/ocr_test.pdf")
    outcome: ConversionResult = converter.convert(scanned_pdf)

    # this should have generated no results, since we set a very high threshold
    assert len(outcome.document.texts) == 0
|