mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00

* feat: adding new vlm-models support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * got microsoft/Phi-4-multimodal-instruct to work Signed-off-by: Peter Staar <taa@zurich.ibm.com> * working on vlm's Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the VLM part Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, now serious refacgtoring necessary Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the download_model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the formulate_prompt Signed-off-by: Peter Staar <taa@zurich.ibm.com> * pixtral 12b runs via MLX and native transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the VlmPredictionToken Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring minimal_vlm_pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the MyPy Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added pipeline_model_specializations file Signed-off-by: Peter Staar <taa@zurich.ibm.com> * need to get Phi4 working again ... 
Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalising last points for vlms support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the pipeline for Phi4 Signed-off-by: Peter Staar <taa@zurich.ibm.com> * streamlining all code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted the code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixing the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the html backend to the VLM pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the static load_from_doctags Signed-off-by: Peter Staar <taa@zurich.ibm.com> * restore stable imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use AutoModelForVision2Seq for Pixtral and review example (including rename) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove unused value Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * refactor instances of VLM models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * skip compare example in CI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use lowercase and uppercase only Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename pipeline_vlm_model_spec Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move more argument to options and simplify model init Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add supported_devices Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove not-needed function Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * exclude minimal_vlm Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * missing file Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add message for transformers version Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename to specs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use module import and remove MLX from non-darwin Signed-off-by: Michele Dolfi 
<dol@zurich.ibm.com> * remove hf_vlm_model and add extra_generation_args Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use single HF VLM model class Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove torch type Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add docs for vision models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
173 lines
5.7 KiB
Python
173 lines
5.7 KiB
Python
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
|
|
from docling.datamodel.document import ConversionResult
|
|
from docling.datamodel.pipeline_options import (
|
|
PdfPipelineOptions,
|
|
TableFormerMode,
|
|
)
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
|
|
|
@pytest.fixture
def test_doc_path():
    """Provide the path of the sample PDF shared by the tests in this module."""
    sample_pdf = Path("./tests/data/pdf/2206.01062.pdf")
    return sample_pdf
|
|
|
|
|
|
def get_converters_with_table_options():
    """Yield one PDF ``DocumentConverter`` per table-structure configuration.

    Each combination of cell matching (``True`` then ``False``) and
    TableFormer mode (``FAST`` then ``ACCURATE``) is produced lazily, with
    OCR switched off and table-structure recognition switched on.
    """
    # Same iteration order as two nested loops: cell matching is the outer axis.
    combos = (
        (match_cells, tf_mode)
        for match_cells in (True, False)
        for tf_mode in (TableFormerMode.FAST, TableFormerMode.ACCURATE)
    )

    for match_cells, tf_mode in combos:
        opts = PdfPipelineOptions()
        opts.do_ocr = False
        opts.do_table_structure = True

        table_opts = opts.table_structure_options
        table_opts.do_cell_matching = match_cells
        table_opts.mode = tf_mode

        pdf_option = PdfFormatOption(pipeline_options=opts)
        yield DocumentConverter(format_options={InputFormat.PDF: pdf_option})
|
|
|
|
|
|
def test_accelerator_options():
    """Check how ``AcceleratorOptions`` resolves ``num_threads`` and ``device``.

    Scenarios covered, in order: built-in defaults, explicit constructor
    arguments, values taken from ``OMP_NUM_THREADS`` / ``DOCLING_DEVICE``,
    constructor arguments winning over the environment, both thread
    variables set at once, an invalid device, and a non-numeric
    ``OMP_NUM_THREADS``.

    Every environment change happens inside ``mock.patch.dict(os.environ)``,
    so the process environment is restored when the test finishes, including
    when an assertion fails, and nothing leaks into later tests.
    """
    # Function-scope import keeps this block self-contained.
    from unittest import mock

    env_names = ("OMP_NUM_THREADS", "DOCLING_NUM_THREADS", "DOCLING_DEVICE")

    with mock.patch.dict(os.environ):
        # Start from a clean slate so the default assertions below do not
        # depend on whatever the caller's shell or CI runner exported.
        for name in env_names:
            os.environ.pop(name, None)

        # Check the default options
        ao = AcceleratorOptions()
        assert ao.num_threads == 4, "Wrong default num_threads"
        assert ao.device == AcceleratorDevice.AUTO, "Wrong default device"

        # Use API
        ao2 = AcceleratorOptions(num_threads=2, device=AcceleratorDevice.MPS)
        ao3 = AcceleratorOptions(num_threads=3, device=AcceleratorDevice.CUDA)
        assert ao2.num_threads == 2
        assert ao2.device == AcceleratorDevice.MPS
        assert ao3.num_threads == 3
        assert ao3.device == AcceleratorDevice.CUDA

        # Use envvars (regular + alternative) and default values
        os.environ["OMP_NUM_THREADS"] = "1"
        ao.__init__()
        assert ao.num_threads == 1
        assert ao.device == AcceleratorDevice.AUTO
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao.__init__()
        assert ao.device == AcceleratorDevice.CPU
        assert ao.num_threads == 1

        # Use envvars and override in init
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao4 = AcceleratorOptions(num_threads=5, device=AcceleratorDevice.MPS)
        assert ao4.num_threads == 5
        assert ao4.device == AcceleratorDevice.MPS

        # Use regular and alternative envvar: OMP_NUM_THREADS is still "1"
        # here, and the test expects the DOCLING_NUM_THREADS value.
        os.environ["DOCLING_NUM_THREADS"] = "2"
        ao5 = AcceleratorOptions()
        assert ao5.num_threads == 2
        assert ao5.device == AcceleratorDevice.CPU

        # Use wrong values: re-initialising with an invalid device must raise.
        os.environ["DOCLING_DEVICE"] = "wrong"
        with pytest.raises(Exception) as exc_info:
            ao5.__init__()
        print(exc_info.value)

        # Use misformatted alternative envvar: the test expects the defaults.
        del os.environ["DOCLING_NUM_THREADS"]
        del os.environ["DOCLING_DEVICE"]
        os.environ["OMP_NUM_THREADS"] = "wrong"
        ao6 = AcceleratorOptions()
        assert ao6.num_threads == 4
        assert ao6.device == AcceleratorDevice.AUTO
|
|
|
|
|
|
def test_e2e_conversions(test_doc_path):
    """Convert the sample PDF with every table-option converter and expect success."""
    for table_converter in get_converters_with_table_options():
        print(f"converting {test_doc_path}")

        outcome: ConversionResult = table_converter.convert(test_doc_path)

        assert outcome.status == ConversionStatus.SUCCESS
|
|
|
|
|
|
def test_page_range(test_doc_path):
    """Exercise the ``page_range`` argument of ``DocumentConverter.convert``.

    A range of (9, 9) is expected to succeed and yield a one-page document
    from an input reporting 9 pages. A range of (10, 10), converted with
    ``raises_on_error=False``, is expected to end in ``FAILURE``.
    """
    converter = DocumentConverter()

    # Single page inside the range the test expects to be valid.
    in_range: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))

    assert in_range.status == ConversionStatus.SUCCESS
    assert in_range.input.page_count == 9
    assert in_range.document.num_pages() == 1

    # Errors are reported through the status instead of being raised.
    out_of_range: ConversionResult = converter.convert(
        test_doc_path,
        page_range=(10, 10),
        raises_on_error=False,
    )
    assert out_of_range.status == ConversionStatus.FAILURE
|
|
|
|
|
|
def test_ocr_coverage_threshold(test_doc_path):
    """Expect no text items when the OCR bitmap area threshold is set to 1.1.

    The ``test_doc_path`` fixture value is not used; the conversion runs on
    the scanned sample ``./tests/data_scanned/ocr_test.pdf`` instead.
    """
    ocr_pipeline = PdfPipelineOptions()
    ocr_pipeline.do_ocr = True
    ocr_pipeline.ocr_options.bitmap_area_threshold = 1.1

    pdf_option = PdfFormatOption(
        pipeline_options=ocr_pipeline,
    )
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})

    scanned_pdf = Path("./tests/data_scanned/ocr_test.pdf")
    result: ConversionResult = converter.convert(scanned_pdf)

    # this should have generated no results, since we set a very high threshold
    assert len(result.document.texts) == 0
|
|
|
|
|
|
def test_parser_backends(test_doc_path):
    """Convert one PDF with each parser backend and expect success from all.

    OCR and table-structure recognition are both disabled. The
    ``test_doc_path`` fixture value is not used; every backend converts
    ``./tests/data/pdf/code_and_formula.pdf``.
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = False

    # Loop-invariant, so built once; a separate name avoids overwriting the
    # fixture argument.
    sample_pdf = Path("./tests/data/pdf/code_and_formula.pdf")

    backends = (
        DoclingParseV4DocumentBackend,
        DoclingParseV2DocumentBackend,
        DoclingParseDocumentBackend,
        PyPdfiumDocumentBackend,
    )

    for backend_t in backends:
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=backend_t,
                )
            }
        )

        doc_result: ConversionResult = converter.convert(sample_pdf)

        # Name the backend so a failure identifies which of the four broke.
        assert doc_result.status == ConversionStatus.SUCCESS, (
            f"{backend_t.__name__} failed to convert {sample_pdf}"
        )
|
|
|
|
|
|
def test_confidence(test_doc_path):
    """Expect EXCELLENT mean and low confidence grades for pages 6-9 of the sample PDF."""
    converter = DocumentConverter()
    result: ConversionResult = converter.convert(test_doc_path, page_range=(6, 9))

    report = result.confidence
    assert report.mean_grade == QualityGrade.EXCELLENT
    assert report.low_grade == QualityGrade.EXCELLENT
|