mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00

* feat: adding new vlm-models support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * got microsoft/Phi-4-multimodal-instruct to work Signed-off-by: Peter Staar <taa@zurich.ibm.com> * working on vlm's Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the VLM part Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, now serious refactoring necessary Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the download_model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the formulate_prompt Signed-off-by: Peter Staar <taa@zurich.ibm.com> * pixtral 12b runs via MLX and native transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the VlmPredictionToken Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring minimal_vlm_pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the MyPy Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added pipeline_model_specializations file Signed-off-by: Peter Staar <taa@zurich.ibm.com> * need to get Phi4 working again ... 
Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalising last points for vlms support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the pipeline for Phi4 Signed-off-by: Peter Staar <taa@zurich.ibm.com> * streamlining all code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted the code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixing the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the html backend to the VLM pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the static load_from_doctags Signed-off-by: Peter Staar <taa@zurich.ibm.com> * restore stable imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use AutoModelForVision2Seq for Pixtral and review example (including rename) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove unused value Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * refactor instances of VLM models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * skip compare example in CI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use lowercase and uppercase only Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename pipeline_vlm_model_spec Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move more argument to options and simplify model init Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add supported_devices Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove not-needed function Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * exclude minimal_vlm Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * missing file Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add message for transformers version Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename to specs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use module import and remove MLX from non-darwin Signed-off-by: Michele Dolfi 
<dol@zurich.ibm.com> * remove hf_vlm_model and add extra_generation_args Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use single HF VLM model class Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove torch type Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add docs for vision models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
175 lines
5.9 KiB
Python
Vendored
175 lines
5.9 KiB
Python
Vendored
import json
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
|
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.pipeline_options import (
|
|
PdfPipelineOptions,
|
|
)
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
|
# Module-level logger for this example script (configured in main()).
_log = logging.getLogger(__name__)
|
|
|
|
|
|
def main():
    """Convert a sample PDF with Docling and export it in several formats.

    Uses the active "Docling Parse with EasyOCR" configuration below
    (the other commented sections are alternative pipeline/backend
    combinations), then writes JSON, text, Markdown, and DocTags
    exports to the ./scratch directory.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")

    ###########################################################################

    # The following sections contain a combination of PipelineOptions
    # and PDF Backends for various configurations.
    # Uncomment one section at a time to see the differences in the output.

    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(
    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
    #         )
    #     }
    # )

    # Docling Parse without EasyOCR
    # -------------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = False
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with EasyOCR
    # ----------------------
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    pipeline_options.ocr_options.lang = ["es"]
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4, device=AcceleratorDevice.AUTO
    )

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # Docling Parse with EasyOCR (CPU only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.ocr_options.use_gpu = False  # <-- set this.
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    # Docling Parse with ocrmac (Mac only)
    # ----------------------
    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = OcrMacOptions()

    # doc_converter = DocumentConverter(
    #     format_options={
    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    #     }
    # )

    ###########################################################################

    start_time = time.time()
    conv_result = doc_converter.convert(input_doc_path)
    # Renamed from `end_time`: this value is an elapsed duration, not a timestamp.
    elapsed = time.time() - start_time

    # Lazy %-args: the message is only formatted when INFO is enabled.
    _log.info("Document converted in %.2f seconds.", elapsed)

    ## Export results
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem

    # Export Deep Search document JSON format:
    with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
        # json.dump streams straight to the file instead of building the
        # whole string in memory first.
        json.dump(conv_result.document.export_to_dict(), fp)

    # Export Text format:
    with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_text())

    # Export Markdown format:
    with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_markdown())

    # Export Document Tags format:
    # NOTE(review): `export_to_document_tokens` is the older name for the
    # DocTags export; newer docling-core releases expose `export_to_doctags`.
    # Keeping the original call to avoid a compatibility change — confirm
    # against the pinned docling-core version before renaming.
    with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
        fp.write(conv_result.document.export_to_document_tokens())
# Allow running this example directly as a script.
if __name__ == "__main__":
    main()