docling/docs/examples/demo_layout_vlm.py
Christoph Auer 4852d8b4f2
feat(experimental): Layout + VLM model with layout prompt (#2244)
* adding granite-docling preview

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the model specs

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Add Layout+VLM pipeline with prompt injection, ApiVlmModel updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update layout injection, move to experimental

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Adjust defaults

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Map Layout+VLM pipeline to GraniteDocling

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove base_prompt from layout injection prompt

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reinstate custom prompt

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* add demo_layout file that produces with vs without layout injection

Signed-off-by: Peter El Hachem <peter.el.hachem@ibm.com>
Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* feat: wrap vlm_inference around process_images

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* feat: carry input prompt + number of input tokens

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* fix: adapt example to run on local test file

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* fix: example now expects single document

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* feat: add layout example to EXAMPLES_TO_SKIP

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* feat: address comments on git

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* feat: add inference wrapper for hf_transformers + carry input prompt

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* Feat: add track_input_prompt to ApiVlmOptions, and track input prompt as part of api vlm

Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>

* fix: Ensure backward-compatible build_prompt by adding _internal_page arg

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Ensure backward-compatible build_prompt by adding _internal_page arg

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for demo

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Typing fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Restoring lost changes in vllm_model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Restoring vlm_pipeline_api_model example

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Peter El Hachem <peter.el.hachem@ibm.com>
Signed-off-by: ElHachem02 <peterelhachem02@gmail.com>
Co-authored-by: Peter Staar <taa@zurich.ibm.com>
Co-authored-by: ElHachem02 <peterelhachem02@gmail.com>
2025-11-12 13:42:09 +01:00

#!/usr/bin/env python3
"""Demo script for the new ThreadedLayoutVlmPipeline.

This script demonstrates the usage of the experimental ThreadedLayoutVlmPipeline,
which combines layout model preprocessing with VLM processing in a threaded manner.
"""
import argparse
import logging
import traceback
from pathlib import Path

from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
    ThreadedLayoutVlmPipelineOptions,
)
from docling.experimental.pipeline.threaded_layout_vlm_pipeline import (
    ThreadedLayoutVlmPipeline,
)

_log = logging.getLogger(__name__)


def _parse_args():
    parser = argparse.ArgumentParser(
        description="Demo script for the experimental ThreadedLayoutVlmPipeline"
    )
    parser.add_argument(
        "--input-file",
        type=str,
        default="tests/data/pdf/code_and_formula.pdf",
        help="Path to a PDF file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="scratch/demo_layout_vlm/",
        help="Output directory for converted files",
    )
    return parser.parse_args()


# Can be used to read multiple PDF files under a folder
# def _get_docs(input_doc_paths):
#     """Yield DocumentStream objects from a list of input document paths."""
#     for path in input_doc_paths:
#         buf = BytesIO(path.read_bytes())
#         stream = DocumentStream(name=path.name, stream=buf)
#         yield stream
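# Note: re-enabling the helper above would also require `from io import BytesIO` and an
# import for DocumentStream (in current docling releases it is exposed from
# docling.datamodel.base_models); treat the exact import location as an assumption.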


def openai_compatible_vlm_options(
    model: str,
    prompt: str,
    format: ResponseFormat,
    hostname_and_port,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    api_key: str = "",
    skip_special_tokens=False,
):
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    options = ApiVlmOptions(
        url=f"http://{hostname_and_port}/v1/chat/completions",  # LM Studio defaults to port 1234, vLLM to 8000
        params=dict(
            model=model,
            max_tokens=max_tokens,
            skip_special_tokens=skip_special_tokens,  # needed for vLLM
        ),
        headers=headers,
        prompt=prompt,
        timeout=90,
        scale=2.0,
        temperature=temperature,
        response_format=format,
    )
    return options
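# A hedged usage sketch (not part of the original demo): the same helper pointed at a
# local vLLM server instead of LM Studio, using the model id and port mentioned in the
# comments above. Adjust both to match whatever server you actually run.
#
#   vllm_options = openai_compatible_vlm_options(
#       model="ibm-granite/granite-docling-258M",
#       prompt="Convert this page to docling.",
#       format=ResponseFormat.DOCTAGS,
#       hostname_and_port="localhost:8000",
#       skip_special_tokens=True,  # needed for vLLM, per the comment above
#   )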


def demo_threaded_layout_vlm_pipeline(
    input_doc_path: Path, out_dir_layout_aware: Path, use_api_vlm: bool
):
    """Demonstrate the threaded layout+VLM pipeline."""
    vlm_options = GRANITEDOCLING_TRANSFORMERS.model_copy()
    if use_api_vlm:
        vlm_options = openai_compatible_vlm_options(
            model="granite-docling-258m-mlx",  # For vLLM use "ibm-granite/granite-docling-258M"
            hostname_and_port="localhost:1234",  # LM Studio defaults to port 1234, vLLM to 8000
            prompt="Convert this page to docling.",
            format=ResponseFormat.DOCTAGS,
            api_key="",
        )
    vlm_options.track_input_prompt = True
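    # The track_input_prompt flag set above was introduced with this change: it asks the
    # VLM stage to carry the prompt that was actually sent (including any injected layout
    # information) along with each page's prediction, so it can be inspected afterwards.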

    # Configure pipeline options
    print("Configuring pipeline options...")
    pipeline_options_layout_aware = ThreadedLayoutVlmPipelineOptions(
        # VLM configuration - defaults to GRANITEDOCLING_TRANSFORMERS
        vlm_options=vlm_options,
        # Layout configuration - defaults to DOCLING_LAYOUT_HERON
        # Batch sizes for parallel processing
        layout_batch_size=2,
        vlm_batch_size=1,
        # Queue configuration
        queue_max_size=10,
        # Image processing
        images_scale=2.0,
        generate_page_images=True,
        enable_remote_services=use_api_vlm,
    )
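    # Rough sketch of what the options above control (based on this change's description):
    # the layout stage and the VLM stage run in separate threads connected by a bounded
    # queue of at most queue_max_size pages, each stage consuming work in its own batch
    # size, with the layout predictions injected into the VLM prompt ("layout prompt").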

    # Create converter with the new pipeline
    print("Initializing DocumentConverter (this may take a while - loading models)...")
    doc_converter_layout_enhanced = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedLayoutVlmPipeline,
                pipeline_options=pipeline_options_layout_aware,
            )
        }
    )

    result_layout_aware = doc_converter_layout_enhanced.convert(
        source=input_doc_path, raises_on_error=False
    )
    if result_layout_aware.status == ConversionStatus.FAILURE:
        _log.error(f"Conversion failed: {result_layout_aware.status}")

    doc_filename = result_layout_aware.input.file.stem
    result_layout_aware.document.save_as_json(
        out_dir_layout_aware / f"{doc_filename}.json"
    )
    result_layout_aware.document.save_as_html(
        out_dir_layout_aware / f"{doc_filename}.html"
    )
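    # Log the raw per-page VLM output. With track_input_prompt enabled, each response is
    # expected to also carry the input prompt (and input-token count) used for that page.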
    for page in result_layout_aware.pages:
        _log.info("VLM response for page %s:", page.page_no)
        if page.predictions.vlm_response:
            _log.info(page.predictions.vlm_response)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    try:
        args = _parse_args()
        _log.info(
            f"Parsed arguments: input={args.input_file}, output={args.output_dir}"
        )

        input_path = Path(args.input_file)
        if not input_path.exists():
            raise FileNotFoundError(f"Input file does not exist: {input_path}")
        if input_path.suffix.lower() != ".pdf":
            raise ValueError(f"Input file must be a PDF: {input_path}")

        out_dir_layout_aware = Path(args.output_dir) / "layout_aware/"
        out_dir_layout_aware.mkdir(parents=True, exist_ok=True)

        # Set to True to route VLM calls through an OpenAI-compatible API server
        # instead of the local inline model.
        use_api_vlm = False
        demo_threaded_layout_vlm_pipeline(input_path, out_dir_layout_aware, use_api_vlm)
    except Exception:
        traceback.print_exc()
        raise