docling/docs/examples/demo_layout_vlm.py
#!/usr/bin/env python3
"""Demo script for the new ThreadedLayoutVlmPipeline.
This script demonstrates the usage of the experimental ThreadedLayoutVlmPipeline pipeline
that combines layout model preprocessing with VLM processing in a threaded manner.
"""
import argparse
import logging
import traceback
from pathlib import Path

from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
    ThreadedLayoutVlmPipelineOptions,
)
from docling.experimental.pipeline.threaded_layout_vlm_pipeline import (
    ThreadedLayoutVlmPipeline,
)

_log = logging.getLogger(__name__)


def _parse_args():
    parser = argparse.ArgumentParser(
        description="Demo script for the experimental ThreadedLayoutVlmPipeline"
    )
    parser.add_argument(
        "--input-file",
        type=str,
        default="tests/data/pdf/code_and_formula.pdf",
        help="Path to a PDF file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="scratch/demo_layout_vlm/",
        help="Output directory for converted files",
    )
    return parser.parse_args()


# Can be used to read multiple pdf files under a folder
# def _get_docs(input_doc_path):
#     """Yield DocumentStream objects from list of input document paths"""
#     for path in input_doc_path:
#         buf = BytesIO(path.read_bytes())
#         stream = DocumentStream(name=path.name, stream=buf)
#         yield stream


def openai_compatible_vlm_options(
    model: str,
    prompt: str,
    format: ResponseFormat,
    hostname_and_port,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    api_key: str = "",
    skip_special_tokens=False,
):
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    options = ApiVlmOptions(
        url=f"http://{hostname_and_port}/v1/chat/completions",  # LM studio defaults to port 1234, VLLM to 8000
        params=dict(
            model=model,
            max_tokens=max_tokens,
            skip_special_tokens=skip_special_tokens,  # needed for VLLM
        ),
        headers=headers,
        prompt=prompt,
        timeout=90,
        scale=2.0,
        temperature=temperature,
        response_format=format,
    )
    return options
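

# Hedged example (not exercised by this demo): the same helper could point at a
# local vLLM server instead of LM Studio. The model name, port, and
# skip_special_tokens setting below are assumptions taken from the inline
# comments above, not values verified against a running server.
#
# vlm_options = openai_compatible_vlm_options(
#     model="ibm-granite/granite-docling-258M",
#     hostname_and_port="localhost:8000",
#     prompt="Convert this page to docling.",
#     format=ResponseFormat.DOCTAGS,
#     skip_special_tokens=True,  # needed for VLLM
# )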


def demo_threaded_layout_vlm_pipeline(
    input_doc_path: Path, out_dir_layout_aware: Path, use_api_vlm: bool
):
    """Demonstrate the threaded layout+VLM pipeline."""
    vlm_options = GRANITEDOCLING_TRANSFORMERS.model_copy()
    if use_api_vlm:
        vlm_options = openai_compatible_vlm_options(
            model="granite-docling-258m-mlx",  # For VLLM use "ibm-granite/granite-docling-258M"
            hostname_and_port="localhost:1234",  # LM studio defaults to port 1234, VLLM to 8000
            prompt="Convert this page to docling.",
            format=ResponseFormat.DOCTAGS,
            api_key="",
        )
    vlm_options.track_input_prompt = True

    # Configure pipeline options
    print("Configuring pipeline options...")
    pipeline_options_layout_aware = ThreadedLayoutVlmPipelineOptions(
        # VLM configuration - defaults to GRANITEDOCLING_TRANSFORMERS
        vlm_options=vlm_options,
        # Layout configuration - defaults to DOCLING_LAYOUT_HERON
        # Batch sizes for parallel processing
        layout_batch_size=2,
        vlm_batch_size=1,
        # Queue configuration
        queue_max_size=10,
        # Image processing
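        # Keeping images_scale in sync with the VLM input scale means the generated
        # page images are rendered at the same resolution the VLM receives.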
        images_scale=vlm_options.scale,
        generate_page_images=True,
        enable_remote_services=use_api_vlm,
    )

    # Create converter with the new pipeline
    print("Initializing DocumentConverter (this may take a while - loading models)...")
    doc_converter_layout_enhanced = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedLayoutVlmPipeline,
                pipeline_options=pipeline_options_layout_aware,
            )
        }
    )

    result_layout_aware = doc_converter_layout_enhanced.convert(
        source=input_doc_path, raises_on_error=False
    )
    if result_layout_aware.status == ConversionStatus.FAILURE:
        _log.error(f"Conversion failed: {result_layout_aware.status}")

    doc_filename = result_layout_aware.input.file.stem
    result_layout_aware.document.save_as_json(
        out_dir_layout_aware / f"{doc_filename}.json"
    )
    result_layout_aware.document.save_as_html(
        out_dir_layout_aware / f"{doc_filename}.html", split_page_view=True
    )

    for page in result_layout_aware.pages:
        _log.info("VLM response for page %s:", page.page_no)
        if page.predictions.vlm_response:
            _log.info(page.predictions.vlm_response)
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
try:
args = _parse_args()
_log.info(
f"Parsed arguments: input={args.input_file}, output={args.output_dir}"
)
input_path = Path(args.input_file)
if not input_path.exists():
raise FileNotFoundError(f"Input file does not exist: {input_path}")
if input_path.suffix.lower() != ".pdf":
raise ValueError(f"Input file must be a PDF: {input_path}")
out_dir_layout_aware = Path(args.output_dir) / "layout_aware/"
out_dir_layout_aware.mkdir(parents=True, exist_ok=True)
        use_api_vlm = False  # Set to True to use an API-served VLM instead of the inline model
        demo_threaded_layout_vlm_pipeline(input_path, out_dir_layout_aware, use_api_vlm)
    except Exception:
        traceback.print_exc()
        raise