mirror of
https://github.com/docling-project/docling.git
synced 2025-12-06 15:54:47 +00:00
* fix: Ensure proper image_scale is used for generated page images in layout+vlm pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Ensure proper image_scale output in default VLM pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
178 lines
5.7 KiB
Python
Vendored
178 lines
5.7 KiB
Python
Vendored
#!/usr/bin/env python3
|
|
"""Demo script for the new ThreadedLayoutVlmPipeline.
|
|
|
|
This script demonstrates the usage of the experimental ThreadedLayoutVlmPipeline,
which combines layout model preprocessing with VLM processing in a threaded manner.
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import traceback
|
|
from pathlib import Path
|
|
|
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
|
|
from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
|
|
ThreadedLayoutVlmPipelineOptions,
|
|
)
|
|
from docling.experimental.pipeline.threaded_layout_vlm_pipeline import (
|
|
ThreadedLayoutVlmPipeline,
|
|
)
|
|
|
|
_log = logging.getLogger(__name__)
|
|
|
|
|
|
def _parse_args():
|
|
parser = argparse.ArgumentParser(
|
|
description="Demo script for the experimental ThreadedLayoutVlmPipeline"
|
|
)
|
|
parser.add_argument(
|
|
"--input-file",
|
|
type=str,
|
|
default="tests/data/pdf/code_and_formula.pdf",
|
|
help="Path to a PDF file",
|
|
)
|
|
parser.add_argument(
|
|
"--output-dir",
|
|
type=str,
|
|
default="scratch/demo_layout_vlm/",
|
|
help="Output directory for converted files",
|
|
)
|
|
return parser.parse_args()
|
|
|
|
|
|
# Can be used to read multiple PDF files under a folder (the helper below needs BytesIO and DocumentStream, which this script does not import):
|
|
# def _get_docs(input_doc_path):
|
|
# """Yield DocumentStream objects from list of input document paths"""
|
|
# for path in input_doc_path:
|
|
# buf = BytesIO(path.read_bytes())
|
|
# stream = DocumentStream(name=path.name, stream=buf)
|
|
# yield stream
|
|
|
|
|
|
def openai_compatible_vlm_options(
    model: str,
    prompt: str,
    format: ResponseFormat,
    hostname_and_port,
    temperature: float = 0.7,
    max_tokens: int = 4096,
    api_key: str = "",
    skip_special_tokens=False,
):
    """Build ``ApiVlmOptions`` for an OpenAI-compatible chat-completions server.

    Args:
        model: Model name sent in the request parameters.
        prompt: Prompt passed through to the options.
        format: Response format passed as ``response_format``.
        hostname_and_port: ``host:port`` string interpolated into the
            ``http://.../v1/chat/completions`` URL.
        temperature: Sampling temperature passed through to the options.
        max_tokens: Token limit sent in the request parameters.
        api_key: When non-empty, sent as an ``Authorization: Bearer`` header;
            when empty, no headers are sent.
        skip_special_tokens: Sent in the request parameters (needed for VLLM).

    Returns:
        The configured ``ApiVlmOptions``; timeout (90) and scale (2.0) are fixed.
    """
    # LM studio defaults to port 1234, VLLM to 8000
    endpoint = f"http://{hostname_and_port}/v1/chat/completions"

    auth_headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    request_params = {
        "model": model,
        "max_tokens": max_tokens,
        "skip_special_tokens": skip_special_tokens,  # needed for VLLM
    }

    return ApiVlmOptions(
        url=endpoint,
        params=request_params,
        headers=auth_headers,
        prompt=prompt,
        timeout=90,
        scale=2.0,
        temperature=temperature,
        response_format=format,
    )
|
|
|
|
|
|
def demo_threaded_layout_vlm_pipeline(
    input_doc_path: Path, out_dir_layout_aware: Path, use_api_vlm: bool
):
    """Demonstrate the threaded layout+VLM pipeline.

    Converts ``input_doc_path`` with ``ThreadedLayoutVlmPipeline``. Unless the
    conversion status is ``FAILURE``, the resulting document is saved as
    ``<stem>.json`` and ``<stem>.html`` inside ``out_dir_layout_aware`` and the
    VLM response of each page is logged.

    Args:
        input_doc_path: Path of the PDF to convert.
        out_dir_layout_aware: Directory receiving the exported files; created
            if it does not exist yet.
        use_api_vlm: When true, use an OpenAI-compatible API server at
            ``localhost:1234`` instead of the inline
            ``GRANITEDOCLING_TRANSFORMERS`` model, and enable remote services
            on the pipeline.
    """
    if use_api_vlm:
        vlm_options = openai_compatible_vlm_options(
            model="granite-docling-258m-mlx",  # For VLLM use "ibm-granite/granite-docling-258M"
            hostname_and_port="localhost:1234",  # LM studio defaults to port 1234, VLLM to 8000
            prompt="Convert this page to docling.",
            format=ResponseFormat.DOCTAGS,
            api_key="",
        )
    else:
        # Copy so the shared model spec is not mutated below.
        vlm_options = GRANITEDOCLING_TRANSFORMERS.model_copy()
    # NOTE(review): applied to both inline and API options; assumes both
    # option classes expose `track_input_prompt` - confirm.
    vlm_options.track_input_prompt = True

    # Configure pipeline options
    print("Configuring pipeline options...")
    pipeline_options_layout_aware = ThreadedLayoutVlmPipelineOptions(
        # VLM configuration - defaults to GRANITEDOCLING_TRANSFORMERS
        vlm_options=vlm_options,
        # Layout configuration - defaults to DOCLING_LAYOUT_HERON
        # Batch sizes for parallel processing
        layout_batch_size=2,
        vlm_batch_size=1,
        # Queue configuration
        queue_max_size=10,
        # Image processing: page images are generated at the scale the VLM
        # options ask for, so both sides agree on the image size.
        images_scale=vlm_options.scale,
        generate_page_images=True,
        enable_remote_services=use_api_vlm,
    )

    # Create converter with the new pipeline
    print("Initializing DocumentConverter (this may take a while - loading models)...")
    doc_converter_layout_enhanced = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedLayoutVlmPipeline,
                pipeline_options=pipeline_options_layout_aware,
            )
        }
    )

    # raises_on_error=False: failures are reported through the result status
    # instead of an exception, so the status must be checked explicitly.
    result_layout_aware = doc_converter_layout_enhanced.convert(
        source=input_doc_path, raises_on_error=False
    )

    if result_layout_aware.status == ConversionStatus.FAILURE:
        _log.error("Conversion failed: %s", result_layout_aware.status)
        # Do not export anything from a failed conversion.
        return

    # Make the function usable on its own, without the caller pre-creating
    # the output directory.
    out_dir_layout_aware.mkdir(parents=True, exist_ok=True)

    doc_filename = result_layout_aware.input.file.stem
    result_layout_aware.document.save_as_json(
        out_dir_layout_aware / f"{doc_filename}.json"
    )
    result_layout_aware.document.save_as_html(
        out_dir_layout_aware / f"{doc_filename}.html", split_page_view=True
    )

    for page in result_layout_aware.pages:
        _log.info("Page %s of VLM response:", page.page_no)
        if page.predictions.vlm_response:
            _log.info(page.predictions.vlm_response)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the CLI, validate the input PDF, prepare the
    # output directory and run the demo conversion.
    logging.basicConfig(level=logging.INFO)
    try:
        args = _parse_args()
        _log.info(
            "Parsed arguments: input=%s, output=%s", args.input_file, args.output_dir
        )

        input_path = Path(args.input_file)

        if not input_path.exists():
            raise FileNotFoundError(f"Input file does not exist: {input_path}")

        if input_path.suffix.lower() != ".pdf":
            raise ValueError(f"Input file must be a PDF: {input_path}")

        out_dir_layout_aware = Path(args.output_dir) / "layout_aware/"
        out_dir_layout_aware.mkdir(parents=True, exist_ok=True)

        # False runs the inline VLM model; set to True to send pages to an
        # OpenAI-compatible API server instead.
        use_api_vlm = False

        demo_threaded_layout_vlm_pipeline(input_path, out_dir_layout_aware, use_api_vlm)
    except Exception:
        # Print the traceback exactly once and exit non-zero. Re-raising here
        # would make the interpreter print the same traceback a second time.
        traceback.print_exc()
        raise SystemExit(1)
|