Docling runner based on the CLI, but it's too slow to use. PII rule fixes

2025-10-12 08:43:32 +00:00 · 2025-05-14 16:31:56 +00:00 · 2025-05-14 16:31:56 +00:00 · 2e8753af26
commit 2e8753af26
parent 74ef2b6f65
4 changed files with 98 additions and 89 deletions
--- a/olmocr/bench/runners/run_docling.py
+++ b/olmocr/bench/runners/run_docling.py
@ -1,99 +1,76 @@
-import base64
+import asyncio
 import os
-from io import BytesIO
+import tempfile
 from typing import Literal
-import torch
+from pypdf import PdfReader, PdfWriter
 from docling_core.types.doc import DoclingDocument
 from docling_core.types.doc.document import DocTagsDocument
 from PIL import Image
 from transformers import AutoModelForVision2Seq, AutoProcessor
 from olmocr.data.renderpdf import render_pdf_to_base64png
 _cached_model = None
 _cached_processor = None
-def init_model(model_name: str = "ds4sd/SmolDocling-256M-preview"):
+async def run_docling(
    """Initialize and cache the model and processor."""
    global _cached_model, _cached_processor
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if _cached_model is None:
        processor = AutoProcessor.from_pretrained(model_name)
        model = (
            AutoModelForVision2Seq.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                # _attn_implementation="flash_attention_2" if device.type == "cuda" else "eager",
                _attn_implementation="eager",
            )
            .eval()
            .to(device)
        )
        _cached_model = model
        _cached_processor = processor
    return _cached_model, _cached_processor, device
 def run_docling(
    pdf_path: str,
    page_num: int = 1,
-    model_name: str = "ds4sd/SmolDocling-256M-preview",
+    output_format: Literal["markdown"] = "markdown",
-    temperature: float = 0.1,
+    use_smoldocling: bool = False,
    target_longest_image_dim: int = 1024,
    output_format: Literal["markdown", "html", "doctags"] = "markdown",
 ) -> str:
-    # Initialize the model
+    """Run docling CLI on a PDF file and return the results.
    model, processor, device = init_model(model_name)
-    # Convert PDF page to image
+    Args:
-    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
+        pdf_path: Path to the PDF file
-    image = Image.open(BytesIO(base64.b64decode(image_base64)))
+        page_num: Page number to process (1-indexed)
        output_format: Output format (only markdown is supported for CLI version)
-    # Create input messages
+    Returns:
-    messages = [
+        String containing the markdown output
-        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]},
+    """
-    ]
+    if output_format != "markdown":
        raise ValueError("Only markdown output format is supported for CLI version")
-    # Prepare inputs
+    # Extract the specific page using pypdf
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    pdf_reader = PdfReader(pdf_path)
-    inputs = processor(text=prompt, images=[image], return_tensors="pt")
+    pdf_writer = PdfWriter()
    inputs = inputs.to(device)
-    # Generate outputs
+    # Convert from 1-indexed to 0-indexed
-    with torch.no_grad():
+    zero_based_page_num = page_num - 1
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=8192,
            temperature=temperature,
            do_sample=temperature > 0,
        )
-    # Process the generated output
+    if zero_based_page_num >= len(pdf_reader.pages) or zero_based_page_num < 0:
-    prompt_length = inputs.input_ids.shape[1]
+        raise ValueError(f"Page number {page_num} is out of bounds for PDF with {len(pdf_reader.pages)} pages")
    trimmed_generated_ids = generated_ids[:, prompt_length:]
    doctags = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=False,
    )[0].lstrip()
-    # Create Docling document
+    # Add the selected page to the writer
-    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+    pdf_writer.add_page(pdf_reader.pages[zero_based_page_num])
    doc = DoclingDocument(name=os.path.basename(pdf_path))
    doc.load_from_doctags(doctags_doc)
-    # Generate output in the requested format
+    # Create temporary files for the single-page PDF and output markdown
-    result = None
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf_file, tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp_md_file:
-    if output_format == "markdown":
+        tmp_pdf_path = tmp_pdf_file.name
-        result = doc.export_to_markdown()
+        tmp_md_path = tmp_md_file.name
    elif output_format == "html":
        result = doc.export_to_html()
    elif output_format == "doctags":
        result = doctags
-    return result
+    try:
        # Write the single-page PDF to the temporary file
        with open(tmp_pdf_path, "wb") as f:
            pdf_writer.write(f)
        # Build the command to run docling on the single-page PDF
        if use_smoldocling:
            cmd = ["docling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
        else:
            cmd = ["docling", "--pipeline", "vlm", "--vlm-model", "smoldocling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
        # Run the command asynchronously
        proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown error"
            raise RuntimeError(f"docling command failed with return code {proc.returncode}: {error_msg}")
        # Read the results from the temporary markdown file
        with open(tmp_md_path, "r", encoding="utf-8") as f:
            result = f.read()
        return result
    finally:
        # Clean up the temporary files
        for path in [tmp_pdf_path, tmp_md_path]:
            if os.path.exists(path):
                os.unlink(path)
--- a/olmocr/version.py
+++ b/olmocr/version.py
@ -2,7 +2,7 @@ _MAJOR = "0"
 _MINOR = "1"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "66"
+_PATCH = "67"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
--- a/scripts/check_qual.sh
+++ b/scripts/check_qual.sh
@ -0,0 +1,12 @@
 #!/bin/bash
 set -e
 python scripts/pii_rule_comparison.py \
  --docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
  --ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5" \
  --hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.4" \
  --output-dir results/pii_detection \
 tinyhost results/pii_detection/*
--- a/scripts/pii_rule_comparison.py
+++ b/scripts/pii_rule_comparison.py
@ -38,6 +38,7 @@ Rule expression syntax:
 import argparse
 import gzip
 import html as pyhtml
 import io
 import json
 import logging
@ -1482,7 +1483,7 @@ def generate_html_report(docs, title, summary, output_path):
        html += f"""
            <div id="doc-{i}" class="document{selected_class}" tabindex="0">
                <div class="document-id">Document ID: {doc_id}</div>
-                <div class="document-text">{doc_text}</div>
+                <pre class="document-text">{pyhtml.escape(doc_text)}</pre>
            </div>
 """
@ -1735,15 +1736,20 @@ IoU: {iou:.4f}
    # True Positives
    generate_html_report(
-        true_positives, "True Positives - Documents matching both Reference and Hypothesis Rules", summary, os.path.join(args.output_dir, "true_positives.html")
+        true_positives[:1000],
        "True Positives - Documents matching both Reference and Hypothesis Rules",
        summary,
        os.path.join(args.output_dir, "true_positives.html"),
    )
    # True Negatives
-    generate_html_report(true_negatives, "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html"))
+    generate_html_report(
        true_negatives[:1000], "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html")
    )
    # False Positives
    generate_html_report(
-        false_positives,
+        false_positives[:1000],
        "False Positives - Documents matching Hypothesis but not Reference Rule",
        summary,
        os.path.join(args.output_dir, "false_positives.html"),
@ -1751,7 +1757,7 @@ IoU: {iou:.4f}
    # False Negatives
    generate_html_report(
-        false_negatives,
+        false_negatives[:1000],
        "False Negatives - Documents matching Reference but not Hypothesis Rule",
        summary,
        os.path.join(args.output_dir, "false_negatives.html"),
@ -1879,6 +1885,20 @@ IoU: {iou:.4f}
    logger.info(f"F1 Score: {f1:.4f}")
    logger.info(f"IoU: {iou:.4f}")
    # Output all available attributes that have been loaded
    logger.info("\n--- AVAILABLE ATTRIBUTES ---")
    all_attributes = set()
    for doc in all_docs:
        if "attributes" in doc and doc["attributes"]:
            all_attributes.update(doc["attributes"].keys())
    if all_attributes:
        logger.info(f"Found {len(all_attributes)} unique attributes:")
        for attr in sorted(all_attributes):
            logger.info(f"  - {attr}")
    else:
        logger.info("No attributes found in any documents.")
    logger.info(f"\nResults saved to: {args.output_dir}/index.html")