mirror of https://github.com/allenai/olmocr.git
synced 2025-10-12 08:43:32 +00:00
Docling runner based on CLI, but it's too slow to use. PII rule fixes
This commit is contained in:
parent 74ef2b6f65
commit 2e8753af26
@@ -1,99 +1,76 @@
-import base64
+import asyncio
 import os
-from io import BytesIO
+import tempfile
 from typing import Literal
 
-import torch
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DocTagsDocument
-from PIL import Image
-from transformers import AutoModelForVision2Seq, AutoProcessor
-
-from olmocr.data.renderpdf import render_pdf_to_base64png
-
-_cached_model = None
-_cached_processor = None
+from pypdf import PdfReader, PdfWriter
 
 
-def init_model(model_name: str = "ds4sd/SmolDocling-256M-preview"):
-    """Initialize and cache the model and processor."""
-    global _cached_model, _cached_processor
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    if _cached_model is None:
-        processor = AutoProcessor.from_pretrained(model_name)
-        model = (
-            AutoModelForVision2Seq.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                # _attn_implementation="flash_attention_2" if device.type == "cuda" else "eager",
-                _attn_implementation="eager",
-            )
-            .eval()
-            .to(device)
-        )
-
-        _cached_model = model
-        _cached_processor = processor
-
-    return _cached_model, _cached_processor, device
-
-
-def run_docling(
+async def run_docling(
     pdf_path: str,
     page_num: int = 1,
-    model_name: str = "ds4sd/SmolDocling-256M-preview",
-    temperature: float = 0.1,
-    target_longest_image_dim: int = 1024,
-    output_format: Literal["markdown", "html", "doctags"] = "markdown",
+    output_format: Literal["markdown"] = "markdown",
+    use_smoldocling: bool = False,
 ) -> str:
-    # Initialize the model
-    model, processor, device = init_model(model_name)
+    """Run the docling CLI on a PDF file and return the results.
 
-    # Convert PDF page to image
-    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
-    image = Image.open(BytesIO(base64.b64decode(image_base64)))
+    Args:
+        pdf_path: Path to the PDF file
+        page_num: Page number to process (1-indexed)
+        output_format: Output format (only markdown is supported for the CLI version)
 
-    # Create input messages
-    messages = [
-        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]},
-    ]
+    Returns:
+        String containing the markdown output
+    """
+    if output_format != "markdown":
+        raise ValueError("Only markdown output format is supported for CLI version")
 
-    # Prepare inputs
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=[image], return_tensors="pt")
-    inputs = inputs.to(device)
+    # Extract the specific page using pypdf
+    pdf_reader = PdfReader(pdf_path)
+    pdf_writer = PdfWriter()
 
-    # Generate outputs
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **inputs,
-            max_new_tokens=8192,
-            temperature=temperature,
-            do_sample=temperature > 0,
-        )
+    # Convert from 1-indexed to 0-indexed
+    zero_based_page_num = page_num - 1
 
-    # Process the generated output
-    prompt_length = inputs.input_ids.shape[1]
-    trimmed_generated_ids = generated_ids[:, prompt_length:]
-    doctags = processor.batch_decode(
-        trimmed_generated_ids,
-        skip_special_tokens=False,
-    )[0].lstrip()
+    if zero_based_page_num >= len(pdf_reader.pages) or zero_based_page_num < 0:
+        raise ValueError(f"Page number {page_num} is out of bounds for PDF with {len(pdf_reader.pages)} pages")
 
-    # Create Docling document
-    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
-    doc = DoclingDocument(name=os.path.basename(pdf_path))
-    doc.load_from_doctags(doctags_doc)
+    # Add the selected page to the writer
+    pdf_writer.add_page(pdf_reader.pages[zero_based_page_num])
 
-    # Generate output in the requested format
-    result = None
-    if output_format == "markdown":
-        result = doc.export_to_markdown()
-    elif output_format == "html":
-        result = doc.export_to_html()
-    elif output_format == "doctags":
-        result = doctags
+    # Create temporary files for the single-page PDF and output markdown
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf_file, tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp_md_file:
+        tmp_pdf_path = tmp_pdf_file.name
+        tmp_md_path = tmp_md_file.name
 
-    return result
+    try:
+        # Write the single-page PDF to the temporary file
+        with open(tmp_pdf_path, "wb") as f:
+            pdf_writer.write(f)
+
+        # Build the command to run docling on the single-page PDF
+        if use_smoldocling:
+            cmd = ["docling", "--pipeline", "vlm", "--vlm-model", "smoldocling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
+        else:
+            cmd = ["docling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
+
+        # Run the command asynchronously
+        proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
+
+        stdout, stderr = await proc.communicate()
+
+        if proc.returncode != 0:
+            error_msg = stderr.decode() if stderr else "Unknown error"
+            raise RuntimeError(f"docling command failed with return code {proc.returncode}: {error_msg}")
+
+        # Read the results from the temporary markdown file
+        with open(tmp_md_path, "r", encoding="utf-8") as f:
+            result = f.read()
+
+        return result
+
+    finally:
+        # Clean up the temporary files
+        for path in [tmp_pdf_path, tmp_md_path]:
+            if os.path.exists(path):
+                os.unlink(path)
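For reference, a minimal sketch of how the new coroutine could be driven from a script. The import path and the sample file name are assumptions, and the docling CLI must be installed and on PATH:

import asyncio

# Assumed module path; adjust to wherever run_docling lives in the repo.
from olmocr.bench.runners.run_docling import run_docling


async def main() -> None:
    # Convert page 1 of a hypothetical sample.pdf to markdown via the docling CLI.
    markdown = await run_docling("sample.pdf", page_num=1)
    print(markdown[:500])


asyncio.run(main())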
@@ -2,7 +2,7 @@ _MAJOR = "0"
 _MINOR = "1"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "66"
+_PATCH = "67"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
scripts/check_qual.sh (new executable file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+
+python scripts/pii_rule_comparison.py \
+    --docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
+    --ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5" \
+    --hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.4" \
+    --output-dir results/pii_detection
+
+
+tinyhost results/pii_detection/*
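The --ref-rule and --hyp-rule arguments follow an attribute:aggregation>threshold pattern. Below is a hedged sketch of how such an expression could be parsed and evaluated; the actual parser in scripts/pii_rule_comparison.py may differ:

import re
from statistics import mean


def rule_matches(rule: str, attributes: dict[str, list[float]]) -> bool:
    # Split "name:agg>threshold" into attribute name, aggregator, operator, threshold.
    m = re.fullmatch(r"(?P<attr>[^:]+):(?P<agg>avg|max|min)(?P<op>[<>])(?P<thr>[\d.]+)", rule)
    if m is None:
        raise ValueError(f"Unparseable rule: {rule}")
    values = attributes.get(m["attr"], [])
    if not values:
        return False
    agg = {"avg": mean, "max": max, "min": min}[m["agg"]](values)
    thr = float(m["thr"])
    return agg > thr if m["op"] == ">" else agg < thr


# mean([0.7, 0.6]) = 0.65 > 0.5, so the reference rule matches this document.
print(rule_matches("ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5",
                   {"ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en": [0.7, 0.6]}))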
@@ -38,6 +38,7 @@ Rule expression syntax:
 
 import argparse
 import gzip
+import html as pyhtml
 import io
 import json
 import logging
@@ -1482,7 +1483,7 @@ def generate_html_report(docs, title, summary, output_path):
         html += f"""
         <div id="doc-{i}" class="document{selected_class}" tabindex="0">
             <div class="document-id">Document ID: {doc_id}</div>
-            <div class="document-text">{doc_text}</div>
+            <pre class="document-text">{pyhtml.escape(doc_text)}</pre>
         </div>
         """
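The switch from raw interpolation in a <div> to pyhtml.escape inside a <pre> keeps document text from being interpreted as HTML in the report. A quick illustration of what the escaping does:

import html as pyhtml

doc_text = "<script>alert('x')</script> & plain text"
# Escapes <, >, &, and quotes so the text renders literally in the report.
print(pyhtml.escape(doc_text))
# &lt;script&gt;alert(&#x27;x&#x27;)&lt;/script&gt; &amp; plain text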
@@ -1735,15 +1736,20 @@ IoU: {iou:.4f}
 
     # True Positives
     generate_html_report(
-        true_positives, "True Positives - Documents matching both Reference and Hypothesis Rules", summary, os.path.join(args.output_dir, "true_positives.html")
+        true_positives[:1000],
+        "True Positives - Documents matching both Reference and Hypothesis Rules",
+        summary,
+        os.path.join(args.output_dir, "true_positives.html"),
     )
 
     # True Negatives
-    generate_html_report(true_negatives, "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html"))
+    generate_html_report(
+        true_negatives[:1000], "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html")
+    )
 
     # False Positives
     generate_html_report(
-        false_positives,
+        false_positives[:1000],
         "False Positives - Documents matching Hypothesis but not Reference Rule",
         summary,
         os.path.join(args.output_dir, "false_positives.html"),
@@ -1751,7 +1757,7 @@ IoU: {iou:.4f}
 
     # False Negatives
     generate_html_report(
-        false_negatives,
+        false_negatives[:1000],
         "False Negatives - Documents matching Reference but not Hypothesis Rule",
         summary,
         os.path.join(args.output_dir, "false_negatives.html"),
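Each report is now capped at the first 1000 matching documents, which bounds the size of the generated HTML. Python slicing makes the cap safe even when fewer documents matched:

docs = ["doc-a", "doc-b"]  # fewer than 1000 matches
print(docs[:1000])  # ['doc-a', 'doc-b']; slicing past the end never raises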
@@ -1879,6 +1885,20 @@ IoU: {iou:.4f}
     logger.info(f"F1 Score: {f1:.4f}")
     logger.info(f"IoU: {iou:.4f}")
 
+    # Output all available attributes that have been loaded
+    logger.info("\n--- AVAILABLE ATTRIBUTES ---")
+    all_attributes = set()
+    for doc in all_docs:
+        if "attributes" in doc and doc["attributes"]:
+            all_attributes.update(doc["attributes"].keys())
+
+    if all_attributes:
+        logger.info(f"Found {len(all_attributes)} unique attributes:")
+        for attr in sorted(all_attributes):
+            logger.info(f"  - {attr}")
+    else:
+        logger.info("No attributes found in any documents.")
+
     logger.info(f"\nResults saved to: {args.output_dir}/index.html")
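For context, a hedged sketch of the document shape the new attribute summary expects; the field names besides "attributes" are assumptions:

# Hypothetical documents; the loop only reads the optional "attributes" mapping.
all_docs = [
    {"id": "doc-1", "attributes": {"ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en": [0.9]}},
    {"id": "doc-2"},  # documents without attributes are skipped
]

all_attributes = set()
for doc in all_docs:
    if "attributes" in doc and doc["attributes"]:
        all_attributes.update(doc["attributes"].keys())

print(sorted(all_attributes))  # ['ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en']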