Docling runner based on CLI, but it's too slow to use. PII rule fixes

This commit is contained in:
Jake Poznanski 2025-05-14 16:31:56 +00:00
parent 74ef2b6f65
commit 2e8753af26
4 changed files with 98 additions and 89 deletions

View File

@ -1,99 +1,76 @@
import base64 import asyncio
import os import os
from io import BytesIO import tempfile
from typing import Literal from typing import Literal
import torch from pypdf import PdfReader, PdfWriter
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor
from olmocr.data.renderpdf import render_pdf_to_base64png
_cached_model = None
_cached_processor = None
def init_model(model_name: str = "ds4sd/SmolDocling-256M-preview"): async def run_docling(
"""Initialize and cache the model and processor."""
global _cached_model, _cached_processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if _cached_model is None:
processor = AutoProcessor.from_pretrained(model_name)
model = (
AutoModelForVision2Seq.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
# _attn_implementation="flash_attention_2" if device.type == "cuda" else "eager",
_attn_implementation="eager",
)
.eval()
.to(device)
)
_cached_model = model
_cached_processor = processor
return _cached_model, _cached_processor, device
def run_docling(
pdf_path: str, pdf_path: str,
page_num: int = 1, page_num: int = 1,
model_name: str = "ds4sd/SmolDocling-256M-preview", output_format: Literal["markdown"] = "markdown",
temperature: float = 0.1, use_smoldocling: bool = False,
target_longest_image_dim: int = 1024,
output_format: Literal["markdown", "html", "doctags"] = "markdown",
) -> str: ) -> str:
# Initialize the model """Run docling CLI on a PDF file and return the results.
model, processor, device = init_model(model_name)
# Convert PDF page to image Args:
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim) pdf_path: Path to the PDF file
image = Image.open(BytesIO(base64.b64decode(image_base64))) page_num: Page number to process (1-indexed)
output_format: Output format (only markdown is supported for CLI version)
# Create input messages Returns:
messages = [ String containing the markdown output
{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]}, """
] if output_format != "markdown":
raise ValueError("Only markdown output format is supported for CLI version")
# Prepare inputs # Extract the specific page using pypdf
prompt = processor.apply_chat_template(messages, add_generation_prompt=True) pdf_reader = PdfReader(pdf_path)
inputs = processor(text=prompt, images=[image], return_tensors="pt") pdf_writer = PdfWriter()
inputs = inputs.to(device)
# Generate outputs # Convert from 1-indexed to 0-indexed
with torch.no_grad(): zero_based_page_num = page_num - 1
generated_ids = model.generate(
**inputs,
max_new_tokens=8192,
temperature=temperature,
do_sample=temperature > 0,
)
# Process the generated output if zero_based_page_num >= len(pdf_reader.pages) or zero_based_page_num < 0:
prompt_length = inputs.input_ids.shape[1] raise ValueError(f"Page number {page_num} is out of bounds for PDF with {len(pdf_reader.pages)} pages")
trimmed_generated_ids = generated_ids[:, prompt_length:]
doctags = processor.batch_decode(
trimmed_generated_ids,
skip_special_tokens=False,
)[0].lstrip()
# Create Docling document # Add the selected page to the writer
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image]) pdf_writer.add_page(pdf_reader.pages[zero_based_page_num])
doc = DoclingDocument(name=os.path.basename(pdf_path))
doc.load_from_doctags(doctags_doc)
# Generate output in the requested format # Create temporary files for the single-page PDF and output markdown
result = None with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf_file, tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp_md_file:
if output_format == "markdown": tmp_pdf_path = tmp_pdf_file.name
result = doc.export_to_markdown() tmp_md_path = tmp_md_file.name
elif output_format == "html":
result = doc.export_to_html()
elif output_format == "doctags":
result = doctags
return result try:
# Write the single-page PDF to the temporary file
with open(tmp_pdf_path, "wb") as f:
pdf_writer.write(f)
# Build the command to run docling on the single-page PDF.
# The SmolDocling VLM pipeline is opt-in via use_smoldocling; when the flag is
# False, docling is invoked with no pipeline options.
# NOTE(review): docling's -o/--output option appears to expect a directory,
# while tmp_md_path is a file path -- confirm the markdown really lands there.
if use_smoldocling:
    cmd = ["docling", "--pipeline", "vlm", "--vlm-model", "smoldocling", tmp_pdf_path, "-o", tmp_md_path]
else:
    cmd = ["docling", tmp_pdf_path, "-o", tmp_md_path]
# Run the command asynchronously
proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
stdout, stderr = await proc.communicate()
if proc.returncode != 0:
error_msg = stderr.decode() if stderr else "Unknown error"
raise RuntimeError(f"docling command failed with return code {proc.returncode}: {error_msg}")
# Read the results from the temporary markdown file
with open(tmp_md_path, "r", encoding="utf-8") as f:
result = f.read()
return result
finally:
# Clean up the temporary files
for path in [tmp_pdf_path, tmp_md_path]:
if os.path.exists(path):
os.unlink(path)

View File

@ -2,7 +2,7 @@ _MAJOR = "0"
_MINOR = "1" _MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "66" _PATCH = "67"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""

12
scripts/check_qual.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
# Compare a reference rule against a hypothesis rule with
# scripts/pii_rule_comparison.py, then pass the generated report files to
# tinyhost.

# Abort on the first failing command so tinyhost never runs on stale results.
set -e

python scripts/pii_rule_comparison.py \
  --docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
  --ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5" \
  --hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.4" \
  --output-dir results/pii_detection

# Separate command: the last python argument above must NOT end with a
# backslash, or this line would be swallowed as extra python arguments.
tinyhost results/pii_detection/*

View File

@ -38,6 +38,7 @@ Rule expression syntax:
import argparse import argparse
import gzip import gzip
import html as pyhtml
import io import io
import json import json
import logging import logging
@ -1482,7 +1483,7 @@ def generate_html_report(docs, title, summary, output_path):
html += f""" html += f"""
<div id="doc-{i}" class="document{selected_class}" tabindex="0"> <div id="doc-{i}" class="document{selected_class}" tabindex="0">
<div class="document-id">Document ID: {doc_id}</div> <div class="document-id">Document ID: {doc_id}</div>
<div class="document-text">{doc_text}</div> <pre class="document-text">{pyhtml.escape(doc_text)}</pre>
</div> </div>
""" """
@ -1735,15 +1736,20 @@ IoU: {iou:.4f}
# True Positives # True Positives
generate_html_report( generate_html_report(
true_positives, "True Positives - Documents matching both Reference and Hypothesis Rules", summary, os.path.join(args.output_dir, "true_positives.html") true_positives[:1000],
"True Positives - Documents matching both Reference and Hypothesis Rules",
summary,
os.path.join(args.output_dir, "true_positives.html"),
) )
# True Negatives # True Negatives
generate_html_report(true_negatives, "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html")) generate_html_report(
true_negatives[:1000], "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html")
)
# False Positives # False Positives
generate_html_report( generate_html_report(
false_positives, false_positives[:1000],
"False Positives - Documents matching Hypothesis but not Reference Rule", "False Positives - Documents matching Hypothesis but not Reference Rule",
summary, summary,
os.path.join(args.output_dir, "false_positives.html"), os.path.join(args.output_dir, "false_positives.html"),
@ -1751,7 +1757,7 @@ IoU: {iou:.4f}
# False Negatives # False Negatives
generate_html_report( generate_html_report(
false_negatives, false_negatives[:1000],
"False Negatives - Documents matching Reference but not Hypothesis Rule", "False Negatives - Documents matching Reference but not Hypothesis Rule",
summary, summary,
os.path.join(args.output_dir, "false_negatives.html"), os.path.join(args.output_dir, "false_negatives.html"),
@ -1879,6 +1885,20 @@ IoU: {iou:.4f}
logger.info(f"F1 Score: {f1:.4f}") logger.info(f"F1 Score: {f1:.4f}")
logger.info(f"IoU: {iou:.4f}") logger.info(f"IoU: {iou:.4f}")
# Output all available attributes that have been loaded
logger.info("\n--- AVAILABLE ATTRIBUTES ---")
all_attributes = set()
for doc in all_docs:
if "attributes" in doc and doc["attributes"]:
all_attributes.update(doc["attributes"].keys())
if all_attributes:
logger.info(f"Found {len(all_attributes)} unique attributes:")
for attr in sorted(all_attributes):
logger.info(f" - {attr}")
else:
logger.info("No attributes found in any documents.")
logger.info(f"\nResults saved to: {args.output_dir}/index.html") logger.info(f"\nResults saved to: {args.output_dir}/index.html")