mirror of https://github.com/allenai/olmocr.git
synced 2025-10-12 08:43:32 +00:00
Docling runner based on CLI, but it's too slow to use. PII rule fixes
This commit is contained in:
parent 74ef2b6f65
commit 2e8753af26
@@ -1,99 +1,76 @@
-import base64
+import asyncio
 import os
-from io import BytesIO
+import tempfile
 from typing import Literal
 
-import torch
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DocTagsDocument
-from PIL import Image
-from transformers import AutoModelForVision2Seq, AutoProcessor
-
-from olmocr.data.renderpdf import render_pdf_to_base64png
-
-_cached_model = None
-_cached_processor = None
+from pypdf import PdfReader, PdfWriter
 
 
-def init_model(model_name: str = "ds4sd/SmolDocling-256M-preview"):
-    """Initialize and cache the model and processor."""
-    global _cached_model, _cached_processor
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    if _cached_model is None:
-        processor = AutoProcessor.from_pretrained(model_name)
-        model = (
-            AutoModelForVision2Seq.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                # _attn_implementation="flash_attention_2" if device.type == "cuda" else "eager",
-                _attn_implementation="eager",
-            )
-            .eval()
-            .to(device)
-        )
-
-        _cached_model = model
-        _cached_processor = processor
-
-    return _cached_model, _cached_processor, device
-
-
-def run_docling(
+async def run_docling(
     pdf_path: str,
     page_num: int = 1,
-    model_name: str = "ds4sd/SmolDocling-256M-preview",
-    temperature: float = 0.1,
-    target_longest_image_dim: int = 1024,
-    output_format: Literal["markdown", "html", "doctags"] = "markdown",
+    output_format: Literal["markdown"] = "markdown",
+    use_smoldocling: bool = False,
 ) -> str:
-    # Initialize the model
-    model, processor, device = init_model(model_name)
+    """Run the docling CLI on a PDF file and return the results.
 
-    # Convert PDF page to image
-    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
-    image = Image.open(BytesIO(base64.b64decode(image_base64)))
+    Args:
+        pdf_path: Path to the PDF file
+        page_num: Page number to process (1-indexed)
+        output_format: Output format (only markdown is supported for the CLI version)
 
-    # Create input messages
-    messages = [
-        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]},
-    ]
+    Returns:
+        String containing the markdown output
+    """
+    if output_format != "markdown":
+        raise ValueError("Only markdown output format is supported for CLI version")
 
-    # Prepare inputs
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=[image], return_tensors="pt")
-    inputs = inputs.to(device)
+    # Extract the specific page using pypdf
+    pdf_reader = PdfReader(pdf_path)
+    pdf_writer = PdfWriter()
 
-    # Generate outputs
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **inputs,
-            max_new_tokens=8192,
-            temperature=temperature,
-            do_sample=temperature > 0,
-        )
+    # Convert from 1-indexed to 0-indexed
+    zero_based_page_num = page_num - 1
 
-    # Process the generated output
-    prompt_length = inputs.input_ids.shape[1]
-    trimmed_generated_ids = generated_ids[:, prompt_length:]
-    doctags = processor.batch_decode(
-        trimmed_generated_ids,
-        skip_special_tokens=False,
-    )[0].lstrip()
+    if zero_based_page_num >= len(pdf_reader.pages) or zero_based_page_num < 0:
+        raise ValueError(f"Page number {page_num} is out of bounds for PDF with {len(pdf_reader.pages)} pages")
 
-    # Create Docling document
-    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
-    doc = DoclingDocument(name=os.path.basename(pdf_path))
-    doc.load_from_doctags(doctags_doc)
+    # Add the selected page to the writer
+    pdf_writer.add_page(pdf_reader.pages[zero_based_page_num])
 
-    # Generate output in the requested format
-    result = None
-    if output_format == "markdown":
-        result = doc.export_to_markdown()
-    elif output_format == "html":
-        result = doc.export_to_html()
-    elif output_format == "doctags":
-        result = doctags
+    # Create temporary files for the single-page PDF and output markdown
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf_file, tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp_md_file:
+        tmp_pdf_path = tmp_pdf_file.name
+        tmp_md_path = tmp_md_file.name
 
-    return result
+    try:
+        # Write the single-page PDF to the temporary file
+        with open(tmp_pdf_path, "wb") as f:
+            pdf_writer.write(f)
+
+        # Build the command to run docling on the single-page PDF
+        if use_smoldocling:
+            cmd = ["docling", "--pipeline", "vlm", "--vlm-model", "smoldocling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
+        else:
+            cmd = ["docling", tmp_pdf_path, "-o", tmp_md_path]  # Output file
+
+        # Run the command asynchronously
+        proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
+
+        stdout, stderr = await proc.communicate()
+
+        if proc.returncode != 0:
+            error_msg = stderr.decode() if stderr else "Unknown error"
+            raise RuntimeError(f"docling command failed with return code {proc.returncode}: {error_msg}")
+
+        # Read the results from the temporary markdown file
+        with open(tmp_md_path, "r", encoding="utf-8") as f:
+            result = f.read()
+
+        return result
+
+    finally:
+        # Clean up the temporary files
+        for path in [tmp_pdf_path, tmp_md_path]:
+            if os.path.exists(path):
+                os.unlink(path)
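For reference, a minimal sketch of how the new coroutine could be driven from a script. The import path and the sample file name are assumptions, and the docling CLI must be installed and on PATH:

import asyncio

# Assumed module path; adjust to wherever run_docling lives in the repo.
from olmocr.bench.runners.run_docling import run_docling


async def main() -> None:
    # Convert page 1 of a hypothetical sample.pdf to markdown via the docling CLI.
    markdown = await run_docling("sample.pdf", page_num=1)
    print(markdown[:500])


asyncio.run(main())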
@@ -2,7 +2,7 @@ _MAJOR = "0"
 _MINOR = "1"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "66"
+_PATCH = "67"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
scripts/check_qual.sh (new executable file, 12 lines)
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+
+python scripts/pii_rule_comparison.py \
+    --docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
+    --ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5" \
+    --hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.4" \
+    --output-dir results/pii_detection
+
+
+tinyhost results/pii_detection/*
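The --ref-rule and --hyp-rule arguments follow an attribute:aggregation>threshold pattern. Below is a hedged sketch of how such an expression could be parsed and evaluated; the actual parser in scripts/pii_rule_comparison.py may differ:

import re
from statistics import mean


def rule_matches(rule: str, attributes: dict[str, list[float]]) -> bool:
    # Split "name:agg>threshold" into attribute name, aggregator, operator, threshold.
    m = re.fullmatch(r"(?P<attr>[^:]+):(?P<agg>avg|max|min)(?P<op>[<>])(?P<thr>[\d.]+)", rule)
    if m is None:
        raise ValueError(f"Unparseable rule: {rule}")
    values = attributes.get(m["attr"], [])
    if not values:
        return False
    agg = {"avg": mean, "max": max, "min": min}[m["agg"]](values)
    thr = float(m["thr"])
    return agg > thr if m["op"] == ">" else agg < thr


# mean([0.7, 0.6]) = 0.65 > 0.5, so the reference rule matches this document.
print(rule_matches("ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5",
                   {"ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en": [0.7, 0.6]}))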
@@ -38,6 +38,7 @@ Rule expression syntax:
 
 import argparse
 import gzip
+import html as pyhtml
 import io
 import json
 import logging
@@ -1482,7 +1483,7 @@ def generate_html_report(docs, title, summary, output_path):
         html += f"""
         <div id="doc-{i}" class="document{selected_class}" tabindex="0">
             <div class="document-id">Document ID: {doc_id}</div>
-            <div class="document-text">{doc_text}</div>
+            <pre class="document-text">{pyhtml.escape(doc_text)}</pre>
         </div>
         """
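The switch from raw interpolation in a <div> to pyhtml.escape inside a <pre> keeps document text from being interpreted as HTML in the report. A quick illustration of what the escaping does:

import html as pyhtml

doc_text = "<script>alert('x')</script> & plain text"
# Escapes <, >, &, and quotes so the text renders literally in the report.
print(pyhtml.escape(doc_text))
# &lt;script&gt;alert(&#x27;x&#x27;)&lt;/script&gt; &amp; plain text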
@@ -1735,15 +1736,20 @@ IoU: {iou:.4f}
 
     # True Positives
     generate_html_report(
-        true_positives, "True Positives - Documents matching both Reference and Hypothesis Rules", summary, os.path.join(args.output_dir, "true_positives.html")
+        true_positives[:1000],
+        "True Positives - Documents matching both Reference and Hypothesis Rules",
+        summary,
+        os.path.join(args.output_dir, "true_positives.html"),
     )
 
     # True Negatives
-    generate_html_report(true_negatives, "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html"))
+    generate_html_report(
+        true_negatives[:1000], "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html")
+    )
 
     # False Positives
     generate_html_report(
-        false_positives,
+        false_positives[:1000],
         "False Positives - Documents matching Hypothesis but not Reference Rule",
         summary,
         os.path.join(args.output_dir, "false_positives.html"),
@@ -1751,7 +1757,7 @@ IoU: {iou:.4f}
 
     # False Negatives
     generate_html_report(
-        false_negatives,
+        false_negatives[:1000],
         "False Negatives - Documents matching Reference but not Hypothesis Rule",
         summary,
         os.path.join(args.output_dir, "false_negatives.html"),
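Each report is now capped at the first 1000 matching documents, which bounds the size of the generated HTML. Python slicing makes the cap safe even when fewer documents matched:

docs = ["doc-a", "doc-b"]  # fewer than 1000 matches
print(docs[:1000])  # ['doc-a', 'doc-b']; slicing past the end never raises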
@@ -1879,6 +1885,20 @@ IoU: {iou:.4f}
     logger.info(f"F1 Score: {f1:.4f}")
     logger.info(f"IoU: {iou:.4f}")
 
+    # Output all available attributes that have been loaded
+    logger.info("\n--- AVAILABLE ATTRIBUTES ---")
+    all_attributes = set()
+    for doc in all_docs:
+        if "attributes" in doc and doc["attributes"]:
+            all_attributes.update(doc["attributes"].keys())
+
+    if all_attributes:
+        logger.info(f"Found {len(all_attributes)} unique attributes:")
+        for attr in sorted(all_attributes):
+            logger.info(f"  - {attr}")
+    else:
+        logger.info("No attributes found in any documents.")
+
     logger.info(f"\nResults saved to: {args.output_dir}/index.html")
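For context, a hedged sketch of the document shape the new attribute summary expects; the field names besides "attributes" are assumptions:

# Hypothetical documents; the loop only reads the optional "attributes" mapping.
all_docs = [
    {"id": "doc-1", "attributes": {"ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en": [0.9]}},
    {"id": "doc-2"},  # documents without attributes are skipped
]

all_attributes = set()
for doc in all_docs:
    if "attributes" in doc and doc["attributes"]:
        all_attributes.update(doc["attributes"].keys())

print(sorted(all_attributes))  # ['ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en']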