mirror of https://github.com/allenai/olmocr.git
synced 2025-10-12 08:43:32 +00:00
Docling runner based on CLI, but it's too slow to use. PII rule fixes
This commit is contained in:
parent 74ef2b6f65
commit 2e8753af26
@@ -1,99 +1,76 @@
-import base64
+import asyncio
 import os
-from io import BytesIO
+import tempfile
 from typing import Literal
 
-import torch
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DocTagsDocument
-from PIL import Image
-from transformers import AutoModelForVision2Seq, AutoProcessor
-
-from olmocr.data.renderpdf import render_pdf_to_base64png
-
-_cached_model = None
-_cached_processor = None
+from pypdf import PdfReader, PdfWriter
 
 
-def init_model(model_name: str = "ds4sd/SmolDocling-256M-preview"):
-    """Initialize and cache the model and processor."""
-    global _cached_model, _cached_processor
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    if _cached_model is None:
-        processor = AutoProcessor.from_pretrained(model_name)
-        model = (
-            AutoModelForVision2Seq.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                # _attn_implementation="flash_attention_2" if device.type == "cuda" else "eager",
-                _attn_implementation="eager",
-            )
-            .eval()
-            .to(device)
-        )
-
-        _cached_model = model
-        _cached_processor = processor
-
-    return _cached_model, _cached_processor, device
-
-
-def run_docling(
+async def run_docling(
     pdf_path: str,
     page_num: int = 1,
-    model_name: str = "ds4sd/SmolDocling-256M-preview",
-    temperature: float = 0.1,
-    target_longest_image_dim: int = 1024,
-    output_format: Literal["markdown", "html", "doctags"] = "markdown",
+    output_format: Literal["markdown"] = "markdown",
+    use_smoldocling: bool = False,
 ) -> str:
-    # Initialize the model
-    model, processor, device = init_model(model_name)
-
-    # Convert PDF page to image
-    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
-    image = Image.open(BytesIO(base64.b64decode(image_base64)))
-
-    # Create input messages
-    messages = [
-        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Convert this page to docling."}]},
-    ]
-
-    # Prepare inputs
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=[image], return_tensors="pt")
-    inputs = inputs.to(device)
-
-    # Generate outputs
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **inputs,
-            max_new_tokens=8192,
-            temperature=temperature,
-            do_sample=temperature > 0,
-        )
-
-    # Process the generated output
-    prompt_length = inputs.input_ids.shape[1]
-    trimmed_generated_ids = generated_ids[:, prompt_length:]
-    doctags = processor.batch_decode(
-        trimmed_generated_ids,
-        skip_special_tokens=False,
-    )[0].lstrip()
-
-    # Create Docling document
-    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
-    doc = DoclingDocument(name=os.path.basename(pdf_path))
-    doc.load_from_doctags(doctags_doc)
-
-    # Generate output in the requested format
-    result = None
-
-    if output_format == "markdown":
-        result = doc.export_to_markdown()
-    elif output_format == "html":
-        result = doc.export_to_html()
-    elif output_format == "doctags":
-        result = doctags
-
-    return result
+    """Run docling CLI on a PDF file and return the results.
+
+    Args:
+        pdf_path: Path to the PDF file
+        page_num: Page number to process (1-indexed)
+        output_format: Output format (only markdown is supported for CLI version)
+
+    Returns:
+        String containing the markdown output
+    """
+    if output_format != "markdown":
+        raise ValueError("Only markdown output format is supported for CLI version")
+
+    # Extract the specific page using pypdf
+    pdf_reader = PdfReader(pdf_path)
+    pdf_writer = PdfWriter()
+
+    # Convert from 1-indexed to 0-indexed
+    zero_based_page_num = page_num - 1
+
+    if zero_based_page_num >= len(pdf_reader.pages) or zero_based_page_num < 0:
+        raise ValueError(f"Page number {page_num} is out of bounds for PDF with {len(pdf_reader.pages)} pages")
+
+    # Add the selected page to the writer
+    pdf_writer.add_page(pdf_reader.pages[zero_based_page_num])
+
+    # Create temporary files for the single-page PDF and output markdown
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf_file, tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp_md_file:
+        tmp_pdf_path = tmp_pdf_file.name
+        tmp_md_path = tmp_md_file.name
+
+    try:
+        # Write the single-page PDF to the temporary file
+        with open(tmp_pdf_path, "wb") as f:
+            pdf_writer.write(f)
+
+        # Build the command to run docling on the single-page PDF
+        if use_smoldocling:
+            cmd = ["docling", "--pipeline", "vlm", "--vlm-model", "smoldocling", tmp_pdf_path, "-o", tmp_md_path]  # SmolDocling VLM pipeline
+        else:
+            cmd = ["docling", tmp_pdf_path, "-o", tmp_md_path]  # default docling pipeline
+
+        # Run the command asynchronously
+        proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
+
+        stdout, stderr = await proc.communicate()
+
+        if proc.returncode != 0:
+            error_msg = stderr.decode() if stderr else "Unknown error"
+            raise RuntimeError(f"docling command failed with return code {proc.returncode}: {error_msg}")
+
+        # Read the results from the temporary markdown file
+        with open(tmp_md_path, "r", encoding="utf-8") as f:
+            result = f.read()
+
+        return result
+
+    finally:
+        # Clean up the temporary files
+        for path in [tmp_pdf_path, tmp_md_path]:
+            if os.path.exists(path):
+                os.unlink(path)
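For reference, a minimal sketch of how the new coroutine could be driven from synchronous code. The import path is an assumption, since the diff does not show the file's location, and the docling CLI must be on PATH:

    import asyncio

    # Assumed module path, for illustration only.
    from olmocr.bench.runners.run_docling import run_docling

    async def main():
        # Convert page 3 of a PDF to markdown via the SmolDocling VLM pipeline.
        md = await run_docling("paper.pdf", page_num=3, use_smoldocling=True)
        print(md[:500])

    asyncio.run(main())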
@@ -2,7 +2,7 @@ _MAJOR = "0"
 _MINOR = "1"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "66"
+_PATCH = "67"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
12  scripts/check_qual.sh  Executable file
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+
+python scripts/pii_rule_comparison.py \
+    --docs-folder /home/ubuntu/s2pdf_dedupe_minhash_v1_with_no_pii/documents \
+    --ref-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.5" \
+    --hyp-rule "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en:avg>0.4" \
+    --output-dir results/pii_detection
+
+
+tinyhost results/pii_detection/*
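The two rules differ only in threshold: the reference rule requires an average English language-ID score above 0.5, the hypothesis rule above 0.4, so the run measures what relaxing the cutoff does to precision and recall. Roughly, a rule of the form "attribute:avg>threshold" matches a document when the mean score for that attribute exceeds the threshold; a minimal sketch of that check, assuming Dolma-style [start, end, score] attribute spans (the real parser lives in scripts/pii_rule_comparison.py):

    def rule_matches(doc: dict, attr: str, threshold: float) -> bool:
        # Average the score field of each [start, end, score] span for the attribute.
        spans = doc.get("attributes", {}).get(attr, [])
        if not spans:
            return False
        avg = sum(span[2] for span in spans) / len(spans)
        return avg > threshold

    doc = {"attributes": {"ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en": [[0, 1200, 0.62]]}}
    ref = rule_matches(doc, "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en", 0.5)  # reference rule: True
    hyp = rule_matches(doc, "ft_lang_id_en_doc_v2__ft_lang_id_en_doc_v2__en", 0.4)  # hypothesis rule: True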
@@ -38,6 +38,7 @@ Rule expression syntax:
 
 import argparse
 import gzip
+import html as pyhtml
 import io
 import json
 import logging
@@ -1482,7 +1483,7 @@ def generate_html_report(docs, title, summary, output_path):
         html += f"""
         <div id="doc-{i}" class="document{selected_class}" tabindex="0">
             <div class="document-id">Document ID: {doc_id}</div>
-            <div class="document-text">{doc_text}</div>
+            <pre class="document-text">{pyhtml.escape(doc_text)}</pre>
         </div>
         """
 
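Switching from a raw <div> to an escaped <pre> keeps document text from being interpreted as markup in the report, which matters for scraped content full of angle brackets and ampersands. The standard library's html.escape handles this; for example:

    import html as pyhtml

    doc_text = "Contact: <admin@example.com> & <script>alert(1)</script>"
    print(pyhtml.escape(doc_text))
    # Contact: &lt;admin@example.com&gt; &amp; &lt;script&gt;alert(1)&lt;/script&gt;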
@@ -1735,15 +1736,20 @@ IoU: {iou:.4f}
 
     # True Positives
     generate_html_report(
-        true_positives, "True Positives - Documents matching both Reference and Hypothesis Rules", summary, os.path.join(args.output_dir, "true_positives.html")
+        true_positives[:1000],
+        "True Positives - Documents matching both Reference and Hypothesis Rules",
+        summary,
+        os.path.join(args.output_dir, "true_positives.html"),
     )
 
     # True Negatives
-    generate_html_report(true_negatives, "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html"))
+    generate_html_report(
+        true_negatives[:1000], "True Negatives - Documents not matching either Rule", summary, os.path.join(args.output_dir, "true_negatives.html")
+    )
 
     # False Positives
     generate_html_report(
-        false_positives,
+        false_positives[:1000],
         "False Positives - Documents matching Hypothesis but not Reference Rule",
         summary,
         os.path.join(args.output_dir, "false_positives.html"),
@@ -1751,7 +1757,7 @@ IoU: {iou:.4f}
 
     # False Negatives
     generate_html_report(
-        false_negatives,
+        false_negatives[:1000],
         "False Negatives - Documents matching Reference but not Hypothesis Rule",
         summary,
         os.path.join(args.output_dir, "false_negatives.html"),
@@ -1879,6 +1885,20 @@ IoU: {iou:.4f}
     logger.info(f"F1 Score: {f1:.4f}")
     logger.info(f"IoU: {iou:.4f}")
 
+    # Output all available attributes that have been loaded
+    logger.info("\n--- AVAILABLE ATTRIBUTES ---")
+    all_attributes = set()
+    for doc in all_docs:
+        if "attributes" in doc and doc["attributes"]:
+            all_attributes.update(doc["attributes"].keys())
+
+    if all_attributes:
+        logger.info(f"Found {len(all_attributes)} unique attributes:")
+        for attr in sorted(all_attributes):
+            logger.info(f"  - {attr}")
+    else:
+        logger.info("No attributes found in any documents.")
+
     logger.info(f"\nResults saved to: {args.output_dir}/index.html")
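For context, the logged metrics follow directly from the sizes of the four report buckets (counted before the [:1000] display cap); a quick sketch of the arithmetic, assuming the standard definitions:

    def confusion_metrics(tp: int, fp: int, fn: int) -> dict:
        # Precision: share of hypothesis-matched docs the reference rule also matched.
        precision = tp / (tp + fp) if tp + fp else 0.0
        # Recall: share of reference-matched docs the hypothesis rule recovered.
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        # IoU of the two matched-document sets: |ref & hyp| / |ref | hyp|.
        iou = tp / (tp + fp + fn) if tp + fp + fn else 0.0
        return {"precision": precision, "recall": recall, "f1": f1, "iou": iou}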