mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-13 17:22:13 +00:00
Merge branch 'main' of https://github.com/allenai/olmocr
This commit is contained in:
commit
f1188dc85d
@ -132,7 +132,7 @@ def detect_headers_footers(pdf_path: str, page_num: int, api_key: str) -> Option
|
||||
parts=[
|
||||
image_part,
|
||||
types.Part.from_text(
|
||||
text="""Please tell me which text in this image is part of any headers/footers and would therefore be skipped it someone were reading it outloud to another person. Include page numbers and document-level headers and footers, but not inner subsections."""
|
||||
text="""Please extract and display the complete text from the document without omission. Include all sections and ensure nothing is summarized or abbreviated. I want the entire text of the document at any cost. Do not hallucinate"""
|
||||
),
|
||||
],
|
||||
),
|
||||
|
407
olmocr/bench/miners/mine_multi_column.py
Normal file
407
olmocr/bench/miners/mine_multi_column.py
Normal file
@ -0,0 +1,407 @@
|
||||
"""
|
||||
mine_multi_column.py - Extract text from PDF documents which has multiple columns.
|
||||
|
||||
This script:
|
||||
1. Takes a file containing folder path which contains PDF documents as input
|
||||
2. Process each PDF to generate an HTML representation
|
||||
3. For each PDF, it renders to an image
|
||||
4. Uses Claude Sonnet to identify text from multiple columns in the rendered image
|
||||
5. Creates a test file asserting that the order (before/after) of text should appear
|
||||
6. Extracts the page from the PDF and saves it to an output folder
|
||||
|
||||
Usage:
|
||||
python mine_headers_footers.py --input_dir path/to/pdfs --output_dir path/to/output --api_key your_anthropic_api_key
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
import uuid
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import Dict, List
|
||||
|
||||
import pypdf
|
||||
from anthropic import Anthropic
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.async_api import async_playwright
|
||||
from syntok.segmenter import process
|
||||
from tqdm import tqdm
|
||||
|
||||
from olmocr.data.renderpdf import (
|
||||
get_png_dimensions_from_base64,
|
||||
render_pdf_to_base64png,
|
||||
)
|
||||
|
||||
|
||||
def extract_code_block(initial_response):
|
||||
html_blocks = re.findall(r"```html\n(.*?)```", initial_response, re.DOTALL)
|
||||
if html_blocks:
|
||||
return html_blocks[-1].strip()
|
||||
code_blocks = re.findall(r"```\n(.*?)```", initial_response, re.DOTALL)
|
||||
if code_blocks:
|
||||
return code_blocks[-1].strip()
|
||||
html_blocks_no_newline = re.findall(r"```html(.*?)```", initial_response, re.DOTALL)
|
||||
if html_blocks_no_newline:
|
||||
return html_blocks_no_newline[-1].strip()
|
||||
code_blocks_no_newline = re.findall(r"```(.*?)```", initial_response, re.DOTALL)
|
||||
if code_blocks_no_newline:
|
||||
return code_blocks_no_newline[-1].strip()
|
||||
return None
|
||||
|
||||
|
||||
def generate_html_from_image(client, image_base64):
|
||||
"""Call Claude API to generate HTML from an image using a multi-step prompting strategy."""
|
||||
png_width, png_height = get_png_dimensions_from_base64(image_base64)
|
||||
try:
|
||||
analysis_response = client.messages.create(
|
||||
model="claude-3-7-sonnet-20250219",
|
||||
max_tokens=2000,
|
||||
temperature=0.1,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
"Analyze this document and provide a detailed assessment of its structure. "
|
||||
"Focus on the layout, headings, footers, and any complex formatting. Please be precise."
|
||||
),
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
analysis_text = ""
|
||||
for content in analysis_response.content:
|
||||
if content.type == "text":
|
||||
analysis_text += content.text
|
||||
|
||||
initial_response = client.messages.create(
|
||||
model="claude-3-7-sonnet-20250219",
|
||||
max_tokens=6000,
|
||||
temperature=0.2,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
"Render this document as clean, semantic HTML. Here is the analysis of the document structure:\n\n"
|
||||
f"{analysis_text}\n\n"
|
||||
"Requirements:\n"
|
||||
"1. Use appropriate HTML tags for headings, paragraphs, and lists.\n"
|
||||
"2. Use <header> and <footer> for top and bottom content.\n"
|
||||
"3. For images, use a placeholder <div> with class 'image'.\n"
|
||||
"4. Render math equations inline using \\( \\) or \\[ \\].\n"
|
||||
"5. Preserve any multi-column layout using CSS flexbox or grid.\n"
|
||||
f"6. The viewport is fixed at {png_width // 2}x{png_height // 2} pixels.\n\n"
|
||||
"Enclose your HTML in a ```html code block."
|
||||
),
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
initial_html = ""
|
||||
for content in initial_response.content:
|
||||
if content.type == "text":
|
||||
initial_html += content.text
|
||||
|
||||
return extract_code_block(initial_html)
|
||||
except Exception as e:
|
||||
print(f"Error calling Claude API: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> List[Dict]:
|
||||
"""
|
||||
Generate order tests from HTML content by splitting the main text into sentences and pairing them.
|
||||
Only order tests are generated.
|
||||
"""
|
||||
tests = []
|
||||
pdf_filename = pdf_id
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
full_text = soup.get_text(separator=" ").strip()
|
||||
|
||||
sentences = []
|
||||
for paragraph in process(full_text):
|
||||
for sentence in paragraph:
|
||||
sentence_str = "".join(token.spacing + token.value for token in sentence).strip()
|
||||
if sentence_str:
|
||||
sentences.append(sentence_str)
|
||||
|
||||
if len(sentences) < 2:
|
||||
return tests
|
||||
|
||||
all_indexes = list(range(len(sentences)))
|
||||
random.shuffle(all_indexes)
|
||||
random_pairs = [(all_indexes[i * 2], all_indexes[i * 2 + 1]) for i in range(len(all_indexes) // 2)]
|
||||
random_pairs = [(min(i, j), max(i, j)) for (i, j) in random_pairs]
|
||||
|
||||
num_order_tests = 0
|
||||
for i, j in random_pairs:
|
||||
first_sentence = sentences[i]
|
||||
second_sentence = sentences[j]
|
||||
if len(first_sentence) < 10 or len(second_sentence) < 10:
|
||||
continue
|
||||
first_sentence = first_sentence.split("\n")[0].strip()
|
||||
second_sentence = second_sentence.split("\n")[0].strip()
|
||||
max_diffs = round(max(len(first_sentence), len(second_sentence)) * 0.05)
|
||||
if max_diffs > len(first_sentence) // 2 or max_diffs > len(second_sentence) // 2:
|
||||
continue
|
||||
tests.append(
|
||||
{
|
||||
"pdf": pdf_filename,
|
||||
"page": page_num,
|
||||
"id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}",
|
||||
"type": "order",
|
||||
"before": first_sentence,
|
||||
"after": second_sentence,
|
||||
"max_diffs": max_diffs,
|
||||
}
|
||||
)
|
||||
num_order_tests += 1
|
||||
if num_order_tests >= 5:
|
||||
break
|
||||
|
||||
return tests
|
||||
|
||||
|
||||
async def render_pdf_with_playwright(html_content, output_pdf_path, png_width, png_height):
|
||||
"""
|
||||
Render HTML content using Playwright and save it as a PDF.
|
||||
Tries different scale factors until the output PDF has exactly one page.
|
||||
"""
|
||||
scale_factors = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5]
|
||||
for scale in scale_factors:
|
||||
try:
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch()
|
||||
page = await browser.new_page(viewport={"width": int(png_width // 2 * scale), "height": int(png_height // 2 * scale)})
|
||||
await page.set_content(html_content)
|
||||
|
||||
# Add KaTeX assets for math rendering
|
||||
katex_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "katex")
|
||||
katex_css_path = os.path.join(katex_dir, "katex.min.css")
|
||||
katex_js_path = os.path.join(katex_dir, "katex.min.js")
|
||||
katex_autorender_js_path = os.path.join(katex_dir, "auto-render.min.js")
|
||||
|
||||
await page.add_style_tag(path=katex_css_path)
|
||||
await page.add_script_tag(path=katex_js_path)
|
||||
await page.add_script_tag(path=katex_autorender_js_path)
|
||||
|
||||
await page.evaluate(
|
||||
"""
|
||||
renderMathInElement(document.body, {
|
||||
delimiters: [
|
||||
{left: '\\\$begin:math:text$', right: '\\\\\\$end:math:text$', display: false},
|
||||
{left: '\\\$begin:math:display$', right: '\\\\\\$end:math:display$', display: true}
|
||||
],
|
||||
throwOnError: false
|
||||
});
|
||||
"""
|
||||
)
|
||||
|
||||
await page.pdf(path=output_pdf_path, scale=scale, print_background=True)
|
||||
await browser.close()
|
||||
|
||||
try:
|
||||
reader = pypdf.PdfReader(output_pdf_path)
|
||||
if len(reader.pages) == 1:
|
||||
print(f"Successfully rendered as a single page PDF with scale factor {scale}")
|
||||
return True
|
||||
else:
|
||||
print(f"PDF has {len(reader.pages)} pages with scale factor {scale}, trying a smaller scale...")
|
||||
except Exception as pdf_check_error:
|
||||
print(f"Error checking PDF page count: {pdf_check_error}")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error rendering PDF with Playwright at scale {scale}: {e}")
|
||||
print("Failed to render PDF as a single page with any scale factor")
|
||||
return False
|
||||
|
||||
|
||||
def process_pdf(pdf_info, args, client):
|
||||
"""
|
||||
Process a single PDF from a local folder:
|
||||
- Select a random page from the PDF.
|
||||
- Render that page as a base64 PNG.
|
||||
- Generate HTML from the image using Claude.
|
||||
- Optionally render the HTML to a PDF using Playwright.
|
||||
- Generate order tests from the HTML.
|
||||
"""
|
||||
local_pdf_path, index = pdf_info
|
||||
original_pdf_name = os.path.basename(local_pdf_path)
|
||||
pdf_id = original_pdf_name
|
||||
temp_pdf_dir = os.path.join(args.temp_dir, f"{os.path.splitext(original_pdf_name)[0]}")
|
||||
os.makedirs(temp_pdf_dir, exist_ok=True)
|
||||
|
||||
try:
|
||||
reader = pypdf.PdfReader(local_pdf_path)
|
||||
num_pages = len(reader.pages)
|
||||
if num_pages == 0:
|
||||
print(f"PDF has no pages: {local_pdf_path}")
|
||||
return None
|
||||
page_num = random.randint(1, num_pages)
|
||||
except Exception as e:
|
||||
print(f"Error reading {local_pdf_path}: {e}")
|
||||
return None
|
||||
|
||||
try:
|
||||
image_base64 = render_pdf_to_base64png(local_pdf_path, page_num, target_longest_image_dim=2048)
|
||||
except Exception as e:
|
||||
print(f"Error rendering page {page_num} from {local_pdf_path}: {e}")
|
||||
return None
|
||||
|
||||
try:
|
||||
html_content = generate_html_from_image(client, image_base64)
|
||||
if not html_content:
|
||||
print(f"Failed to generate HTML for {local_pdf_path}, page {page_num}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Error generating HTML for {local_pdf_path}: {e}")
|
||||
return None
|
||||
|
||||
html_dir = os.path.join(args.output_dir, "html")
|
||||
pdfs_dir = os.path.join(args.output_dir, "pdfs")
|
||||
os.makedirs(html_dir, exist_ok=True)
|
||||
os.makedirs(pdfs_dir, exist_ok=True)
|
||||
|
||||
html_path = os.path.join(html_dir, f"{os.path.splitext(original_pdf_name)[0]}_page{page_num}.html")
|
||||
try:
|
||||
with open(html_path, "w") as f:
|
||||
f.write(html_content)
|
||||
except Exception as e:
|
||||
print(f"Error saving HTML for {local_pdf_path}: {e}")
|
||||
return None
|
||||
|
||||
playwright_pdf_path = None
|
||||
render_success = False
|
||||
playwright_pdf_filename = f"{os.path.splitext(original_pdf_name)[0]}_page{page_num}.pdf"
|
||||
if not args.skip_playwright:
|
||||
playwright_pdf_path = os.path.join(pdfs_dir, playwright_pdf_filename)
|
||||
try:
|
||||
png_width, png_height = get_png_dimensions_from_base64(image_base64)
|
||||
render_success = asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
|
||||
if render_success:
|
||||
print(f"Successfully rendered with Playwright: {playwright_pdf_path}")
|
||||
else:
|
||||
print(f"Failed to render as a single page PDF: {playwright_pdf_path}")
|
||||
playwright_pdf_path = None
|
||||
except Exception as e:
|
||||
print(f"Failed to render with Playwright: {e}")
|
||||
playwright_pdf_path = None
|
||||
render_success = False
|
||||
if not args.skip_playwright and not render_success:
|
||||
return None
|
||||
|
||||
tests = generate_tests_from_html(html_content, pdf_id, page_num)
|
||||
# IMPORTANT: Preserve the original PDF filename in the tests.
|
||||
for test in tests:
|
||||
test["pdf"] = original_pdf_name
|
||||
|
||||
try:
|
||||
if os.path.exists(temp_pdf_dir):
|
||||
subprocess.run(["rm", "-rf", temp_pdf_dir])
|
||||
except Exception as e:
|
||||
print(f"Error cleaning up temp directory {temp_pdf_dir}: {e}")
|
||||
|
||||
return {
|
||||
"pdf_id": pdf_id,
|
||||
"pdf_path": local_pdf_path,
|
||||
"page_number": page_num,
|
||||
"html_path": html_path,
|
||||
"playwright_pdf_path": playwright_pdf_path,
|
||||
"tests": tests,
|
||||
"num_tests": len(tests),
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Convert PDFs in a folder to HTML templates and render with Playwright (order tests only)")
|
||||
parser.add_argument("--input_dir", required=True, help="Folder containing PDF files")
|
||||
parser.add_argument("--output_dir", required=True, help="Directory to store HTML and tests")
|
||||
parser.add_argument("--temp_dir", default="/tmp/mine_tables", help="Directory for temporary files")
|
||||
parser.add_argument("--max_tests", type=int, default=100, help="Maximum number of PDFs to process (randomly selected)")
|
||||
parser.add_argument("--parallel", type=int, default=1, help="Number of parallel threads to use")
|
||||
parser.add_argument("--api_key", help="Claude API key (or set ANTHROPIC_API_KEY environment variable)")
|
||||
parser.add_argument("--skip_playwright", action="store_true", help="Skip Playwright PDF rendering")
|
||||
parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
|
||||
args = parser.parse_args()
|
||||
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
os.makedirs(args.temp_dir, exist_ok=True)
|
||||
|
||||
api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
|
||||
if not api_key:
|
||||
print("Error: API key not provided. Use --api_key or set ANTHROPIC_API_KEY environment variable.")
|
||||
return
|
||||
|
||||
client = Anthropic(api_key=api_key)
|
||||
|
||||
pdf_paths = []
|
||||
for root, dirs, files in os.walk(args.input_dir):
|
||||
for file in files:
|
||||
if file.lower().endswith(".pdf"):
|
||||
pdf_paths.append(os.path.join(root, file))
|
||||
print(f"Found {len(pdf_paths)} PDF files in {args.input_dir}")
|
||||
|
||||
random.shuffle(pdf_paths)
|
||||
pdf_paths = pdf_paths[: args.max_tests]
|
||||
|
||||
synthetic_json_path = os.path.join(args.output_dir, "synthetic.jsonl")
|
||||
open(synthetic_json_path, "w").close()
|
||||
|
||||
test_counter = 0
|
||||
test_types = {"order": 0}
|
||||
results = []
|
||||
|
||||
import threading
|
||||
|
||||
file_lock = threading.Lock()
|
||||
|
||||
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
|
||||
futures = {executor.submit(process_pdf, (pdf_path, i), args, client): pdf_path for i, pdf_path in enumerate(pdf_paths)}
|
||||
for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing PDFs"):
|
||||
pdf_path = futures[future]
|
||||
try:
|
||||
result = future.result()
|
||||
if result and result.get("tests"):
|
||||
results.append(result)
|
||||
with file_lock:
|
||||
with open(synthetic_json_path, "a") as f:
|
||||
for test in result["tests"]:
|
||||
f.write(json.dumps(test) + "\n")
|
||||
test_counter += len(result["tests"])
|
||||
for test in result["tests"]:
|
||||
if test.get("type") == "order":
|
||||
test_types["order"] += 1
|
||||
print(f"Added {len(result['tests'])} tests from {result['pdf_id']}, total: {test_counter}")
|
||||
except Exception as e:
|
||||
print(f"Error processing {pdf_path}: {e}")
|
||||
|
||||
print(f"Generated {len(results)} HTML templates")
|
||||
if not args.skip_playwright:
|
||||
playwright_success = sum(1 for r in results if r and r.get("playwright_pdf_path"))
|
||||
print(f"Playwright PDF rendering: {playwright_success}/{len(results)} successful")
|
||||
print(f"Saved {test_counter} tests to {synthetic_json_path}")
|
||||
print(f"Generated a total of {test_counter} tests across {len(results)} templates")
|
||||
print("Test type distribution:")
|
||||
for test_type, count in test_types.items():
|
||||
print(f" - {test_type}: {count} tests")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
55
scripts/jsonl_to_markdown.py
Normal file
55
scripts/jsonl_to_markdown.py
Normal file
@ -0,0 +1,55 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
# This is a simple script to convert JSONL files to Markdown format.
|
||||
# It reads each line of the JSONL file, extracts the 'text' field,
|
||||
# and saves it as a Markdown file with the line number as the filename.
|
||||
# The script also handles potential JSON decoding errors and prints relevant messages.
|
||||
def jsonl_to_markdown(input_file, output_dir):
|
||||
"""
|
||||
Reads a JSONL file, extracts the 'text' field from each line, and saves it as a Markdown file.
|
||||
|
||||
Args:
|
||||
input_file (str): Path to the input JSONL file.
|
||||
output_dir (str): Directory to save the Markdown files.
|
||||
"""
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
with open(input_file, 'r', encoding='utf-8') as file:
|
||||
for i, line in enumerate(file):
|
||||
try:
|
||||
# Parse the JSON line
|
||||
data = json.loads(line)
|
||||
text_content = data.get("text", "")
|
||||
|
||||
# Save to a Markdown file
|
||||
output_file = os.path.join(output_dir, f"line_{i + 1}.md")
|
||||
with open(output_file, 'w', encoding='utf-8') as md_file:
|
||||
md_file.write(text_content)
|
||||
|
||||
print(f"Extracted and saved line {i + 1} to {output_file}")
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"Error decoding JSON on line {i + 1}: {e}")
|
||||
except Exception as e:
|
||||
print(f"Unexpected error on line {i + 1}: {e}")
|
||||
|
||||
# Example usage
|
||||
# input_jsonl_file = "/path/to/test.jsonl" # Replace with the actual path to your JSONL file
|
||||
# output_directory = "/path/to/output_markdown" # Replace with the desired output directory
|
||||
# jsonl_to_markdown(input_jsonl_file, output_directory)
|
||||
|
||||
# This is the main entrypoint to use the script from the command line.
|
||||
# It takes two arguments: the input JSONL file and the output directory.
|
||||
# The script will create the output directory if it does not exist.
|
||||
if __name__ == "__main__":
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
print("Usage: python jsonl_to_markdown.py <input_file> <output_dir>")
|
||||
sys.exit(1)
|
||||
|
||||
input_file = sys.argv[1]
|
||||
output_dir = sys.argv[2]
|
||||
|
||||
jsonl_to_markdown(input_file, output_dir)
|
Loading…
x
Reference in New Issue
Block a user