Fixes for multipage runners

This commit is contained in:
Jake Poznanski 2025-03-12 10:29:49 -07:00
parent 743e48e4ad
commit 8b3a9e4201
9 changed files with 59 additions and 29 deletions

View File

@ -4,7 +4,7 @@ This script runs olmocr bench.
It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check.
It will then validate the JSONL files to make sure they are all valid.
Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate.
We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _1.md, _2.md, etc.)
We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _pg{page}_repeat{repeat}.md)
corresponding to its parse for every .pdf in the /pdfs folder.
Then, we will read each one, and check if they pass against all the rules.
If a rule fails on some of the repeats, a short explanation is printed.
@ -30,7 +30,7 @@ def evaluate_candidate(
) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]:
"""
For the candidate folder (pipeline tool output), validate that it contains at least one .md file
(i.e. repeated generations like _1.md, _2.md, etc.) for every PDF in the pdf folder.
(i.e. repeated generations like _pg{page}_repeat{repeat}.md) for every PDF in the pdf folder.
Then, run each rule against all corresponding .md files and average the results.
Returns a tuple:
@ -49,11 +49,12 @@ def evaluate_candidate(
all_test_scores = [] # Store all individual test scores for bootstrapping
candidate_name = os.path.basename(candidate_folder)
# Map each PDF to its corresponding MD repeats (e.g., doc1_1.md, doc1_2.md, etc.)
# Map each PDF to its corresponding MD repeats (e.g., doc1_pg1_repeat1.md, doc1_pg2_repeat2.md, etc.)
pdf_to_md_files = {}
for pdf_name in pdf_basenames:
md_base = os.path.splitext(pdf_name)[0]
md_regex = re.compile(rf"^{re.escape(md_base)}_\d+\.md$")
# Updated regex for new format: {pdf_name}_pg<page>_repeat<repeat>.md
md_regex = re.compile(rf"^{re.escape(md_base)}_pg\d+_repeat\d+\.md$")
# List all files in the candidate folder and filter using regex
all_files = os.listdir(candidate_folder)
@ -62,7 +63,7 @@ def evaluate_candidate(
if not md_files and not force:
candidate_errors.append(
f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} "
f"(expected files matching {md_base}_*.md)."
f"(expected files matching {md_base}_pg{{page}}_repeat*.md)."
)
else:
pdf_to_md_files[pdf_name] = md_files
@ -72,7 +73,7 @@ def evaluate_candidate(
total_test_score = 0.0
# Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") so we get all its MD repeats.
# Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") and a specific page.
for test in all_tests:
test_type = test.type
if test_type not in test_type_breakdown:
@ -80,12 +81,19 @@ def evaluate_candidate(
pdf_name = test.pdf
md_base = os.path.splitext(pdf_name)[0]
md_files = pdf_to_md_files.get(pdf_name, [])
if not md_files:
continue # Should not occur due to earlier check.
# Filter MD files for the specific page corresponding to the test
page_md_files = [f for f in md_files if re.search(rf"_pg{test.page}_", os.path.basename(f))]
if not page_md_files:
candidate_errors.append(
f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} page {test.page} "
f"(expected files matching {md_base}_pg{test.page}_repeat*.md)."
)
continue
repeat_passes = 0
num_repeats = 0
explanations = []
for md_path in md_files:
for md_path in page_md_files:
num_repeats += 1
try:
with open(md_path, "r", encoding="utf-8") as f:
@ -110,8 +118,8 @@ def evaluate_candidate(
total_test_score += test_avg
if test_avg < 1.0:
test_failures.append(
f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
f"Ex: {explanations[0] if explanations else 'No explanation'}"
f"Test {test.id} on {md_base} page {test.page} average pass ratio: {test_avg:.3f} "
f"({repeat_passes}/{num_repeats} repeats passed). Ex: {explanations[0] if explanations else 'No explanation'}"
)
test_type_breakdown[test_type].append(test_avg)
@ -327,4 +335,4 @@ def main():
if __name__ == "__main__":
main()
main()

View File

@ -4,9 +4,10 @@ import glob
import importlib
import os
from functools import partial
from itertools import product
from tqdm import tqdm
from pypdf import PdfReader
def parse_method_arg(method_arg):
"""
@ -48,12 +49,12 @@ async def run_sync_in_executor(func, *args, **kwargs):
return await loop.run_in_executor(None, partial(func, *args, **kwargs))
async def process_pdf(pdf_path, method, kwargs, output_path, is_async):
async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async):
"""Process a single PDF and save the result to output_path"""
try:
if is_async:
# Run async function directly
markdown = await method(pdf_path, page_num=1, **kwargs)
markdown = await method(pdf_path, page_num=page_num, **kwargs)
else:
# Run synchronous function in the executor
markdown = await run_sync_in_executor(method, pdf_path, page_num=1, **kwargs)
@ -101,21 +102,25 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force, ma
task_descriptions = {}
for pdf_path in all_pdfs:
pdf = PdfReader(pdf_path)
num_pages = len(pdf.pages)
base_name = os.path.basename(pdf_path).replace(".pdf", "")
for i in range(1, repeats + 1):
output_filename = f"{base_name}_{i}.md"
output_path = os.path.join(candidate_output_dir, output_filename)
if os.path.exists(output_path) and not force:
print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
print("Rerun with --force flag to force regeneration")
continue
task = process_pdf(pdf_path, method, kwargs, output_path, is_async)
tasks.append(task)
task_descriptions[id(task)] = f"{base_name}_{i} ({candidate})"
for repeat in range(1, repeats + 1):
for page_num in range(1, num_pages + 1):
output_filename = f"{base_name}_pg{page_num}_repeat{repeat}.md"
output_path = os.path.join(candidate_output_dir, output_filename)
if os.path.exists(output_path) and not force:
print(f"Skipping {base_name}_pg{page_num}_repeat{repeat} for {candidate}, file already exists")
print("Rerun with --force flag to force regeneration")
continue
task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
tasks.append(task)
task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
# Process tasks with semaphore to limit concurrency
semaphore = asyncio.Semaphore(max_parallel or 1) # Default to 1 if not specified

View File

@ -26,6 +26,10 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
# Convert the first page of the PDF to a base64-encoded PNG image.
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
if not os.getenv("OPENAI_API_KEY"):
raise SystemExit("You must specify an OPENAI_API_KEY")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
response = client.chat.completions.create(

View File

@ -27,6 +27,9 @@ def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet
str: The OCR result in markdown format.
"""
if not os.getenv("ANTHROPIC_API_KEY"):
raise SystemExit("You must specify an ANTHROPIC_API_KEY")
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

View File

@ -24,6 +24,9 @@ def run_gemini(pdf_path: str, page_num: int = 1, model: str = "gemini-2.0-flash"
Returns:
str: The OCR result in markdown format.
"""
if not os.getenv("GEMINI_API_KEY"):
raise SystemExit("You must specify a GEMINI_API_KEY")
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
api_key = os.getenv("GEMINI_API_KEY")

View File

@ -15,6 +15,9 @@ def run_mistral(pdf_path: str, page_num: int = 1) -> str:
Returns:
str: The OCR result in markdown format.
"""
if not os.getenv("MISTRAL_API_KEY"):
raise SystemExit("You must specify a MISTRAL_API_KEY")
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

View File

@ -74,4 +74,4 @@
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"}
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"}
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "We also thank Ján Mináč for his constant encouragement and support."}
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"}
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_05", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"}

View File

@ -581,10 +581,13 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
tests.append(test)
except json.JSONDecodeError as e:
print(f"Error parsing JSON on line {line_number}: {e}")
raise
except (ValidationError, KeyError) as e:
print(f"Error on line {line_number}: {e}")
raise
except Exception as e:
print(f"Unexpected error on line {line_number}: {e}")
raise
return tests

View File

@ -83,6 +83,7 @@ bench = [
"google-genai",
"google-generativeai",
"playwright",
"mistralai",
]
train = [