Fixes for multipage runners

This commit is contained in:
Jake Poznanski 2025-03-12 10:29:49 -07:00
parent 743e48e4ad
commit 8b3a9e4201
9 changed files with 59 additions and 29 deletions

View File

@ -4,7 +4,7 @@ This script runs olmocr bench.
It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check. It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check.
It will then validate the JSON files to make sure they are all valid. It will then validate the JSON files to make sure they are all valid.
Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate. Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate.
We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _1.md, _2.md, etc.) We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _pg{page}_repeat{repeat}.md)
corresponding to its parse for every .pdf in the /pdfs folder. corresponding to its parse for every .pdf in the /pdfs folder.
Then, we will read each one, and check if they pass against all the rules. Then, we will read each one, and check if they pass against all the rules.
If a rule fails on some of the repeats, a short explanation is printed. If a rule fails on some of the repeats, a short explanation is printed.
@ -30,7 +30,7 @@ def evaluate_candidate(
) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]: ) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]:
""" """
For the candidate folder (pipeline tool output), validate that it contains at least one .md file For the candidate folder (pipeline tool output), validate that it contains at least one .md file
(i.e. repeated generations like _1.md, _2.md, etc.) for every PDF in the pdf folder. (i.e. repeated generations like _pg{page}_repeat{repeat}.md) for every PDF in the pdf folder.
Then, run each rule against all corresponding .md files and average the results. Then, run each rule against all corresponding .md files and average the results.
Returns a tuple: Returns a tuple:
@ -49,11 +49,12 @@ def evaluate_candidate(
all_test_scores = [] # Store all individual test scores for bootstrapping all_test_scores = [] # Store all individual test scores for bootstrapping
candidate_name = os.path.basename(candidate_folder) candidate_name = os.path.basename(candidate_folder)
# Map each PDF to its corresponding MD repeats (e.g., doc1_1.md, doc1_2.md, etc.) # Map each PDF to its corresponding MD repeats (e.g., doc1_pg1_repeat1.md, doc1_pg2_repeat2.md, etc.)
pdf_to_md_files = {} pdf_to_md_files = {}
for pdf_name in pdf_basenames: for pdf_name in pdf_basenames:
md_base = os.path.splitext(pdf_name)[0] md_base = os.path.splitext(pdf_name)[0]
md_regex = re.compile(rf"^{re.escape(md_base)}_\d+\.md$") # Updated regex for new format: {pdf_name}_pg<page>_repeat<repeat>.md
md_regex = re.compile(rf"^{re.escape(md_base)}_pg\d+_repeat\d+\.md$")
# List all files in the candidate folder and filter using regex # List all files in the candidate folder and filter using regex
all_files = os.listdir(candidate_folder) all_files = os.listdir(candidate_folder)
@ -62,7 +63,7 @@ def evaluate_candidate(
if not md_files and not force: if not md_files and not force:
candidate_errors.append( candidate_errors.append(
f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} " f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} "
f"(expected files matching {md_base}_*.md)." f"(expected files matching {md_base}_pg{{page}}_repeat*.md)."
) )
else: else:
pdf_to_md_files[pdf_name] = md_files pdf_to_md_files[pdf_name] = md_files
@ -72,7 +73,7 @@ def evaluate_candidate(
total_test_score = 0.0 total_test_score = 0.0
# Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") so we get all its MD repeats. # Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") and a specific page.
for test in all_tests: for test in all_tests:
test_type = test.type test_type = test.type
if test_type not in test_type_breakdown: if test_type not in test_type_breakdown:
@ -80,12 +81,19 @@ def evaluate_candidate(
pdf_name = test.pdf pdf_name = test.pdf
md_base = os.path.splitext(pdf_name)[0] md_base = os.path.splitext(pdf_name)[0]
md_files = pdf_to_md_files.get(pdf_name, []) md_files = pdf_to_md_files.get(pdf_name, [])
if not md_files: # Filter MD files for the specific page corresponding to the test
continue # Should not occur due to earlier check. page_md_files = [f for f in md_files if re.search(rf"_pg{test.page}_", os.path.basename(f))]
if not page_md_files:
candidate_errors.append(
f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} page {test.page} "
f"(expected files matching {md_base}_pg{test.page}_repeat*.md)."
)
continue
repeat_passes = 0 repeat_passes = 0
num_repeats = 0 num_repeats = 0
explanations = [] explanations = []
for md_path in md_files: for md_path in page_md_files:
num_repeats += 1 num_repeats += 1
try: try:
with open(md_path, "r", encoding="utf-8") as f: with open(md_path, "r", encoding="utf-8") as f:
@ -110,8 +118,8 @@ def evaluate_candidate(
total_test_score += test_avg total_test_score += test_avg
if test_avg < 1.0: if test_avg < 1.0:
test_failures.append( test_failures.append(
f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). " f"Test {test.id} on {md_base} page {test.page} average pass ratio: {test_avg:.3f} "
f"Ex: {explanations[0] if explanations else 'No explanation'}" f"({repeat_passes}/{num_repeats} repeats passed). Ex: {explanations[0] if explanations else 'No explanation'}"
) )
test_type_breakdown[test_type].append(test_avg) test_type_breakdown[test_type].append(test_avg)

View File

@ -4,9 +4,10 @@ import glob
import importlib import importlib
import os import os
from functools import partial from functools import partial
from itertools import product
from tqdm import tqdm from tqdm import tqdm
from pypdf import PdfReader
def parse_method_arg(method_arg): def parse_method_arg(method_arg):
""" """
@ -48,12 +49,12 @@ async def run_sync_in_executor(func, *args, **kwargs):
return await loop.run_in_executor(None, partial(func, *args, **kwargs)) return await loop.run_in_executor(None, partial(func, *args, **kwargs))
async def process_pdf(pdf_path, method, kwargs, output_path, is_async): async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async):
"""Process a single PDF and save the result to output_path""" """Process a single PDF and save the result to output_path"""
try: try:
if is_async: if is_async:
# Run async function directly # Run async function directly
markdown = await method(pdf_path, page_num=1, **kwargs) markdown = await method(pdf_path, page_num=page_num, **kwargs)
else: else:
# Run synchronous function in the executor # Run synchronous function in the executor
markdown = await run_sync_in_executor(method, pdf_path, page_num=1, **kwargs) markdown = await run_sync_in_executor(method, pdf_path, page_num=page_num, **kwargs)
@ -101,20 +102,24 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force, ma
task_descriptions = {} task_descriptions = {}
for pdf_path in all_pdfs: for pdf_path in all_pdfs:
pdf = PdfReader(pdf_path)
num_pages = len(pdf.pages)
base_name = os.path.basename(pdf_path).replace(".pdf", "") base_name = os.path.basename(pdf_path).replace(".pdf", "")
for i in range(1, repeats + 1): for repeat in range(1, repeats + 1):
output_filename = f"{base_name}_{i}.md" for page_num in range(1, num_pages + 1):
output_filename = f"{base_name}_pg{page_num}_repeat{repeat}.md"
output_path = os.path.join(candidate_output_dir, output_filename) output_path = os.path.join(candidate_output_dir, output_filename)
if os.path.exists(output_path) and not force: if os.path.exists(output_path) and not force:
print(f"Skipping {base_name}_{i} for {candidate}, file already exists") print(f"Skipping {base_name}_pg{page_num}_repeat{repeat} for {candidate}, file already exists")
print("Rerun with --force flag to force regeneration") print("Rerun with --force flag to force regeneration")
continue continue
task = process_pdf(pdf_path, method, kwargs, output_path, is_async) task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
tasks.append(task) tasks.append(task)
task_descriptions[id(task)] = f"{base_name}_{i} ({candidate})" task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
# Process tasks with semaphore to limit concurrency # Process tasks with semaphore to limit concurrency
semaphore = asyncio.Semaphore(max_parallel or 1) # Default to 1 if not specified semaphore = asyncio.Semaphore(max_parallel or 1) # Default to 1 if not specified

View File

@ -26,6 +26,10 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
# Convert the first page of the PDF to a base64-encoded PNG image. # Convert the first page of the PDF to a base64-encoded PNG image.
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048) image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
if not os.getenv("OPENAI_API_KEY"):
raise SystemExit("You must specify an OPENAI_API_KEY")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
response = client.chat.completions.create( response = client.chat.completions.create(

View File

@ -27,6 +27,9 @@ def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet
str: The OCR result in markdown format. str: The OCR result in markdown format.
""" """
if not os.getenv("ANTHROPIC_API_KEY"):
raise SystemExit("You must specify an ANTHROPIC_API_KEY")
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048) image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

View File

@ -24,6 +24,9 @@ def run_gemini(pdf_path: str, page_num: int = 1, model: str = "gemini-2.0-flash"
Returns: Returns:
str: The OCR result in markdown format. str: The OCR result in markdown format.
""" """
if not os.getenv("GEMINI_API_KEY"):
raise SystemExit("You must specify a GEMINI_API_KEY")
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048) image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport") anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
api_key = os.getenv("GEMINI_API_KEY") api_key = os.getenv("GEMINI_API_KEY")

View File

@ -15,6 +15,9 @@ def run_mistral(pdf_path: str, page_num: int = 1) -> str:
Returns: Returns:
str: The OCR result in markdown format. str: The OCR result in markdown format.
""" """
if not os.getenv("MISTRAL_API_KEY"):
raise SystemExit("You must specify a MISTRAL_API_KEY")
api_key = os.environ["MISTRAL_API_KEY"] api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key) client = Mistral(api_key=api_key)

View File

@ -74,4 +74,4 @@
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"} {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"}
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"} {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"}
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "We also thank Ján Mináč for his constant encouragement and support."} {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "We also thank Ján Mináč for his constant encouragement and support."}
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"} {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_05", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"}

View File

@ -581,10 +581,13 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
tests.append(test) tests.append(test)
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
print(f"Error parsing JSON on line {line_number}: {e}") print(f"Error parsing JSON on line {line_number}: {e}")
raise
except (ValidationError, KeyError) as e: except (ValidationError, KeyError) as e:
print(f"Error on line {line_number}: {e}") print(f"Error on line {line_number}: {e}")
raise
except Exception as e: except Exception as e:
print(f"Unexpected error on line {line_number}: {e}") print(f"Unexpected error on line {line_number}: {e}")
raise
return tests return tests

View File

@ -83,6 +83,7 @@ bench = [
"google-genai", "google-genai",
"google-generativeai", "google-generativeai",
"playwright", "playwright",
"mistralai",
] ]
train = [ train = [