Mirror of https://github.com/allenai/olmocr.git (synced 2025-09-25 16:30:28 +00:00)
Fixes for multipage runners
parent 743e48e4ad
commit 8b3a9e4201
@@ -4,7 +4,7 @@ This script runs olmocr bench.
 It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check.
 It will then validate the JSON files to make sure they are all valid.
 Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate.
-We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _1.md, _2.md, etc.)
+We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _pg{page}_repeat{repeat}.md)
 corresponding to its parse for every .pdf in the /pdfs folder.
 Then, we will read each one, and check if they pass against all the rules.
 If a rule fails on some of the repeats, a short explanation is printed.
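To make the new naming scheme concrete, here is a minimal sketch of the per-page, per-repeat filenames the bench now expects in each candidate folder. The PDF name, page count, and repeat count below are hypothetical, chosen only for illustration:

```python
import os

# Hypothetical inputs: one PDF with 2 pages, 2 repeated generations per page.
pdf_name = "doc1.pdf"
num_pages = 2
num_repeats = 2

base = os.path.splitext(pdf_name)[0]
expected = [
    f"{base}_pg{page}_repeat{repeat}.md"
    for page in range(1, num_pages + 1)
    for repeat in range(1, num_repeats + 1)
]
print(expected)
# ['doc1_pg1_repeat1.md', 'doc1_pg1_repeat2.md', 'doc1_pg2_repeat1.md', 'doc1_pg2_repeat2.md']
```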
@@ -30,7 +30,7 @@ def evaluate_candidate(
 ) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]:
     """
     For the candidate folder (pipeline tool output), validate that it contains at least one .md file
-    (i.e. repeated generations like _1.md, _2.md, etc.) for every PDF in the pdf folder.
+    (i.e. repeated generations like _pg{page}_repeat{repeat}.md) for every PDF in the pdf folder.
     Then, run each rule against all corresponding .md files and average the results.
 
     Returns a tuple:
@@ -49,11 +49,12 @@ def evaluate_candidate(
     all_test_scores = []  # Store all individual test scores for bootstrapping
     candidate_name = os.path.basename(candidate_folder)
 
-    # Map each PDF to its corresponding MD repeats (e.g., doc1_1.md, doc1_2.md, etc.)
+    # Map each PDF to its corresponding MD repeats (e.g., doc1_pg1_repeat1.md, doc1_pg2_repeat2.md, etc.)
     pdf_to_md_files = {}
     for pdf_name in pdf_basenames:
         md_base = os.path.splitext(pdf_name)[0]
-        md_regex = re.compile(rf"^{re.escape(md_base)}_\d+\.md$")
+        # Updated regex for new format: {pdf_name}_pg<page>_repeat<repeat>.md
+        md_regex = re.compile(rf"^{re.escape(md_base)}_pg\d+_repeat\d+\.md$")
 
         # List all files in the candidate folder and filter using regex
         all_files = os.listdir(candidate_folder)
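A quick, self-contained check of the updated pattern; the filenames below are hypothetical and only show that the regex accepts the new _pg{page}_repeat{repeat} form while rejecting the old _N.md form:

```python
import re

md_base = "doc1"  # hypothetical PDF basename
md_regex = re.compile(rf"^{re.escape(md_base)}_pg\d+_repeat\d+\.md$")

print(bool(md_regex.match("doc1_pg1_repeat1.md")))   # True  (new format)
print(bool(md_regex.match("doc1_pg12_repeat3.md")))  # True
print(bool(md_regex.match("doc1_1.md")))             # False (old format)
```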
@@ -62,7 +63,7 @@ def evaluate_candidate(
         if not md_files and not force:
             candidate_errors.append(
                 f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} "
-                f"(expected files matching {md_base}_*.md)."
+                f"(expected files matching {md_base}_pg{{page}}_repeat*.md)."
             )
         else:
             pdf_to_md_files[pdf_name] = md_files
@@ -72,7 +73,7 @@ def evaluate_candidate(
 
     total_test_score = 0.0
 
-    # Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") so we get all its MD repeats.
+    # Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") and a specific page.
     for test in all_tests:
         test_type = test.type
         if test_type not in test_type_breakdown:
@@ -80,12 +81,19 @@ def evaluate_candidate(
         pdf_name = test.pdf
         md_base = os.path.splitext(pdf_name)[0]
         md_files = pdf_to_md_files.get(pdf_name, [])
         if not md_files:
             continue  # Should not occur due to earlier check.
+        # Filter MD files for the specific page corresponding to the test
+        page_md_files = [f for f in md_files if re.search(rf"_pg{test.page}_", os.path.basename(f))]
+        if not page_md_files:
+            candidate_errors.append(
+                f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} page {test.page} "
+                f"(expected files matching {md_base}_pg{test.page}_repeat*.md)."
+            )
+            continue
 
         repeat_passes = 0
         num_repeats = 0
         explanations = []
-        for md_path in md_files:
+        for md_path in page_md_files:
             num_repeats += 1
             try:
                 with open(md_path, "r", encoding="utf-8") as f:
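A minimal sketch of the per-page filtering step added above, using a hypothetical file list and test page; the substring search keys on the _pg{page}_ segment of each repeat filename:

```python
import os
import re

md_files = [  # hypothetical repeats for one PDF
    "candidate/doc1_pg1_repeat1.md",
    "candidate/doc1_pg1_repeat2.md",
    "candidate/doc1_pg2_repeat1.md",
]
test_page = 2  # hypothetical test.page value

page_md_files = [f for f in md_files if re.search(rf"_pg{test_page}_", os.path.basename(f))]
print(page_md_files)  # ['candidate/doc1_pg2_repeat1.md']
```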
@@ -110,8 +118,8 @@ def evaluate_candidate(
         total_test_score += test_avg
         if test_avg < 1.0:
             test_failures.append(
-                f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
-                f"Ex: {explanations[0] if explanations else 'No explanation'}"
+                f"Test {test.id} on {md_base} page {test.page} average pass ratio: {test_avg:.3f} "
+                f"({repeat_passes}/{num_repeats} repeats passed). Ex: {explanations[0] if explanations else 'No explanation'}"
             )
         test_type_breakdown[test_type].append(test_avg)
 
@@ -327,4 +335,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
@@ -4,9 +4,10 @@ import glob
 import importlib
 import os
 from functools import partial
 from itertools import product
 
 from tqdm import tqdm
+from pypdf import PdfReader
 
 def parse_method_arg(method_arg):
     """
@@ -48,12 +49,12 @@ async def run_sync_in_executor(func, *args, **kwargs):
     return await loop.run_in_executor(None, partial(func, *args, **kwargs))
 
 
-async def process_pdf(pdf_path, method, kwargs, output_path, is_async):
+async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async):
     """Process a single PDF and save the result to output_path"""
     try:
         if is_async:
             # Run async function directly
-            markdown = await method(pdf_path, page_num=1, **kwargs)
+            markdown = await method(pdf_path, page_num=page_num, **kwargs)
         else:
             # Run synchronous function in the executor
             markdown = await run_sync_in_executor(method, pdf_path, page_num=1, **kwargs)
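For context, a self-contained sketch of the executor pattern used by run_sync_in_executor, which lets a blocking runner be awaited alongside native async ones; slow_ocr and its arguments are stand-ins, not code from the repo:

```python
import asyncio
import time
from functools import partial

def slow_ocr(pdf_path, page_num=1):
    # Stand-in for a blocking, synchronous runner.
    time.sleep(0.1)
    return f"# markdown for {pdf_path} page {page_num}"

async def run_sync_in_executor(func, *args, **kwargs):
    # Run the blocking callable on the default thread pool so the event loop stays free.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partial(func, *args, **kwargs))

async def main():
    markdown = await run_sync_in_executor(slow_ocr, "doc1.pdf", page_num=2)
    print(markdown)

asyncio.run(main())
```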
@@ -101,21 +102,25 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force, ma
     task_descriptions = {}
 
     for pdf_path in all_pdfs:
+        pdf = PdfReader(pdf_path)
+        num_pages = len(pdf.pages)
+
         base_name = os.path.basename(pdf_path).replace(".pdf", "")
 
-        for i in range(1, repeats + 1):
-            output_filename = f"{base_name}_{i}.md"
-            output_path = os.path.join(candidate_output_dir, output_filename)
-
-            if os.path.exists(output_path) and not force:
-                print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
-                print("Rerun with --force flag to force regeneration")
-                continue
-
-            task = process_pdf(pdf_path, method, kwargs, output_path, is_async)
-            tasks.append(task)
-            task_descriptions[id(task)] = f"{base_name}_{i} ({candidate})"
+        for repeat in range(1, repeats + 1):
+            for page_num in range(1, num_pages + 1):
+                output_filename = f"{base_name}_pg{page_num}_repeat{repeat}.md"
+                output_path = os.path.join(candidate_output_dir, output_filename)
+
+                if os.path.exists(output_path) and not force:
+                    print(f"Skipping {base_name}_pg{page_num}_repeat{repeat} for {candidate}, file already exists")
+                    print("Rerun with --force flag to force regeneration")
+                    continue
+
+                task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
+                tasks.append(task)
+                task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
 
     # Process tasks with semaphore to limit concurrency
     semaphore = asyncio.Semaphore(max_parallel or 1)  # Default to 1 if not specified
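The tasks built above are then gated by an asyncio.Semaphore. A minimal, self-contained sketch of that concurrency pattern follows; the worker coroutine, names, and limit are illustrative, not the repo's exact code:

```python
import asyncio

async def process_one(name):
    # Stand-in for a single page/repeat conversion task.
    await asyncio.sleep(0.1)
    return name

async def run_all(names, max_parallel=2):
    semaphore = asyncio.Semaphore(max_parallel or 1)

    async def bounded(name):
        # At most max_parallel tasks run inside this block at once.
        async with semaphore:
            return await process_one(name)

    return await asyncio.gather(*(bounded(n) for n in names))

names = [f"doc1_pg{p}_repeat{r}" for p in (1, 2) for r in (1, 2)]
print(asyncio.run(run_all(names)))
```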
@@ -26,6 +26,10 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
     # Convert the first page of the PDF to a base64-encoded PNG image.
     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
+
+    if not os.getenv("OPENAI_API_KEY"):
+        raise SystemExit("You must specify an OPENAI_API_KEY")
+
     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
     response = client.chat.completions.create(
@@ -27,6 +27,9 @@ def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet
         str: The OCR result in markdown format.
     """
 
+    if not os.getenv("ANTHROPIC_API_KEY"):
+        raise SystemExit("You must specify an ANTHROPIC_API_KEY")
+
     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
     client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
@@ -24,6 +24,9 @@ def run_gemini(pdf_path: str, page_num: int = 1, model: str = "gemini-2.0-flash"
     Returns:
         str: The OCR result in markdown format.
     """
+    if not os.getenv("GEMINI_API_KEY"):
+        raise SystemExit("You must specify an GEMINI_API_KEY")
+
     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
     api_key = os.getenv("GEMINI_API_KEY")
@@ -15,6 +15,9 @@ def run_mistral(pdf_path: str, page_num: int = 1) -> str:
     Returns:
         str: The OCR result in markdown format.
     """
+    if not os.getenv("MISTRAL_API_KEY"):
+        raise SystemExit("You must specify an MISTRAL_API_KEY")
+
     api_key = os.environ["MISTRAL_API_KEY"]
     client = Mistral(api_key=api_key)
 
@@ -74,4 +74,4 @@
 {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"}
 {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"}
 {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "We also thank Ján Mináč for his constant encouragement and support."}
-{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"}
+{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_05", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"}
@@ -581,10 +581,13 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
             tests.append(test)
         except json.JSONDecodeError as e:
             print(f"Error parsing JSON on line {line_number}: {e}")
+            raise
         except (ValidationError, KeyError) as e:
             print(f"Error on line {line_number}: {e}")
+            raise
         except Exception as e:
             print(f"Unexpected error on line {line_number}: {e}")
+            raise
 
     return tests
 
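The added raise statements keep the line-number diagnostics but stop the loader from silently skipping bad test definitions. A minimal sketch of that log-then-reraise pattern, using a hypothetical malformed JSONL line:

```python
import json

line_number = 3  # hypothetical position in the .jsonl file
bad_line = '{"pdf": "doc1.pdf", "page": 1,'  # hypothetical truncated JSON

def parse_line(line):
    try:
        return json.loads(line)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON on line {line_number}: {e}")
        raise  # propagate so the caller fails fast instead of skipping the test

try:
    parse_line(bad_line)
except json.JSONDecodeError:
    print("error propagated to the caller as expected")
```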
@@ -83,6 +83,7 @@ bench = [
     "google-genai",
     "google-generativeai",
     "playwright",
+    "mistralai",
 ]
 
 train = [