mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-26 00:40:31 +00:00
Fixes for multipage runners
This commit is contained in:
parent
743e48e4ad
commit
8b3a9e4201
@ -4,7 +4,7 @@ This script runs olmocr bench.
|
|||||||
It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check.
|
It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check.
|
||||||
It will then validate the JSON files to make sure they are all valid.
|
It will then validate the JSON files to make sure they are all valid.
|
||||||
Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate.
|
Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate.
|
||||||
We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _1.md, _2.md, etc.)
|
We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _pg{page}_repeat{repeat}.md)
|
||||||
corresponding to its parse for every .pdf in the /pdfs folder.
|
corresponding to its parse for every .pdf in the /pdfs folder.
|
||||||
Then, we will read each one, and check if they pass against all the rules.
|
Then, we will read each one, and check if they pass against all the rules.
|
||||||
If a rule fails on some of the repeats, a short explanation is printed.
|
If a rule fails on some of the repeats, a short explanation is printed.
|
||||||
@ -30,7 +30,7 @@ def evaluate_candidate(
|
|||||||
) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]:
|
) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]:
|
||||||
"""
|
"""
|
||||||
For the candidate folder (pipeline tool output), validate that it contains at least one .md file
|
For the candidate folder (pipeline tool output), validate that it contains at least one .md file
|
||||||
(i.e. repeated generations like _1.md, _2.md, etc.) for every PDF in the pdf folder.
|
(e.g. repeated generations named {pdf_name}_pg{page}_repeat{repeat}.md) for every PDF in the pdf folder.
|
||||||
Then, run each rule against all corresponding .md files and average the results.
|
Then, run each rule against all corresponding .md files and average the results.
|
||||||
|
|
||||||
Returns a tuple:
|
Returns a tuple:
|
||||||
@ -49,11 +49,12 @@ def evaluate_candidate(
|
|||||||
all_test_scores = [] # Store all individual test scores for bootstrapping
|
all_test_scores = [] # Store all individual test scores for bootstrapping
|
||||||
candidate_name = os.path.basename(candidate_folder)
|
candidate_name = os.path.basename(candidate_folder)
|
||||||
|
|
||||||
# Map each PDF to its corresponding MD repeats (e.g., doc1_1.md, doc1_2.md, etc.)
|
# Map each PDF to its corresponding MD repeats (e.g., doc1_pg1_repeat1.md, doc1_pg2_repeat2.md, etc.)
|
||||||
pdf_to_md_files = {}
|
pdf_to_md_files = {}
|
||||||
for pdf_name in pdf_basenames:
|
for pdf_name in pdf_basenames:
|
||||||
md_base = os.path.splitext(pdf_name)[0]
|
md_base = os.path.splitext(pdf_name)[0]
|
||||||
md_regex = re.compile(rf"^{re.escape(md_base)}_\d+\.md$")
|
# Updated regex for new format: {pdf_name}_pg<page>_repeat<repeat>.md
|
||||||
|
md_regex = re.compile(rf"^{re.escape(md_base)}_pg\d+_repeat\d+\.md$")
|
||||||
|
|
||||||
# List all files in the candidate folder and filter using regex
|
# List all files in the candidate folder and filter using regex
|
||||||
all_files = os.listdir(candidate_folder)
|
all_files = os.listdir(candidate_folder)
|
||||||
@ -62,7 +63,7 @@ def evaluate_candidate(
|
|||||||
if not md_files and not force:
|
if not md_files and not force:
|
||||||
candidate_errors.append(
|
candidate_errors.append(
|
||||||
f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} "
|
f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} "
|
||||||
f"(expected files matching {md_base}_*.md)."
|
f"(expected files matching {md_base}_pg{{page}}_repeat*.md)."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
pdf_to_md_files[pdf_name] = md_files
|
pdf_to_md_files[pdf_name] = md_files
|
||||||
@ -72,7 +73,7 @@ def evaluate_candidate(
|
|||||||
|
|
||||||
total_test_score = 0.0
|
total_test_score = 0.0
|
||||||
|
|
||||||
# Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") so we get all its MD repeats.
|
# Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") and a specific page.
|
||||||
for test in all_tests:
|
for test in all_tests:
|
||||||
test_type = test.type
|
test_type = test.type
|
||||||
if test_type not in test_type_breakdown:
|
if test_type not in test_type_breakdown:
|
||||||
@ -80,12 +81,19 @@ def evaluate_candidate(
|
|||||||
pdf_name = test.pdf
|
pdf_name = test.pdf
|
||||||
md_base = os.path.splitext(pdf_name)[0]
|
md_base = os.path.splitext(pdf_name)[0]
|
||||||
md_files = pdf_to_md_files.get(pdf_name, [])
|
md_files = pdf_to_md_files.get(pdf_name, [])
|
||||||
if not md_files:
|
# Filter MD files for the specific page corresponding to the test
|
||||||
continue # Should not occur due to earlier check.
|
page_md_files = [f for f in md_files if re.search(rf"_pg{test.page}_", os.path.basename(f))]
|
||||||
|
if not page_md_files:
|
||||||
|
candidate_errors.append(
|
||||||
|
f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} page {test.page} "
|
||||||
|
f"(expected files matching {md_base}_pg{test.page}_repeat*.md)."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
repeat_passes = 0
|
repeat_passes = 0
|
||||||
num_repeats = 0
|
num_repeats = 0
|
||||||
explanations = []
|
explanations = []
|
||||||
for md_path in md_files:
|
for md_path in page_md_files:
|
||||||
num_repeats += 1
|
num_repeats += 1
|
||||||
try:
|
try:
|
||||||
with open(md_path, "r", encoding="utf-8") as f:
|
with open(md_path, "r", encoding="utf-8") as f:
|
||||||
@ -110,8 +118,8 @@ def evaluate_candidate(
|
|||||||
total_test_score += test_avg
|
total_test_score += test_avg
|
||||||
if test_avg < 1.0:
|
if test_avg < 1.0:
|
||||||
test_failures.append(
|
test_failures.append(
|
||||||
f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
|
f"Test {test.id} on {md_base} page {test.page} average pass ratio: {test_avg:.3f} "
|
||||||
f"Ex: {explanations[0] if explanations else 'No explanation'}"
|
f"({repeat_passes}/{num_repeats} repeats passed). Ex: {explanations[0] if explanations else 'No explanation'}"
|
||||||
)
|
)
|
||||||
test_type_breakdown[test_type].append(test_avg)
|
test_type_breakdown[test_type].append(test_avg)
|
||||||
|
|
||||||
@ -327,4 +335,4 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
@ -4,9 +4,10 @@ import glob
|
|||||||
import importlib
|
import importlib
|
||||||
import os
|
import os
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from itertools import product
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
def parse_method_arg(method_arg):
|
def parse_method_arg(method_arg):
|
||||||
"""
|
"""
|
||||||
@ -48,12 +49,12 @@ async def run_sync_in_executor(func, *args, **kwargs):
|
|||||||
return await loop.run_in_executor(None, partial(func, *args, **kwargs))
|
return await loop.run_in_executor(None, partial(func, *args, **kwargs))
|
||||||
|
|
||||||
|
|
||||||
async def process_pdf(pdf_path, method, kwargs, output_path, is_async):
|
async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async):
|
||||||
"""Process a single PDF and save the result to output_path"""
|
"""Process a single page of a PDF and save the result to output_path"""
|
||||||
try:
|
try:
|
||||||
if is_async:
|
if is_async:
|
||||||
# Run async function directly
|
# Run async function directly
|
||||||
markdown = await method(pdf_path, page_num=1, **kwargs)
|
markdown = await method(pdf_path, page_num=page_num, **kwargs)
|
||||||
else:
|
else:
|
||||||
# Run synchronous function in the executor
|
# Run synchronous function in the executor
|
||||||
markdown = await run_sync_in_executor(method, pdf_path, page_num=1, **kwargs)
|
markdown = await run_sync_in_executor(method, pdf_path, page_num=1, **kwargs)
|
||||||
@ -101,21 +102,25 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force, ma
|
|||||||
task_descriptions = {}
|
task_descriptions = {}
|
||||||
|
|
||||||
for pdf_path in all_pdfs:
|
for pdf_path in all_pdfs:
|
||||||
|
pdf = PdfReader(pdf_path)
|
||||||
|
num_pages = len(pdf.pages)
|
||||||
|
|
||||||
base_name = os.path.basename(pdf_path).replace(".pdf", "")
|
base_name = os.path.basename(pdf_path).replace(".pdf", "")
|
||||||
|
|
||||||
for i in range(1, repeats + 1):
|
for repeat in range(1, repeats + 1):
|
||||||
output_filename = f"{base_name}_{i}.md"
|
for page_num in range(1, num_pages + 1):
|
||||||
output_path = os.path.join(candidate_output_dir, output_filename)
|
output_filename = f"{base_name}_pg{page_num}_repeat{repeat}.md"
|
||||||
|
output_path = os.path.join(candidate_output_dir, output_filename)
|
||||||
if os.path.exists(output_path) and not force:
|
|
||||||
print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
|
if os.path.exists(output_path) and not force:
|
||||||
print("Rerun with --force flag to force regeneration")
|
print(f"Skipping {base_name}_pg{page_num}_repeat{repeat} for {candidate}, file already exists")
|
||||||
continue
|
print("Rerun with --force flag to force regeneration")
|
||||||
|
continue
|
||||||
task = process_pdf(pdf_path, method, kwargs, output_path, is_async)
|
|
||||||
tasks.append(task)
|
task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
|
||||||
task_descriptions[id(task)] = f"{base_name}_{i} ({candidate})"
|
tasks.append(task)
|
||||||
|
task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
|
||||||
|
|
||||||
# Process tasks with semaphore to limit concurrency
|
# Process tasks with semaphore to limit concurrency
|
||||||
semaphore = asyncio.Semaphore(max_parallel or 1) # Default to 1 if not specified
|
semaphore = asyncio.Semaphore(max_parallel or 1) # Default to 1 if not specified
|
||||||
|
|
||||||
|
@ -26,6 +26,10 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
|
|||||||
# Convert the first page of the PDF to a base64-encoded PNG image.
|
# Convert the requested page of the PDF to a base64-encoded PNG image.
|
||||||
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
|
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
|
||||||
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
|
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
|
||||||
|
|
||||||
|
if not os.getenv("OPENAI_API_KEY"):
|
||||||
|
raise SystemExit("You must specify an OPENAI_API_KEY")
|
||||||
|
|
||||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
|
@ -27,6 +27,9 @@ def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet
|
|||||||
str: The OCR result in markdown format.
|
str: The OCR result in markdown format.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not os.getenv("ANTHROPIC_API_KEY"):
|
||||||
|
raise SystemExit("You must specify an ANTHROPIC_API_KEY")
|
||||||
|
|
||||||
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
|
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
|
||||||
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
|
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
|
||||||
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
|
||||||
|
@ -24,6 +24,9 @@ def run_gemini(pdf_path: str, page_num: int = 1, model: str = "gemini-2.0-flash"
|
|||||||
Returns:
|
Returns:
|
||||||
str: The OCR result in markdown format.
|
str: The OCR result in markdown format.
|
||||||
"""
|
"""
|
||||||
|
if not os.getenv("GEMINI_API_KEY"):
|
||||||
|
raise SystemExit("You must specify an GEMINI_API_KEY")
|
||||||
|
|
||||||
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
|
image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
|
||||||
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
|
anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
|
||||||
api_key = os.getenv("GEMINI_API_KEY")
|
api_key = os.getenv("GEMINI_API_KEY")
|
||||||
|
@ -15,6 +15,9 @@ def run_mistral(pdf_path: str, page_num: int = 1) -> str:
|
|||||||
Returns:
|
Returns:
|
||||||
str: The OCR result in markdown format.
|
str: The OCR result in markdown format.
|
||||||
"""
|
"""
|
||||||
|
if not os.getenv("MISTRAL_API_KEY"):
|
||||||
|
raise SystemExit("You must specify an MISTRAL_API_KEY")
|
||||||
|
|
||||||
api_key = os.environ["MISTRAL_API_KEY"]
|
api_key = os.environ["MISTRAL_API_KEY"]
|
||||||
client = Mistral(api_key=api_key)
|
client = Mistral(api_key=api_key)
|
||||||
|
|
||||||
|
@ -74,4 +74,4 @@
|
|||||||
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"}
|
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"}
|
||||||
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"}
|
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"}
|
||||||
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "We also thank Ján Mináč for his constant encouragement and support."}
|
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "We also thank Ján Mináč for his constant encouragement and support."}
|
||||||
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"}
|
{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_05", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"}
|
||||||
|
@ -581,10 +581,13 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]:
|
|||||||
tests.append(test)
|
tests.append(test)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
print(f"Error parsing JSON on line {line_number}: {e}")
|
print(f"Error parsing JSON on line {line_number}: {e}")
|
||||||
|
raise
|
||||||
except (ValidationError, KeyError) as e:
|
except (ValidationError, KeyError) as e:
|
||||||
print(f"Error on line {line_number}: {e}")
|
print(f"Error on line {line_number}: {e}")
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Unexpected error on line {line_number}: {e}")
|
print(f"Unexpected error on line {line_number}: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
return tests
|
return tests
|
||||||
|
|
||||||
|
@ -83,6 +83,7 @@ bench = [
|
|||||||
"google-genai",
|
"google-genai",
|
||||||
"google-generativeai",
|
"google-generativeai",
|
||||||
"playwright",
|
"playwright",
|
||||||
|
"mistralai",
|
||||||
]
|
]
|
||||||
|
|
||||||
train = [
|
train = [
|
||||||
|
Loading…
x
Reference in New Issue
Block a user