diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py
index bcf9cb1..4264478 100644
--- a/olmocr/bench/benchmark.py
+++ b/olmocr/bench/benchmark.py
@@ -4,7 +4,7 @@ This script runs olmocr bench.
 It will take as an argument a folder, and scan it for .jsonl files which contain the various rules and properties that we will check.
 It will then validate the JSON files to make sure they are all valid.
 Then, each other folder in there (besides /pdfs) represents a pipeline tool that we will evaluate.
-We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _1.md, _2.md, etc.)
+We will validate that each one of those contains at least one .md file (or repeated generations, e.g. _pg{page}_repeat{repeat}.md)
 corresponding to its parse for every .pdf in the /pdfs folder.
 Then, we will read each one, and check if they pass against all the rules.
 If a rule fails on some of the repeats, a short explanation is printed.
@@ -30,7 +30,7 @@ def evaluate_candidate(
 ) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]:
     """
     For the candidate folder (pipeline tool output), validate that it contains at least one .md file
-    (i.e. repeated generations like _1.md, _2.md, etc.) for every PDF in the pdf folder.
+    (i.e. repeated generations like _pg{page}_repeat{repeat}.md) for every PDF in the pdf folder.
     Then, run each rule against all corresponding .md files and average the results.
 
     Returns a tuple:
@@ -49,11 +49,12 @@ def evaluate_candidate(
     all_test_scores = []  # Store all individual test scores for bootstrapping
     candidate_name = os.path.basename(candidate_folder)
 
-    # Map each PDF to its corresponding MD repeats (e.g., doc1_1.md, doc1_2.md, etc.)
+    # Map each PDF to its corresponding MD repeats (e.g., doc1_pg1_repeat1.md, doc1_pg1_repeat2.md, etc.)
     pdf_to_md_files = {}
     for pdf_name in pdf_basenames:
         md_base = os.path.splitext(pdf_name)[0]
-        md_regex = re.compile(rf"^{re.escape(md_base)}_\d+\.md$")
+        # Updated regex for the new format: {pdf_name}_pg{page}_repeat{repeat}.md
+        md_regex = re.compile(rf"^{re.escape(md_base)}_pg\d+_repeat\d+\.md$")
 
         # List all files in the candidate folder and filter using regex
         all_files = os.listdir(candidate_folder)
@@ -62,7 +63,7 @@ def evaluate_candidate(
         if not md_files and not force:
             candidate_errors.append(
                 f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} "
-                f"(expected files matching {md_base}_*.md)."
+                f"(expected files matching {md_base}_pg{{page}}_repeat*.md)."
             )
         else:
             pdf_to_md_files[pdf_name] = md_files
@@ -72,7 +73,7 @@ def evaluate_candidate(
 
     total_test_score = 0.0
 
-    # Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") so we get all its MD repeats.
+    # Evaluate each test. Each test references a PDF (e.g., "doc1.pdf") and a specific page.
     for test in all_tests:
         test_type = test.type
         if test_type not in test_type_breakdown:
@@ -80,12 +81,19 @@ def evaluate_candidate(
         pdf_name = test.pdf
         md_base = os.path.splitext(pdf_name)[0]
         md_files = pdf_to_md_files.get(pdf_name, [])
-        if not md_files:
-            continue  # Should not occur due to earlier check.
+        # Filter MD files for the specific page corresponding to the test
+        page_md_files = [f for f in md_files if re.search(rf"_pg{test.page}_", os.path.basename(f))]
+        if not page_md_files:
+            candidate_errors.append(
+                f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} page {test.page} "
+                f"(expected files matching {md_base}_pg{test.page}_repeat*.md)."
+            )
+            continue
+
         repeat_passes = 0
         num_repeats = 0
         explanations = []
-        for md_path in md_files:
+        for md_path in page_md_files:
             num_repeats += 1
             try:
                 with open(md_path, "r", encoding="utf-8") as f:
@@ -110,8 +118,8 @@ def evaluate_candidate(
         total_test_score += test_avg
         if test_avg < 1.0:
             test_failures.append(
-                f"Test {test.id} on {md_base} average pass ratio: {test_avg:.3f} ({repeat_passes}/{num_repeats} repeats passed). "
-                f"Ex: {explanations[0] if explanations else 'No explanation'}"
+                f"Test {test.id} on {md_base} page {test.page} average pass ratio: {test_avg:.3f} "
+                f"({repeat_passes}/{num_repeats} repeats passed). Ex: {explanations[0] if explanations else 'No explanation'}"
             )
         test_type_breakdown[test_type].append(test_avg)
 
@@ -327,4 +335,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
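As a quick sanity check on the naming scheme above, here is a minimal sketch (not part of the diff; the filenames are hypothetical) of how the updated regex pairs MD repeats with a PDF and how the per-page filter then narrows them:

```python
import re

md_base = "doc1"  # hypothetical PDF basename
md_regex = re.compile(rf"^{re.escape(md_base)}_pg\d+_repeat\d+\.md$")

files = ["doc1_pg1_repeat1.md", "doc1_pg1_repeat2.md", "doc1_pg2_repeat1.md", "doc1_1.md"]

# Only new-style names match; the old "_1.md" style is rejected.
md_files = [f for f in files if md_regex.match(f)]
assert md_files == ["doc1_pg1_repeat1.md", "doc1_pg1_repeat2.md", "doc1_pg2_repeat1.md"]

# evaluate_candidate then narrows to the page a given test targets:
page = 1
page_md_files = [f for f in md_files if re.search(rf"_pg{page}_", f)]
assert page_md_files == ["doc1_pg1_repeat1.md", "doc1_pg1_repeat2.md"]
```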
diff --git a/olmocr/bench/convert.py b/olmocr/bench/convert.py
index f990da6..dae3dcb 100644
--- a/olmocr/bench/convert.py
+++ b/olmocr/bench/convert.py
@@ -4,9 +4,10 @@
 import glob
 import importlib
 import os
 from functools import partial
+from itertools import product
 
 from tqdm import tqdm
-
+from pypdf import PdfReader
 
 def parse_method_arg(method_arg):
     """
@@ -48,12 +49,12 @@ async def run_sync_in_executor(func, *args, **kwargs):
     return await loop.run_in_executor(None, partial(func, *args, **kwargs))
 
 
-async def process_pdf(pdf_path, method, kwargs, output_path, is_async):
+async def process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async):
     """Process a single PDF and save the result to output_path"""
     try:
         if is_async:
             # Run async function directly
-            markdown = await method(pdf_path, page_num=1, **kwargs)
+            markdown = await method(pdf_path, page_num=page_num, **kwargs)
         else:
             # Run synchronous function in the executor
-            markdown = await run_sync_in_executor(method, pdf_path, page_num=1, **kwargs)
+            markdown = await run_sync_in_executor(method, pdf_path, page_num=page_num, **kwargs)
@@ -101,21 +102,25 @@ async def process_pdfs(config, pdf_directory, data_directory, repeats, force, max_parallel):
         task_descriptions = {}
 
         for pdf_path in all_pdfs:
+            pdf = PdfReader(pdf_path)
+            num_pages = len(pdf.pages)
+
             base_name = os.path.basename(pdf_path).replace(".pdf", "")
-            for i in range(1, repeats + 1):
-                output_filename = f"{base_name}_{i}.md"
-                output_path = os.path.join(candidate_output_dir, output_filename)
-
-                if os.path.exists(output_path) and not force:
-                    print(f"Skipping {base_name}_{i} for {candidate}, file already exists")
-                    print("Rerun with --force flag to force regeneration")
-                    continue
-
-                task = process_pdf(pdf_path, method, kwargs, output_path, is_async)
-                tasks.append(task)
-                task_descriptions[id(task)] = f"{base_name}_{i} ({candidate})"
-
+            for repeat in range(1, repeats + 1):
+                for page_num in range(1, num_pages + 1):
+                    output_filename = f"{base_name}_pg{page_num}_repeat{repeat}.md"
+                    output_path = os.path.join(candidate_output_dir, output_filename)
+
+                    if os.path.exists(output_path) and not force:
+                        print(f"Skipping {base_name}_pg{page_num}_repeat{repeat} for {candidate}, file already exists")
+                        print("Rerun with --force flag to force regeneration")
+                        continue
+
+                    task = process_pdf(pdf_path, page_num, method, kwargs, output_path, is_async)
+                    tasks.append(task)
+                    task_descriptions[id(task)] = f"{base_name}_pg{page_num}_repeat{repeat} ({candidate})"
+
         # Process tasks with semaphore to limit concurrency
         semaphore = asyncio.Semaphore(max_parallel or 1)  # Default to 1 if not specified
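The fan-out in process_pdfs is now pages × repeats per PDF rather than just repeats. A small illustrative helper (not part of the diff; expected_outputs is a hypothetical name) that enumerates the filenames the loop above will produce:

```python
import os
from pypdf import PdfReader

def expected_outputs(pdf_path: str, repeats: int) -> list[str]:
    """Enumerate the per-page, per-repeat output names for one PDF."""
    base_name = os.path.basename(pdf_path).replace(".pdf", "")
    num_pages = len(PdfReader(pdf_path).pages)
    return [
        f"{base_name}_pg{page_num}_repeat{repeat}.md"
        for repeat in range(1, repeats + 1)   # repeat is the outer loop, as above
        for page_num in range(1, num_pages + 1)
    ]
```

A 3-page doc.pdf with repeats=2 yields six files, doc_pg1_repeat1.md through doc_pg3_repeat2.md.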
a/olmocr/bench/runners/run_chatgpt.py
+++ b/olmocr/bench/runners/run_chatgpt.py
@@ -26,6 +26,10 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06"
     # Convert the first page of the PDF to a base64-encoded PNG image.
     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
+
+    if not os.getenv("OPENAI_API_KEY"):
+        raise SystemExit("You must specify an OPENAI_API_KEY")
+
     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
     response = client.chat.completions.create(
diff --git a/olmocr/bench/runners/run_claude.py b/olmocr/bench/runners/run_claude.py
index e7148a9..d71f7c6 100644
--- a/olmocr/bench/runners/run_claude.py
+++ b/olmocr/bench/runners/run_claude.py
@@ -27,6 +27,9 @@ def run_claude(pdf_path: str, page_num: int = 1, model: str = "claude-3-7-sonnet
         str: The OCR result in markdown format.
     """
+    if not os.getenv("ANTHROPIC_API_KEY"):
+        raise SystemExit("You must specify an ANTHROPIC_API_KEY")
+
     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
     client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
diff --git a/olmocr/bench/runners/run_gemini.py b/olmocr/bench/runners/run_gemini.py
index fcd47a4..b44ef62 100644
--- a/olmocr/bench/runners/run_gemini.py
+++ b/olmocr/bench/runners/run_gemini.py
@@ -24,6 +24,9 @@ def run_gemini(pdf_path: str, page_num: int = 1, model: str = "gemini-2.0-flash"
     Returns:
         str: The OCR result in markdown format.
     """
+    if not os.getenv("GEMINI_API_KEY"):
+        raise SystemExit("You must specify a GEMINI_API_KEY")
+
     image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
     api_key = os.getenv("GEMINI_API_KEY")
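The same early-exit guard is pasted into each of the four runners. If more runners are added it could be factored into a shared helper; a sketch under that assumption (require_env is hypothetical, not in the diff):

```python
import os

def require_env(name: str) -> str:
    """Exit with a clear message when a required API key is missing."""
    value = os.getenv(name)
    if not value:
        raise SystemExit(f"You must specify a {name}")
    return value

# e.g., in run_gemini: api_key = require_env("GEMINI_API_KEY")
```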
""" + if not os.getenv("MISTRAL_API_KEY"): + raise SystemExit("You must specify an MISTRAL_API_KEY") + api_key = os.environ["MISTRAL_API_KEY"] client = Mistral(api_key=api_key) diff --git a/olmocr/bench/sample_data/dataset.jsonl b/olmocr/bench/sample_data/dataset.jsonl index fbe69f4..f962ac2 100644 --- a/olmocr/bench/sample_data/dataset.jsonl +++ b/olmocr/bench/sample_data/dataset.jsonl @@ -74,4 +74,4 @@ {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"} {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"} {"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "We also thank Ján Mináč for his constant encouragement and support."} -{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_04", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"} +{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_05", "type": "present", "text": "Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringe"} diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py index 6df28f8..02c7030 100644 --- a/olmocr/bench/tests.py +++ b/olmocr/bench/tests.py @@ -581,10 +581,13 @@ def load_tests(jsonl_file: str) -> List[BasePDFTest]: tests.append(test) except json.JSONDecodeError as e: print(f"Error parsing JSON on line {line_number}: {e}") + raise except (ValidationError, KeyError) as e: print(f"Error on line {line_number}: {e}") + raise except Exception as e: print(f"Unexpected error on line {line_number}: {e}") + raise return tests diff --git a/pyproject.toml b/pyproject.toml index bca39ee..30c9985 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ bench = [ "google-genai", "google-generativeai", "playwright", + "mistralai", ] train = [