From 07be9ea6e34e88b98906d4ee8fd9f5a858285331 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Mon, 10 Mar 2025 21:55:33 +0000 Subject: [PATCH] More math testing --- olmocr/bench/benchmark.py | 4 ++++ olmocr/bench/katex/render.py | 12 ++++++++++++ olmocr/bench/sample_data/dataset.jsonl | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py index f21b39a..a7da1f1 100644 --- a/olmocr/bench/benchmark.py +++ b/olmocr/bench/benchmark.py @@ -21,6 +21,7 @@ import sys from typing import Dict, List, Tuple, Optional from .tests import BasePDFTest, BaselineTest, load_tests +from .katex.render import clear_cache_dir from .utils import calculate_bootstrap_ci, perform_permutation_test def evaluate_candidate( @@ -152,6 +153,9 @@ def main(): ci_level = args.confidence_level pdf_folder = os.path.join(input_folder, "pdfs") + # Clear equation cache directory + clear_cache_dir() + # Check that the pdfs folder exists if not os.path.exists(pdf_folder): print("Error: /pdfs folder must exist in your data directory.", file=sys.stderr) diff --git a/olmocr/bench/katex/render.py b/olmocr/bench/katex/render.py index 7fa41fd..4708a6d 100644 --- a/olmocr/bench/katex/render.py +++ b/olmocr/bench/katex/render.py @@ -16,6 +16,7 @@ import hashlib import pathlib import json import re +import shutil from dataclasses import dataclass from typing import List import unittest @@ -55,6 +56,17 @@ def get_cache_dir(): cache_dir.mkdir(parents=True, exist_ok=True) return cache_dir + +def clear_cache_dir(): + """ + Clear all files and subdirectories in the cache directory. + """ + cache_dir = get_cache_dir() + if cache_dir.exists() and cache_dir.is_dir(): + shutil.rmtree(cache_dir) + cache_dir.mkdir(parents=True, exist_ok=True) # Recreate the empty directory + + def render_equation( equation, bg_color="white", diff --git a/olmocr/bench/sample_data/dataset.jsonl b/olmocr/bench/sample_data/dataset.jsonl index 820cbf5..de12ff2 100644 --- a/olmocr/bench/sample_data/dataset.jsonl +++ b/olmocr/bench/sample_data/dataset.jsonl @@ -67,3 +67,8 @@ {"pdf": "mathfuncs_colswitch.pdf", "page": 1, "id": "mathfuncscol_00", "type": "order", "before": "Euler's Identity", "after": "Pythagorean Theorem"} {"pdf": "mathfuncs_colswitch.pdf", "page": 1, "id": "mathfuncscol_01", "type": "order", "before": "Pythagorean Theorem", "after": "The Fundamental Theorem of Calculus"} {"pdf": "mathfuncs_colswitch.pdf", "page": 1, "id": "mathfuncscol_02", "type": "order", "before": "The Fundamental Theorem of Calculus", "after": "Maxwell's Equations"} + +{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_00", "type": "math", "math": "\\lambda_{g}=\\sum_{s \\in S} \\zeta_{n}^{\\psi(g s)}=\\sum_{i=1}^{k}\\left[\\sum_{s, R s=\\mathcal{I}_{i}} \\zeta_{n}^{\\varphi(g s)}\\right]"} +{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_01", "type": "math", "math": "\\lambda_{g}=\\lambda_{g^{\\prime}}"} +{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_02", "type": "math", "math": "u \\in\\left(R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)^{\\times}"} +{"pdf": "math_2503_04086.pdf", "page": 1, "id": "math_2503_04086_03", "type": "math", "math": "\\lambda_{g}=\\sum_{i=1}^{k} c\\left(g, R / \\operatorname{Ann}_{R}\\left(x_{i}\\right)\\right)"}