diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py index aff9de4..f21b39a 100644 --- a/olmocr/bench/benchmark.py +++ b/olmocr/bench/benchmark.py @@ -24,7 +24,7 @@ from .tests import BasePDFTest, BaselineTest, load_tests from .utils import calculate_bootstrap_ci, perform_permutation_test def evaluate_candidate( - candidate_folder: str, all_tests: List[BasePDFTest], pdf_basenames: List[str] + candidate_folder: str, all_tests: List[BasePDFTest], pdf_basenames: List[str], force: bool=False ) -> Tuple[float, int, List[str], List[str], Dict[str, List[float]], List[float]]: """ For the candidate folder (pipeline tool output), validate that it contains at least one .md file @@ -53,7 +53,7 @@ def evaluate_candidate( md_base = os.path.splitext(pdf_name)[0] md_pattern = os.path.join(candidate_folder, f"{md_base}_*.md") md_files = glob.glob(md_pattern) - if not md_files: + if not md_files and not force: candidate_errors.append(f"Candidate '{candidate_name}' is missing MD repeats for {pdf_name} (expected files matching {md_base}_*.md).") else: pdf_to_md_files[pdf_name] = md_files @@ -117,6 +117,11 @@ def main(): default=os.path.join(os.path.dirname(__file__), "sample_data"), help="Path to the folder containing .jsonl files, /pdfs folder, and pipeline tool subfolders.", ) + parser.add_argument( + "--force", + action="store_true", + help="Run benchmark even if some files are missing", + ) parser.add_argument( "--candidate", type=str, @@ -205,7 +210,7 @@ def main(): for candidate in candidate_folders: candidate_name = os.path.basename(candidate) overall_score, total_tests, candidate_errors, test_failures, test_type_breakdown, all_test_scores = evaluate_candidate( - candidate, all_tests, pdf_basenames + candidate, all_tests, pdf_basenames, args.force, ) # Calculate confidence interval diff --git a/olmocr/bench/katex/__init__.py b/olmocr/bench/katex/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/olmocr/bench/katex/compare.py 
b/olmocr/bench/katex/compare.py deleted file mode 100644 index 1f5cfce..0000000 --- a/olmocr/bench/katex/compare.py +++ /dev/null @@ -1,296 +0,0 @@ -import unittest -import numpy as np -import torch -import torch.nn.functional as F -from PIL import Image - -def find_image_match(large_pil, small_pil, device=None) -> tuple[float, int, int]: - """ - Finds the best matching location of a small image inside a large image using 2D convolution. - Returns a matching score and the coordinates of the best match. - - The matching score is computed using Intersection over Union (IoU) of binary images. - Each image is converted to a binary mask where pixels > 0.5 are True, otherwise False. - The IoU is calculated as: (intersection) / (union) of the two binary masks. - - If the extracted patch and the template differ in shape (which can happen when the patch goes out of bounds), - the smaller array is padded with False values so that they can be compared elementwise. - - Args: - large_pil (PIL.Image): The large image. - small_pil (PIL.Image): The small image (patch). - device (str, optional): "cuda" or "cpu". If None, auto-select based on availability. - - Returns: - (score, x, y): - - score: IoU matching score (0.0 to 1.0) - - x, y: Coordinates (top-left corner) of the best match in the large image. - """ - # Auto-select device. - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - - # Convert images to grayscale and normalize to [0, 1] - large_img = np.array(large_pil.convert("L"), dtype=np.float32) / 255.0 - small_img = np.array(small_pil.convert("L"), dtype=np.float32) / 255.0 - - # If the "small" image is larger than the "large" image in any dimension, swap them. 
- if small_img.shape[0] > large_img.shape[0] or small_img.shape[1] > large_img.shape[1]: - small_img, large_img = large_img, small_img - - # Convert images to torch tensors with shape (1, 1, H, W) - large_tensor = torch.tensor(large_img).unsqueeze(0).unsqueeze(0).to(device) - small_tensor = torch.tensor(small_img).unsqueeze(0).unsqueeze(0).to(device) - - # Normalize the small image (template) for proper correlation calculation. - small_mean = torch.mean(small_tensor) - small_std = torch.std(small_tensor) - small_normalized = (small_tensor - small_mean) / (small_std + 1e-7) - - # Perform convolution with same padding. - result = F.conv2d(large_tensor, small_normalized, padding="same") - - # Find the maximum correlation value and its flat index. - max_val, max_loc = torch.max(result.view(-1), 0) - - # Extract the coordinates from the convolution result. - if result.squeeze().dim() == 0: - conv_y, conv_x = 0, 0 - else: - result_size = result.squeeze().size() # expected shape: (H, W) - conv_y = (max_loc // result_size[1]).item() - conv_x = (max_loc % result_size[1]).item() - - # Compute the offset introduced by "same" padding. - patch_h, patch_w = small_img.shape - offset_y = (patch_h - 1) // 2 - offset_x = (patch_w - 1) // 2 - - # Adjust the convolution coordinate to get the top-left corner of the patch. - match_y = conv_y - offset_y - match_x = conv_x - offset_x - - # Clamp coordinates to be within valid bounds. - match_y = max(0, min(match_y, large_img.shape[0] - patch_h)) - match_x = max(0, min(match_x, large_img.shape[1] - patch_w)) - - # Extract the corresponding patch from the large image. - large_patch = large_img[match_y:match_y+patch_h, match_x:match_x+patch_w] - - # If there is a shape mismatch (e.g. 
when near image boundaries), pad the smaller array with False (0.0) - if large_patch.shape != small_img.shape: - target_shape = (max(large_patch.shape[0], small_img.shape[0]), - max(large_patch.shape[1], small_img.shape[1])) - def pad_to_shape(arr, target_shape, pad_value=0.0): - pad_h = target_shape[0] - arr.shape[0] - pad_w = target_shape[1] - arr.shape[1] - return np.pad(arr, ((0, pad_h), (0, pad_w)), mode='constant', constant_values=pad_value) - if large_patch.shape != target_shape: - large_patch = pad_to_shape(large_patch, target_shape, pad_value=0.0) - if small_img.shape != target_shape: - small_img = pad_to_shape(small_img, target_shape, pad_value=0.0) - - # Create binary masks (True if > 0.5, else False) - large_binary = large_patch > 0.5 - small_binary = small_img > 0.5 - - # Create masks for very bright pixels (> 0.99) - large_white = large_patch > 0.99 - small_white = small_img > 0.99 - - # Create a mask for pixels to exclude (where both images are very bright) - exclude_mask = np.logical_and(large_white, small_white) - - # Apply the exclusion mask to the binary masks - large_binary_filtered = np.logical_and(large_binary, ~exclude_mask) - small_binary_filtered = np.logical_and(small_binary, ~exclude_mask) - - # Calculate intersection and union on the filtered binary masks - intersection = np.logical_and(large_binary_filtered, small_binary_filtered).sum() - union = np.logical_or(large_binary_filtered, small_binary_filtered).sum() - - # Calculate IoU score - # Handle the case where union is zero (both images empty) - if union == 0: - score = 1.0 if intersection == 0 else 0.0 - else: - score = float(intersection / union) - - return score, match_x, match_y - - -class TestFindImageMatch(unittest.TestCase): - def setUp(self): - np.random.seed(42) - torch.manual_seed(42) - - def create_random_image(self, shape): - """ - Create a random grayscale image with the given shape (height, width). - Pixel values are in the range 0-255. 
- """ - arr = np.random.randint(0, 256, shape, dtype=np.uint8) - return Image.fromarray(arr, mode='L') - - def test_exact_match(self): - """Test that a patch cropped from a larger image is found at the correct location.""" - np.random.seed(123) - large_array = np.random.randint(0, 256, (100, 100), dtype=np.uint8) - large_pil = Image.fromarray(large_array, mode='L') - - top, left = 30, 40 # expected best_y, best_x - patch_height, patch_width = 20, 20 - patch_array = large_array[top:top+patch_height, left:left+patch_width] - small_pil = Image.fromarray(patch_array, mode='L') - - score, best_x, best_y = find_image_match(large_pil, small_pil) - self.assertEqual(best_x, left, f"Expected best_x to be {left} but got {best_x}") - self.assertEqual(best_y, top, f"Expected best_y to be {top} but got {best_y}") - self.assertGreater(score, 0.99, f"Expected high score for an exact match, got {score}") - - def test_full_image_match(self): - """Test when the small image is identical to the large image.""" - large_pil = self.create_random_image((50, 50)) - small_pil = large_pil.copy() - score, best_x, best_y = find_image_match(large_pil, small_pil) - self.assertEqual(best_x, 0, f"Expected best_x to be 0, got {best_x}") - self.assertEqual(best_y, 0, f"Expected best_y to be 0, got {best_y}") - self.assertGreater(score, 0.99, f"Expected high score for an exact match, got {score}") - - def test_swap_images(self): - """ - Test the swapping logic by passing in images in reversed order. - When the "small" image is actually larger than the "large" image, - the function should swap them internally. - """ - large_img = self.create_random_image((100, 100)) - large_array = np.array(large_img) - top, left = 20, 20 - patch_height, patch_width = 40, 40 - patch_array = large_array[top:top+patch_height, left:left+patch_width] - small_img = Image.fromarray(patch_array, mode='L') - # Pass in swapped: the larger image as the patch and vice versa. 
- score, best_x, best_y = find_image_match(small_img, large_img) - self.assertEqual(best_x, left, f"Expected best_x to be {left} after swap, got {best_x}") - self.assertEqual(best_y, top, f"Expected best_y to be {top} after swap, got {best_y}") - - def test_single_pixel_match(self): - """Test the function with 1x1 images.""" - arr = np.array([[128]], dtype=np.uint8) - pil_img = Image.fromarray(arr, mode='L') - score, best_x, best_y = find_image_match(pil_img, pil_img) - self.assertEqual(best_x, 0) - self.assertEqual(best_y, 0) - self.assertGreater(score, 0.99, f"Expected high score for an exact match, got {score}") - - def test_out_of_bounds_coordinates(self): - """ - Test that the returned best match coordinates are within - the bounds of the large image. - """ - large_pil = self.create_random_image((80, 80)) - large_array = np.array(large_pil) - left, top = 30, 30 - patch_width, patch_height = 20, 20 - patch_array = large_array[top:top+patch_height, left:left+patch_width] - small_pil = Image.fromarray(patch_array, mode='L') - score, best_x, best_y = find_image_match(large_pil, small_pil) - width, height = large_pil.size - self.assertTrue(0 <= best_x < width, f"best_x {best_x} is out of bounds for width {width}") - self.assertTrue(0 <= best_y < height, f"best_y {best_y} is out of bounds for height {height}") - - def test_padding_mismatch(self): - """ - Test a case where the computed patch from the large image is smaller than the - template due to clamping at the boundary. In such cases, the smaller array should be - padded with white pixels (1.0) so that both arrays have the same shape. - - Here, we force a mismatch by providing images whose sizes (after potential swap) cause - the extracted patch to be truncated. - """ - # Create a "large" image that is too small in height compared to the template. 
- # For example, after swap, effective large image will be (50, 150) and template is (100, 100) - large_pil = self.create_random_image((50, 150)) - small_pil = self.create_random_image((100, 100)) - # Calling with these images: since small_pil is larger in height than large_pil, - # a swap will occur. After swap: - # effective large image: (100, 100) - # effective small image: (50, 150) - # Then patch size is taken from effective small image: (50, 150) - # However, the effective large image is (100, 100), so when extracting a patch of size (50,150) - # from a (100, 100) image (clamped to width 100), the patch will be (50, 100). - # Our padding logic should pad the extracted patch from width 100 to 150. - score, best_x, best_y = find_image_match(large_pil, small_pil) - # After padding, the per-pixel score should be computed without error. - self.assertIsInstance(score, float) - self.assertGreaterEqual(score, 0.0) - self.assertLessEqual(score, 1.0) - - -class TestRenderMathMatches(unittest.TestCase): - # def testBasicMatch1(self): - # from .render import render_equation - - # ref_image = render_equation("\int_{a}^{b} f(x) \, dx = F(b) - F(a)") - # hyp_image = render_equation("\int_a^b f(x) \, dx = F(b) - F(a)") - - # score, best_x, best_y = find_image_match(ref_image, hyp_image) - # self.assertGreater(score, 0.99) - - # def testBasicMatch2(self): - # from .render import render_equation - - # ref_image = render_equation("s(t) = t^2 + 8t - 1") - # hyp_image = render_equation("s(t) = t^2 + 8t + 1") - - # score, best_x, best_y = find_image_match(ref_image, hyp_image) - # print("Should be high diff") - # print(score, best_x, best_y) - # self.assertLess(score, 0.95) - - # def testBasicMatch3(self): - # from .render import render_equation - - # ref_image = render_equation("s(t) = t^2 + 8t - 1") - - # new_image = Image.new(ref_image.mode, (ref_image.width + 20, ref_image.height), (255, 255, 255)) - - # # Paste the original image onto the new image, offset by the padding 
amount - # new_image.paste(ref_image, (20, 0)) - - # score, best_x, best_y = find_image_match(ref_image, new_image) - # print("Should be exactly the same, shfited over") - # print(score, best_x, best_y) - # self.assertGreater(score, 0.99) - # self.assertEqual(best_x, 20) - # self.assertEqual(best_y, 0) - - # def testBasicMatch4(self): - # from .render import render_equation - - # ref_image = render_equation("s(t) = t^2 + 8t - 1") - # hyp_image = render_equation("e^{i\pi} + 1 = 0") - - # score, best_x, best_y = find_image_match(ref_image, hyp_image) - # print("Should be way off") - # print(score, best_x, best_y) - # self.assertLess(score, 0.5) - - def testMultiline(self): - from .render import render_equation - - ref_image = render_equation("\\nabla \\cdot \\mathbf{E} = \\frac{\\rho}{\\varepsilon_0}") - hyp_image = render_equation("""\\begin{align*}\\nabla \\cdot \\mathbf{E} = \\frac{\\rho}{\\varepsilon_0}\\end{align*}""") - - ref_image.save("ref1.png") - hyp_image.save("hyp1.png") - - score, best_x, best_y = find_image_match(ref_image, hyp_image) - print("Should be way in there") - print(score, best_x, best_y) - self.assertGreater(score, 0.95) - - - -if __name__ == "__main__": - unittest.main() diff --git a/olmocr/bench/katex/render.py b/olmocr/bench/katex/render.py index 3faf399..7fa41fd 100644 --- a/olmocr/bench/katex/render.py +++ b/olmocr/bench/katex/render.py @@ -1,45 +1,55 @@ #!/usr/bin/env python3 """ -Render LaTeX equations to Pillow images using Playwright and KaTeX -with SHA1-based caching mechanism. +Extract inner-most spans and their bounding boxes, and the mathML output, +from rendered LaTeX equations using Playwright and KaTeX. +Caching is maintained via a SHA1-based hash stored as a JSON file. 
Requirements: - pip install playwright pillow + pip install playwright python -m playwright install chromium - + Place katex.min.css and katex.min.js in the same directory as this script """ import os import hashlib import pathlib +import json +import re +from dataclasses import dataclass +from typing import List +import unittest +import xml.etree.ElementTree as ET -from PIL import Image from playwright.sync_api import sync_playwright, Error as PlaywrightError +@dataclass +class BoundingBox: + x: float + y: float + width: float + height: float + +@dataclass +class SpanInfo: + text: str + bounding_box: BoundingBox + +@dataclass +class RenderedEquation: + mathml: str + spans: List[SpanInfo] + def get_equation_hash(equation, bg_color="white", text_color="black", font_size=24): """ Calculate SHA1 hash of the equation string and rendering parameters. - - Args: - equation (str): LaTeX equation to hash - bg_color (str): Background color - text_color (str): Text color - font_size (int): Font size in pixels - - Returns: - str: SHA1 hash of the equation and parameters """ - # Combine all parameters that affect the output into a single string params_str = f"{equation}|{bg_color}|{text_color}|{font_size}" return hashlib.sha1(params_str.encode('utf-8')).hexdigest() def get_cache_dir(): """ Get the cache directory for equations, creating it if it doesn't exist. - - Returns: - pathlib.Path: Path to the cache directory """ cache_dir = pathlib.Path.home() / '.cache' / 'olmocr' / 'bench' / 'equations' cache_dir.mkdir(parents=True, exist_ok=True) @@ -54,51 +64,52 @@ def render_equation( debug_dom=False, ): """ - Render a LaTeX equation to a Pillow Image using Playwright and KaTeX. - Uses caching based on SHA1 hash of the equation. 
- - Args: - equation (str): LaTeX equation to render - bg_color (str): Background color - text_color (str): Text color - font_size (int): Font size in pixels - use_cache (bool): Whether to use caching - debug_dom (bool): Whether to print the KaTeX DOM structure - + Render a LaTeX equation using Playwright and KaTeX, extract the inner-most span elements + (those without child elements that contain non-whitespace text) along with their bounding boxes, + and also extract the MathML output generated by KaTeX. + Returns: - PIL.Image.Image: Pillow image of the rendered equation + RenderedEquation: A dataclass containing the mathml string and a list of SpanInfo dataclasses. """ - # Calculate the equation's hash for caching, including all rendering parameters + # Calculate hash for caching eq_hash = get_equation_hash(equation, bg_color, text_color, font_size) cache_dir = get_cache_dir() - cache_file = cache_dir / f"{eq_hash}.png" + cache_file = cache_dir / f"{eq_hash}.json" cache_error_file = cache_dir / f"{eq_hash}_error" - # Check if the equation is already cached if use_cache: if cache_error_file.exists(): return None - if cache_file.exists(): - return Image.open(cache_file) + with open(cache_file, 'r') as f: + data = json.load(f) + spans = [ + SpanInfo( + text=s["text"], + bounding_box=BoundingBox( + x=s["boundingBox"]["x"], + y=s["boundingBox"]["y"], + width=s["boundingBox"]["width"], + height=s["boundingBox"]["height"], + ) + ) + for s in data["spans"] + ] + return RenderedEquation(mathml=data["mathml"], spans=spans) - # We need to escape backslashes for JavaScript string - escaped_equation = equation.replace("\\", "\\\\") + # Escape backslashes for JavaScript string + escaped_equation = json.dumps(equation) - # Get the directory of the script to reference local files + # Get local paths for KaTeX files script_dir = os.path.dirname(os.path.abspath(__file__)) katex_css_path = os.path.join(script_dir, "katex.min.css") katex_js_path = os.path.join(script_dir, 
"katex.min.js") - # Check if the files exist if not os.path.exists(katex_css_path) or not os.path.exists(katex_js_path): raise FileNotFoundError(f"KaTeX files not found. Please ensure katex.min.css and katex.min.js are in {script_dir}") with sync_playwright() as p: - # Launch a headless browser browser = p.chromium.launch() - - # Create a new page with a reasonable viewport size page = browser.new_page(viewport={"width": 800, "height": 400}) # Basic HTML structure @@ -117,7 +128,7 @@ def render_equation( color: {text_color}; }} #equation-container {{ - padding: 0px; + padding: 0; font-size: {font_size}px; }} @@ -127,131 +138,225 @@ def render_equation( """ - - # Set the page content page.set_content(html) - - # Add KaTeX CSS and JS files page.add_style_tag(path=katex_css_path) page.add_script_tag(path=katex_js_path) - page.wait_for_load_state("networkidle") - # Check if KaTeX is properly loaded katex_loaded = page.evaluate("typeof katex !== 'undefined'") if not katex_loaded: raise RuntimeError("KaTeX library failed to load. 
Check your katex.min.js file.") - # Render the equation and check for errors try: - has_error = page.evaluate(f""" + error_message = page.evaluate(f""" () => {{ try {{ - katex.render("{escaped_equation}", document.getElementById("equation-container"), {{ + katex.render({escaped_equation}, document.getElementById("equation-container"), {{ displayMode: true, throwOnError: true }}); - return false; // No error + return null; }} catch (error) {{ console.error("KaTeX error:", error.message); - return true; // Error occurred + return error.message; }} }} """) except PlaywrightError as ex: - has_error = True + print(escaped_equation) + error_message = str(ex) + raise - if has_error: + if error_message: print(f"Error rendering equation: '{equation}'") + print(error_message) cache_error_file.touch() browser.close() return None - # Wait for the equation to be rendered page.wait_for_selector(".katex", state="attached") - # Extract and print KaTeX DOM HTML if debug_dom is enabled if debug_dom: - # Get the HTML structure of the rendered equation katex_dom_html = page.evaluate(""" () => { - const container = document.getElementById("equation-container"); - return container.innerHTML; + return document.getElementById("equation-container").innerHTML; } """) - - # Print the KaTeX DOM HTML print("\n===== KaTeX DOM HTML =====") print(katex_dom_html) - # Get the container element and take a screenshot - container = page.query_selector("#equation-container") + # Extract inner-most spans with non-whitespace text + spans_info = page.evaluate(""" + () => { + const spans = Array.from(document.querySelectorAll('span')); + const list = []; + spans.forEach(span => { + // Check if this span has no child elements and contains non-whitespace text + if (span.children.length === 0 && /\S/.test(span.textContent)) { + const rect = span.getBoundingClientRect(); + list.push({ + text: span.textContent.trim(), + boundingBox: { + x: rect.x, + y: rect.y, + width: rect.width, + height: rect.height + } + 
}); + } + }); + return list; + } + """) - # Take the screenshot - container.screenshot(path=str(cache_file)) + if debug_dom: + print("\n===== Extracted Span Information =====") + print(spans_info) + + # Extract mathML output (if available) from the KaTeX output. + # We try to get the element within an element with class "katex-mathml". + mathml = page.evaluate(""" + () => { + const mathElem = document.querySelector('.katex-mathml math'); + return mathElem ? mathElem.outerHTML : ""; + } + """) - # Close the browser browser.close() - # Return the image as a Pillow Image - return Image.open(cache_file) + # Build the result as a RenderedEquation dataclass + rendered_eq = RenderedEquation( + mathml=mathml, + spans=[ + SpanInfo( + text=s["text"], + bounding_box=BoundingBox( + x=s["boundingBox"]["x"], + y=s["boundingBox"]["y"], + width=s["boundingBox"]["width"], + height=s["boundingBox"]["height"] + ) + ) + for s in spans_info + ] + ) + + # Save to cache (convert dataclasses to a JSON-serializable dict) + cache_data = { + "mathml": rendered_eq.mathml, + "spans": [ + { + "text": span.text, + "boundingBox": { + "x": span.bounding_box.x, + "y": span.bounding_box.y, + "width": span.bounding_box.width, + "height": span.bounding_box.height + } + } + for span in rendered_eq.spans + ] + } + with open(cache_file, 'w') as f: + json.dump(cache_data, f) + return rendered_eq -def main(): - # Example equation: Einstein's famous equation - simple_equation = "E = mc^2" - - # More complex equation: Quadratic formula - complex_equation = "x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}" - - # Maxwell's equations in differential form - maxwell_equation = "\\begin{aligned} \\nabla \\cdot \\vec{E} &= \\frac{\\rho}{\\epsilon_0} \\\\ \\nabla \\cdot \\vec{B} &= 0 \\\\ \\nabla \\times \\vec{E} &= -\\frac{\\partial\\vec{B}}{\\partial t} \\\\ \\nabla \\times \\vec{B} &= \\mu_0 \\vec{J} + \\mu_0\\epsilon_0\\frac{\\partial\\vec{E}}{\\partial t} \\end{aligned}" - - # Render the equations - # Default 
parameters - bg_color = "white" - text_color = "black" - font_size = 24 - - print("========== Rendering Einstein's Equation ==========") - image1 = render_equation(simple_equation, bg_color, text_color, font_size) - image1.save("einstein_equation.png") - print(f"Einstein's equation hash: {get_equation_hash(simple_equation, bg_color, text_color, font_size)}") - - print("\n========== Rendering Quadratic Formula ==========") - image2 = render_equation(complex_equation, bg_color, text_color, font_size) - image2.save("quadratic_formula.png") - print(f"Quadratic formula hash: {get_equation_hash(complex_equation, bg_color, text_color, font_size)}") - - # Different styling for Maxwell's equations - maxwell_bg = "black" - maxwell_text = "white" - maxwell_size = 20 - - print("\n========== Rendering Maxwell's Equations ==========") - image3 = render_equation(maxwell_equation, maxwell_bg, maxwell_text, maxwell_size) - image3.save("maxwell_equations.png") - print(f"Maxwell's equations hash: {get_equation_hash(maxwell_equation, maxwell_bg, maxwell_text, maxwell_size)}") - - # Example of retrieving from cache with same parameters - print("\n========== Retrieving Einstein's Equation from Cache ==========") - # Set debug_dom to False for cached version to avoid duplicate debug output - image_from_cache = render_equation(simple_equation, bg_color, text_color, font_size, debug_dom=False) - print("Retrieved Einstein's equation from cache.") - - # Example of different styling for the same equation (will render and cache separately) - alt_bg = "lightblue" - alt_text = "darkblue" - alt_size = 30 - - print("\n========== Rendering Einstein's Equation with Alternate Style ==========") - image_alt_style = render_equation(simple_equation, alt_bg, alt_text, alt_size) - image_alt_style.save("einstein_equation_alt_style.png") - print(f"Einstein's equation with alternate style hash: {get_equation_hash(simple_equation, alt_bg, alt_text, alt_size)}") - print("\n========== Rendering Invalid Equation 
==========") - invalid = render_equation("$150. \\quad s(t) = 2t^3 - 3t^2 - 12t + 8") +def compare_rendered_equations(haystack: RenderedEquation, needle: RenderedEquation) -> bool: + """ + Compare two rendered equations by cleaning the MathML (removing namespaces), + extracting the inner content of any element (ignoring ), + normalizing whitespace, and checking if the needle's inner MathML is a substring + of the haystack's inner MathML. + """ + + def strip_namespaces(elem: ET.Element) -> ET.Element: + """ + Recursively remove namespace prefixes from an ElementTree element. + """ + for sub in elem.iter(): + if '}' in sub.tag: + sub.tag = sub.tag.split('}', 1)[1] + return elem + + def extract_inner(mathml: str) -> str: + """ + Parse the MathML, remove namespaces, and if a element exists, + concatenate the string representations of its children (except ). + Otherwise, return the whole cleaned MathML. + """ + try: + root = ET.fromstring(mathml) + root = strip_namespaces(root) + semantics = root.find('semantics') + if semantics is not None: + inner_parts = [] + for child in semantics: + if child.tag != 'annotation': + inner_parts.append(ET.tostring(child, encoding='unicode')) + return ''.join(inner_parts) + else: + return ET.tostring(root, encoding='unicode') + except Exception as e: + # For debugging purposes, print the error + print("Error parsing MathML:", e) + return mathml + + def normalize(s: str) -> str: + """ + Remove all whitespace from the string. + """ + return re.sub(r'\s+', '', s) + + # Clean and extract the inner MathML for both haystack and needle. + haystack_inner = normalize(extract_inner(haystack.mathml)) + needle_inner = normalize(extract_inner(needle.mathml)) + + # # For debugging: print the cleaned MathML strings. + # print("Cleaned haystack MathML:", haystack_inner) + # print("Cleaned needle MathML:", needle_inner) + + # If needle is longer than haystack, swap them. 
+ if len(needle_inner) > len(haystack_inner): + needle_inner, haystack_inner = haystack_inner, needle_inner + + return needle_inner in haystack_inner + +class TestRenderedEquationComparison(unittest.TestCase): + def test_exact_match(self): + # Both calls with identical LaTeX should produce matching MathML output. + eq1 = render_equation("a+b", use_cache=False) + eq2 = render_equation("a+b", use_cache=False) + self.assertTrue(compare_rendered_equations(eq1, eq2)) - print("\nAll equations rendered successfully!") + def test_whitespace_difference(self): + # Differences in whitespace in the LaTeX input should not affect the MathML output. + eq1 = render_equation("a+b", use_cache=False) + eq2 = render_equation("a + b", use_cache=False) + self.assertTrue(compare_rendered_equations(eq1, eq2)) + + def test_not_found(self): + # Completely different equations should not match. + eq1 = render_equation("c-d", use_cache=False) + eq2 = render_equation("a+b", use_cache=False) + self.assertFalse(compare_rendered_equations(eq1, eq2)) + + def test_align_block_contains_needle(self): + # The MathML output of the plain equation should be found within the align block output. + eq_plain = render_equation("a+b", use_cache=False) + eq_align = render_equation("\\begin{align*}a+b\\end{align*}", use_cache=False) + self.assertTrue(compare_rendered_equations(eq_align, eq_plain)) + + def test_align_block_needle_not_in(self): + # An align block rendering a different equation should not contain the MathML of an unrelated equation. 
+ eq_align = render_equation("\\begin{align*}a+b\\end{align*}", use_cache=False) + eq_diff = render_equation("c-d", use_cache=False) + self.assertFalse(compare_rendered_equations(eq_align, eq_diff)) + + def test_big(self): + ref_rendered = render_equation("\\nabla \\cdot \\mathbf{E} = \\frac{\\rho}{\\varepsilon_0}", use_cache=False, debug_dom=False) + align_rendered = render_equation("""\\begin{align*}\\nabla \\cdot \\mathbf{E} = \\frac{\\rho}{\\varepsilon_0}\\end{align*}""", use_cache=False, debug_dom=False) + self.assertTrue(compare_rendered_equations(ref_rendered, align_rendered)) if __name__ == "__main__": - main() \ No newline at end of file + unittest.main() diff --git a/olmocr/bench/sample_data/mistral/math_2503_04086_1.md b/olmocr/bench/sample_data/mistral/math_2503_04086_1.md new file mode 100644 index 0000000..d048c89 --- /dev/null +++ b/olmocr/bench/sample_data/mistral/math_2503_04086_1.md @@ -0,0 +1,33 @@ +Proof. Let $S$ be the generating set associated with $D$ as described in Proposition 2.5. By the circulant diagonalization theorem, the spectrum of $G_{R}(D)=\Gamma(R, S)$ is the multiset $\left\{\lambda_{g}\right\}_{g \in R}$ where + +$$ +\lambda_{g}=\sum_{s \in S} \zeta_{n}^{\varphi(g s)}=\sum_{i=1}^{k}\left[\sum_{s, R s=\mathcal{I}_{i}} \zeta_{n}^{\varphi(g s)}\right] +$$ + +We remark that by Corollary 2.7, if $s \in R$ such that $R s=\mathcal{I}_{i}=R x_{i}$ then $s$ has a unique representation of the form $s=\hat{u} x_{i}$ where $u \in\left(R / \operatorname{Ann}_{R}\left(x_{i}\right)\right)^{\times}$and $\hat{u}$ is a fixed lift of $u$ to $R^{\times}$. 
With this presentation, we can write + +$$ +\sum_{s, R s=\mathcal{I}_{i}} \zeta_{n}^{\varphi(g s)}=\sum_{u \in\left(R / \operatorname{Ann}_{R}\left(x_{i}\right)\right)^{\times}} \zeta_{n}^{\varphi\left(g u x_{i}\right)}=\sum_{u \in\left(R / \operatorname{Ann}_{R}\left(x_{i}\right)\right)^{\times}} \zeta_{n}^{\psi_{x_{i}}(g u)}=c\left(g, R / \operatorname{Ann}_{R}\left(x_{i}\right)\right) +$$ + +Here we recall that $\psi_{x_{i}}$ is the induced linear functional on $R / \operatorname{Ann}_{R}\left(x_{i}\right)$. We conclude that $\lambda_{g}=\sum_{i=1}^{k} c\left(g, R / \operatorname{Ann}_{R}\left(x_{i}\right)\right)$. + +The following corollary is simple yet important for our future work on perfect state transfers on gcd-graphs. + +Corollary 4.17. Suppose that $g^{\prime}=u g$ for some $u \in R^{\times}$. Then $\lambda_{g}=\lambda_{g^{\prime}}$. + +# ACKNOWLEDGEMENTS + +We thank the Department of Mathematics and Computer Science at Lake Forest College for their generous financial support through an Overleaf subscription. We also thank Ján Mináč for his constant encouragement and support. + +## REFERENCES + +1. Reza Akhtar, Megan Boggess, Tiffany Jackson-Henderson, Isidora Jiménez, Rachel Karpman, Amanda Kinzel, and Dan Pritikin, On the unitary Cayley graph of a finite ring, Electron. J. Combin. 16 (2009), no. 1, Research Paper 117, 13 pages. +2. Milan Bašić, Aleksandar Ilić, and Aleksandar Stamenković, Maximal diameter of integral circulant graphs, Information and Computation 301 (2024), 105208. +3. Maria Chudnovsky, Michal Cizek, Logan Crew, Ján Mináč, Tung T. Nguyen, Sophie Spirkl, and Nguyễn Duy Tân, On prime Cayley graphs, arXiv:2401.06062, to appear in Journal of Combinatorics (2024). +4. Thomas Honold, Characterization of finite frobenius rings, Archiv der Mathematik 76 (2001), no. 6, 406415 . +5. Irving Kaplansky, Elementary divisors and modules, Transactions of the American Mathematical Society 66 (1949), no. 2, 464-491. +6. 
Walter Klotz and Torsten Sander, Some properties of unitary Cayley graphs, The Electronic Journal of Combinatorics 14 (2007), no. 1, R45, 12 pages. +7. Erich Lamprecht, Allgemeine theorie der Gaußschen Summen in endlichen kommutativen Ringen, Mathematische Nachrichten 9 (1953), no. 3, 149-196. +8. Ján Mináč, Tung T Nguyen, and Nguyen Duy Tân, Isomorphic gcd-graphs over polynomial rings, arXiv preprint arXiv:2411.01768 (2024). +9. Ján Mináč, Tung T Nguyen, and Nguyen Duy Tân, On the gcd graphs over polynomial rings, arXiv preprint arXiv:2409.01929 (2024). \ No newline at end of file diff --git a/olmocr/bench/sample_data/olmocr_test/mathfuncs_1.md b/olmocr/bench/sample_data/olmocr_test/mathfuncs_1.md new file mode 100644 index 0000000..1d32fea --- /dev/null +++ b/olmocr/bench/sample_data/olmocr_test/mathfuncs_1.md @@ -0,0 +1,42 @@ +# The 20 Most Important Mathematical Equations + +A journey through the most elegant and influential formulas in mathematics + +--- + +**1. Euler's Identity** + +\[ e^{i\pi} + 1 = 0 \] + +Connects five fundamental constants (\(e, i, \pi, 1, 0\)), revealing the profound relationship between exponential functions and trigonometry. + +--- + +**2. Pythagorean Theorem** + +\[ a^2 + b^2 = c^2 \] + +In right triangles, the hypotenuse squared equals the sum of the squares of the other sides. Cornerstone of geometry with applications in navigation and architecture. + +--- + +**3. The Fundamental Theorem of Calculus** + +\[ \int_a^b f(x) \, dx = F(b) - F(a) \] + +Establishes that differentiation and integration are inverse operations. If \(F\) is an antiderivative of \(f\), the definite integral equals \(F(b) - F(a)\). Revolutionized mathematical problem-solving. + +--- + +**4.
Maxwell's Equations** + +\[ +\begin{align*} +\nabla \cdot \mathbf{E} &= \frac{Q}{\varepsilon_0} \\ +\nabla \cdot \mathbf{B} &= 0 \\ +\nabla \times \mathbf{E} &= -\frac{\partial \mathbf{B}}{\partial t} \\ +\nabla \times \mathbf{B} &= \mu_0 \mathbf{J} + \mu_0 \varepsilon_0 \frac{\partial \mathbf{E}}{\partial t} +\end{align*} +\] + +Unified electricity and magnetism as manifestations of the same force. Describes electromagnetic field behavior, predicting waves traveling at light speed. Enabled technologies from radio to smartphones. \ No newline at end of file diff --git a/olmocr/bench/sample_data/pdfs/math_2503_04086.pdf b/olmocr/bench/sample_data/pdfs/math_2503_04086.pdf new file mode 100644 index 0000000..ceaf665 Binary files /dev/null and b/olmocr/bench/sample_data/pdfs/math_2503_04086.pdf differ diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py index 05a092d..8ba2e75 100644 --- a/olmocr/bench/tests.py +++ b/olmocr/bench/tests.py @@ -11,8 +11,7 @@ from fuzzysearch import find_near_matches from rapidfuzz import fuzz from olmocr.repeatdetect import RepeatDetector -from .katex.render import render_equation -from .katex.compare import find_image_match +from .katex.render import render_equation, compare_rendered_equations class TestType(str, Enum): BASELINE = "baseline" @@ -510,14 +509,7 @@ class MathTest(BasePDFTest): if not hypothesis_render: continue - # Now, let's see what the matchup is between the two images - match_score, x, y = find_image_match(hypothesis_render, self.reference_render) - - if match_score > best_match_score: - best_match_score = match_score - best_match_render = hypothesis_render - - if match_score >= self.threshold: + if compare_rendered_equations(self.reference_render, hypothesis_render): return True, "" # self.reference_render.save(f"maths/{self.id}_ref.png", format="PNG") diff --git a/pyproject.toml b/pyproject.toml index 51dd33d..62639fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ bench = [
"sequence_align", "syntok", "google-genai", + "playwright", ] train = [