diff --git a/olmocr/bench/katex/render.py b/olmocr/bench/katex/render.py index d1bab5e..9da9a3e 100644 --- a/olmocr/bench/katex/render.py +++ b/olmocr/bench/katex/render.py @@ -12,6 +12,7 @@ Requirements: """ import os +import re import html import hashlib import pathlib @@ -21,7 +22,8 @@ import shutil from dataclasses import dataclass from typing import List import unittest -import xml.etree.ElementTree as ET +import html.entities +from lxml import etree from playwright.sync_api import sync_playwright, Error as PlaywrightError @@ -239,7 +241,7 @@ def render_equation( # Build the result as a RenderedEquation dataclass rendered_eq = RenderedEquation( - mathml=html.unescape(mathml), + mathml=mathml, spans=[ SpanInfo( text=s["text"], @@ -286,30 +288,26 @@ def compare_rendered_equations(reference: RenderedEquation, hypothesis: Rendered for the hypothesis neighbor – otherwise, the candidate must have the same text as the hypothesis neighbor. The algorithm uses backtracking to explore all possible assignments. """ - import xml.etree.ElementTree as ET - import re - - def strip_namespaces(elem: ET.Element) -> ET.Element: - for sub in elem.iter(): - if '}' in sub.tag: - sub.tag = sub.tag.split('}', 1)[1] - return elem + from bs4 import BeautifulSoup def extract_inner(mathml: str) -> str: try: - root = ET.fromstring(mathml) - root = strip_namespaces(root) - semantics = root.find('semantics') - if semantics is not None: - inner_parts = [] - for child in semantics: - if child.tag != 'annotation': - inner_parts.append(ET.tostring(child, encoding='unicode')) + # Use the "xml" parser so that BeautifulSoup parses MathML correctly, + # handling HTML entities along the way. 
+ soup = BeautifulSoup(mathml, "xml") + semantics = soup.find("semantics") + if semantics: + # Concatenate the string representation of all children except annotation elements + inner_parts = [ + str(child) + for child in semantics.contents + if getattr(child, "name", None) != "annotation" + ] return ''.join(inner_parts) else: - return ET.tostring(root, encoding='unicode') + return str(soup) except Exception as e: - print("Error parsing MathML:", e) + print("Error parsing MathML with BeautifulSoup:", e) print(mathml) return mathml diff --git a/olmocr/bench/miners/download_math.py b/olmocr/bench/miners/download_math.py index 6c4a722..9a84a19 100644 --- a/olmocr/bench/miners/download_math.py +++ b/olmocr/bench/miners/download_math.py @@ -12,6 +12,7 @@ import time import io import tarfile import requests +from tqdm import tqdm def download_and_extract_source(paper_id, data_dir): source_url = f"https://export.arxiv.org/src/{paper_id}" @@ -97,7 +98,7 @@ def main(): print(f"Found {len(paper_ids)} papers.") # For each paper, only keep the files if both the tex extraction and pdf download succeed. 
- for paper_id in paper_ids: + for paper_id in tqdm(paper_ids): tex_success = download_and_extract_source(paper_id, args.data_dir) if not tex_success: print(f"Skipping PDF download for {paper_id} because tex extraction failed.") diff --git a/olmocr/bench/miners/mine_math.py b/olmocr/bench/miners/mine_math.py index 8f2b102..6b2eb5b 100644 --- a/olmocr/bench/miners/mine_math.py +++ b/olmocr/bench/miners/mine_math.py @@ -320,7 +320,6 @@ def main(): ) parser.add_argument("--math_data", required=True, help="Path to math_data folder") parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data") - parser.add_argument("--output_file", default="math_tests.jsonl", help="Output file for math tests in JSONL format") parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document") parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers") parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text") @@ -338,8 +337,9 @@ def main(): logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered)) # Remove output file if it exists to start fresh - if os.path.exists(args.output_file): - os.remove(args.output_file) + output_file = os.path.join(args.math_data, "math_tests.jsonl") + if os.path.exists(output_file): + os.remove(output_file) all_math_tests = [] @@ -355,12 +355,12 @@ def main(): tests = future.result() all_math_tests.extend(tests) # Incrementally save tests as each candidate file finishes processing. 
- save_tests(all_math_tests, args.output_file) + save_tests(all_math_tests, output_file) except Exception as e: logging.error("Error processing %s: %s", candidate_file, e) logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered)) - logging.info("Results incrementally saved to %s", args.output_file) + logging.info("Results incrementally saved to %s", output_file) if __name__ == "__main__":