Fixes for math mining

2025-11-25 14:52:56 +00:00 · 2025-03-12 15:49:07 -07:00 · 2025-03-12 15:49:07 -07:00 · d0b9b5b7a8
commit d0b9b5b7a8
parent 09fd299242
3 changed files with 25 additions and 26 deletions
--- a/olmocr/bench/katex/render.py
+++ b/olmocr/bench/katex/render.py
@@ -12,6 +12,7 @@ Requirements:
 """

 import os
+import re
 import html
 import hashlib
 import pathlib
@@ -21,7 +22,8 @@ import shutil
 from dataclasses import dataclass
 from typing import List
 import unittest
-import xml.etree.ElementTree as ET
+import html.entities
+from lxml import etree

 from playwright.sync_api import sync_playwright, Error as PlaywrightError

@@ -239,7 +241,7 @@ def render_equation(
        
        # Build the result as a RenderedEquation dataclass
        rendered_eq = RenderedEquation(
-            mathml=html.unescape(mathml),
+            mathml=mathml,
            spans=[
                SpanInfo(
                    text=s["text"],
@@ -286,30 +288,26 @@ def compare_rendered_equations(reference: RenderedEquation, hypothesis: Rendered
    for the hypothesis neighbor – otherwise, the candidate must have the same text as the hypothesis neighbor.
    The algorithm uses backtracking to explore all possible assignments.
    """
-    import xml.etree.ElementTree as ET
-    import re
-
-    def strip_namespaces(elem: ET.Element) -> ET.Element:
-        for sub in elem.iter():
-            if '}' in sub.tag:
-                sub.tag = sub.tag.split('}', 1)[1]
-        return elem
+    from bs4 import BeautifulSoup

    def extract_inner(mathml: str) -> str:
        try:
-            root = ET.fromstring(mathml)
-            root = strip_namespaces(root)
-            semantics = root.find('semantics')
-            if semantics is not None:
-                inner_parts = []
-                for child in semantics:
-                    if child.tag != 'annotation':
-                        inner_parts.append(ET.tostring(child, encoding='unicode'))
+            # Use the "xml" parser so that BeautifulSoup parses MathML correctly,
+            # handling HTML entities along the way.
+            soup = BeautifulSoup(mathml, "xml")
+            semantics = soup.find("semantics")
+            if semantics:
+                # Concatenate the string representation of all children except <annotation>
+                inner_parts = [
+                    str(child)
+                    for child in semantics.contents
+                    if getattr(child, "name", None) != "annotation"
+                ]
                return ''.join(inner_parts)
            else:
-                return ET.tostring(root, encoding='unicode')
+                return str(soup)
        except Exception as e:
-            print("Error parsing MathML:", e)
+            print("Error parsing MathML with BeautifulSoup:", e)
            print(mathml)
            return mathml

--- a/olmocr/bench/miners/download_math.py
+++ b/olmocr/bench/miners/download_math.py
@@ -12,6 +12,7 @@ import time
 import io
 import tarfile
 import requests
+from tqdm import tqdm

 def download_and_extract_source(paper_id, data_dir):
    source_url = f"https://export.arxiv.org/src/{paper_id}"
@@ -97,7 +98,7 @@ def main():
    print(f"Found {len(paper_ids)} papers.")

    # For each paper, only keep the files if both the tex extraction and pdf download succeed.
-    for paper_id in paper_ids:
+    for paper_id in tqdm(paper_ids):
        tex_success = download_and_extract_source(paper_id, args.data_dir)
        if not tex_success:
            print(f"Skipping PDF download for {paper_id} because tex extraction failed.")
--- a/olmocr/bench/miners/mine_math.py
+++ b/olmocr/bench/miners/mine_math.py
@@ -320,7 +320,6 @@ def main():
    )
    parser.add_argument("--math_data", required=True, help="Path to math_data folder")
    parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data")
-    parser.add_argument("--output_file", default="math_tests.jsonl", help="Output file for math tests in JSONL format")
    parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document")
    parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers")
    parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text")
@@ -338,8 +337,9 @@ def main():
    logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered))
    
    # Remove output file if it exists to start fresh
-    if os.path.exists(args.output_file):
-        os.remove(args.output_file)
+    output_file = os.path.join(args.math_data, "math_tests.jsonl")
+    if os.path.exists(output_file):
+        os.remove(output_file)
    
    all_math_tests = []
    
@@ -355,12 +355,12 @@ def main():
                tests = future.result()
                all_math_tests.extend(tests)
                # Incrementally save tests as each candidate file finishes processing.
-                save_tests(all_math_tests, args.output_file)
+                save_tests(all_math_tests, output_file)
            except Exception as e:
                logging.error("Error processing %s: %s", candidate_file, e)
    
    logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered))
-    logging.info("Results incrementally saved to %s", args.output_file)
+    logging.info("Results incrementally saved to %s", output_file)


 if __name__ == "__main__":