Fixes for math mining

This commit is contained in:
Jake Poznanski 2025-03-12 15:49:07 -07:00
parent 09fd299242
commit d0b9b5b7a8
3 changed files with 25 additions and 26 deletions

View File

@@ -12,6 +12,7 @@ Requirements:
"""
import os
import re
import html
import hashlib
import pathlib
@@ -21,7 +22,8 @@ import shutil
from dataclasses import dataclass
from typing import List
import unittest
import xml.etree.ElementTree as ET
import html.entities
from lxml import etree
from playwright.sync_api import sync_playwright, Error as PlaywrightError
@@ -239,7 +241,7 @@ def render_equation(
# Build the result as a RenderedEquation dataclass
rendered_eq = RenderedEquation(
mathml=html.unescape(mathml),
mathml=mathml,
spans=[
SpanInfo(
text=s["text"],
@@ -286,30 +288,26 @@ def compare_rendered_equations(reference: RenderedEquation, hypothesis: Rendered
for the hypothesis neighbor otherwise, the candidate must have the same text as the hypothesis neighbor.
The algorithm uses backtracking to explore all possible assignments.
"""
import xml.etree.ElementTree as ET
import re
def strip_namespaces(elem: ET.Element) -> ET.Element:
for sub in elem.iter():
if '}' in sub.tag:
sub.tag = sub.tag.split('}', 1)[1]
return elem
from bs4 import BeautifulSoup
def extract_inner(mathml: str) -> str:
try:
root = ET.fromstring(mathml)
root = strip_namespaces(root)
semantics = root.find('semantics')
if semantics is not None:
inner_parts = []
for child in semantics:
if child.tag != 'annotation':
inner_parts.append(ET.tostring(child, encoding='unicode'))
# Use the "xml" parser so that BeautifulSoup parses MathML correctly,
# handling HTML entities along the way.
soup = BeautifulSoup(mathml, "xml")
semantics = soup.find("semantics")
if semantics:
# Concatenate the string representation of all children except <annotation>
inner_parts = [
str(child)
for child in semantics.contents
if getattr(child, "name", None) != "annotation"
]
return ''.join(inner_parts)
else:
return ET.tostring(root, encoding='unicode')
return str(soup)
except Exception as e:
print("Error parsing MathML:", e)
print("Error parsing MathML with BeautifulSoup:", e)
print(mathml)
return mathml

View File

@@ -12,6 +12,7 @@ import time
import io
import tarfile
import requests
from tqdm import tqdm
def download_and_extract_source(paper_id, data_dir):
source_url = f"https://export.arxiv.org/src/{paper_id}"
@@ -97,7 +98,7 @@ def main():
print(f"Found {len(paper_ids)} papers.")
# For each paper, only keep the files if both the tex extraction and pdf download succeed.
for paper_id in paper_ids:
for paper_id in tqdm(paper_ids):
tex_success = download_and_extract_source(paper_id, args.data_dir)
if not tex_success:
print(f"Skipping PDF download for {paper_id} because tex extraction failed.")

View File

@@ -320,7 +320,6 @@ def main():
)
parser.add_argument("--math_data", required=True, help="Path to math_data folder")
parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data")
parser.add_argument("--output_file", default="math_tests.jsonl", help="Output file for math tests in JSONL format")
parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document")
parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers")
parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text")
@@ -338,8 +337,9 @@ def main():
logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered))
# Remove output file if it exists to start fresh
if os.path.exists(args.output_file):
os.remove(args.output_file)
output_file = os.path.join(args.math_data, "math_tests.jsonl")
if os.path.exists(output_file):
os.remove(output_file)
all_math_tests = []
@@ -355,12 +355,12 @@ def main():
tests = future.result()
all_math_tests.extend(tests)
# Incrementally save tests as each candidate file finishes processing.
save_tests(all_math_tests, args.output_file)
save_tests(all_math_tests, output_file)
except Exception as e:
logging.error("Error processing %s: %s", candidate_file, e)
logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered))
logging.info("Results incrementally saved to %s", args.output_file)
logging.info("Results incrementally saved to %s", output_file)
if __name__ == "__main__":