Fixes for math mining

This commit is contained in:
Jake Poznanski 2025-03-12 15:49:07 -07:00
parent 09fd299242
commit d0b9b5b7a8
3 changed files with 25 additions and 26 deletions

View File

@@ -12,6 +12,7 @@ Requirements:
""" """
import os import os
import re
import html import html
import hashlib import hashlib
import pathlib import pathlib
@@ -21,7 +22,8 @@ import shutil
from dataclasses import dataclass from dataclasses import dataclass
from typing import List from typing import List
import unittest import unittest
import xml.etree.ElementTree as ET import html.entities
from lxml import etree
from playwright.sync_api import sync_playwright, Error as PlaywrightError from playwright.sync_api import sync_playwright, Error as PlaywrightError
@@ -239,7 +241,7 @@ def render_equation(
# Build the result as a RenderedEquation dataclass # Build the result as a RenderedEquation dataclass
rendered_eq = RenderedEquation( rendered_eq = RenderedEquation(
mathml=html.unescape(mathml), mathml=mathml,
spans=[ spans=[
SpanInfo( SpanInfo(
text=s["text"], text=s["text"],
@@ -286,30 +288,26 @@ def compare_rendered_equations(reference: RenderedEquation, hypothesis: Rendered
for the hypothesis neighbor otherwise, the candidate must have the same text as the hypothesis neighbor. for the hypothesis neighbor otherwise, the candidate must have the same text as the hypothesis neighbor.
The algorithm uses backtracking to explore all possible assignments. The algorithm uses backtracking to explore all possible assignments.
""" """
import xml.etree.ElementTree as ET from bs4 import BeautifulSoup
import re
def strip_namespaces(elem: ET.Element) -> ET.Element:
for sub in elem.iter():
if '}' in sub.tag:
sub.tag = sub.tag.split('}', 1)[1]
return elem
def extract_inner(mathml: str) -> str: def extract_inner(mathml: str) -> str:
try: try:
root = ET.fromstring(mathml) # Use the "xml" parser so that BeautifulSoup parses MathML correctly,
root = strip_namespaces(root) # handling HTML entities along the way.
semantics = root.find('semantics') soup = BeautifulSoup(mathml, "xml")
if semantics is not None: semantics = soup.find("semantics")
inner_parts = [] if semantics:
for child in semantics: # Concatenate the string representation of all children except <annotation>
if child.tag != 'annotation': inner_parts = [
inner_parts.append(ET.tostring(child, encoding='unicode')) str(child)
for child in semantics.contents
if getattr(child, "name", None) != "annotation"
]
return ''.join(inner_parts) return ''.join(inner_parts)
else: else:
return ET.tostring(root, encoding='unicode') return str(soup)
except Exception as e: except Exception as e:
print("Error parsing MathML:", e) print("Error parsing MathML with BeautifulSoup:", e)
print(mathml) print(mathml)
return mathml return mathml

View File

@@ -12,6 +12,7 @@ import time
import io import io
import tarfile import tarfile
import requests import requests
from tqdm import tqdm
def download_and_extract_source(paper_id, data_dir): def download_and_extract_source(paper_id, data_dir):
source_url = f"https://export.arxiv.org/src/{paper_id}" source_url = f"https://export.arxiv.org/src/{paper_id}"
@@ -97,7 +98,7 @@ def main():
print(f"Found {len(paper_ids)} papers.") print(f"Found {len(paper_ids)} papers.")
# For each paper, only keep the files if both the tex extraction and pdf download succeed. # For each paper, only keep the files if both the tex extraction and pdf download succeed.
for paper_id in paper_ids: for paper_id in tqdm(paper_ids):
tex_success = download_and_extract_source(paper_id, args.data_dir) tex_success = download_and_extract_source(paper_id, args.data_dir)
if not tex_success: if not tex_success:
print(f"Skipping PDF download for {paper_id} because tex extraction failed.") print(f"Skipping PDF download for {paper_id} because tex extraction failed.")

View File

@@ -320,7 +320,6 @@ def main():
) )
parser.add_argument("--math_data", required=True, help="Path to math_data folder") parser.add_argument("--math_data", required=True, help="Path to math_data folder")
parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data") parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data")
parser.add_argument("--output_file", default="math_tests.jsonl", help="Output file for math tests in JSONL format")
parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document") parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document")
parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers") parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers")
parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text") parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text")
@@ -338,8 +337,9 @@ def main():
logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered)) logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered))
# Remove output file if it exists to start fresh # Remove output file if it exists to start fresh
if os.path.exists(args.output_file): output_file = os.path.join(args.math_data, "math_tests.jsonl")
os.remove(args.output_file) if os.path.exists(output_file):
os.remove(output_file)
all_math_tests = [] all_math_tests = []
@@ -355,12 +355,12 @@ def main():
tests = future.result() tests = future.result()
all_math_tests.extend(tests) all_math_tests.extend(tests)
# Incrementally save tests as each candidate file finishes processing. # Incrementally save tests as each candidate file finishes processing.
save_tests(all_math_tests, args.output_file) save_tests(all_math_tests, output_file)
except Exception as e: except Exception as e:
logging.error("Error processing %s: %s", candidate_file, e) logging.error("Error processing %s: %s", candidate_file, e)
logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered)) logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered))
logging.info("Results incrementally saved to %s", args.output_file) logging.info("Results incrementally saved to %s", output_file)
if __name__ == "__main__": if __name__ == "__main__":