mirror of https://github.com/allenai/olmocr.git
synced 2025-09-25 16:30:28 +00:00
Fixes for math mining
This commit is contained in:
parent 09fd299242
commit d0b9b5b7a8
@@ -12,6 +12,7 @@ Requirements:
 """
 
 import os
+import re
 import html
 import hashlib
 import pathlib
@@ -21,7 +22,8 @@ import shutil
 from dataclasses import dataclass
 from typing import List
 import unittest
 import xml.etree.ElementTree as ET
+import html.entities
 from lxml import etree
 
 from playwright.sync_api import sync_playwright, Error as PlaywrightError
@@ -239,7 +241,7 @@ def render_equation(
 
     # Build the result as a RenderedEquation dataclass
     rendered_eq = RenderedEquation(
-        mathml=html.unescape(mathml),
+        mathml=mathml,
         spans=[
             SpanInfo(
                 text=s["text"],
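
Note on the change above: calling html.unescape on serialized MathML turns entity references such as &lt; into literal characters, which can leave the string un-parseable as XML later on. A minimal standalone illustration (not part of the commit):

import html
import xml.etree.ElementTree as ET

mathml = "<math><mo>&lt;</mo></math>"  # MathML encoding the "<" operator

unescaped = html.unescape(mathml)
print(unescaped)  # <math><mo><</mo></math> -- no longer well-formed XML

try:
    ET.fromstring(unescaped)
except ET.ParseError as e:
    print("parse fails:", e)

Keeping the MathML as-is preserves the entities for the comparison step below.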
@@ -286,30 +288,26 @@ def compare_rendered_equations(reference: RenderedEquation, hypothesis: RenderedEquation
     for the hypothesis neighbor – otherwise, the candidate must have the same text as the hypothesis neighbor.
     The algorithm uses backtracking to explore all possible assignments.
     """
-    import xml.etree.ElementTree as ET
-    import re
-
-    def strip_namespaces(elem: ET.Element) -> ET.Element:
-        for sub in elem.iter():
-            if '}' in sub.tag:
-                sub.tag = sub.tag.split('}', 1)[1]
-        return elem
+    from bs4 import BeautifulSoup
 
     def extract_inner(mathml: str) -> str:
         try:
-            root = ET.fromstring(mathml)
-            root = strip_namespaces(root)
-            semantics = root.find('semantics')
-            if semantics is not None:
-                inner_parts = []
-                for child in semantics:
-                    if child.tag != 'annotation':
-                        inner_parts.append(ET.tostring(child, encoding='unicode'))
+            # Use the "xml" parser so that BeautifulSoup parses MathML correctly,
+            # handling HTML entities along the way.
+            soup = BeautifulSoup(mathml, "xml")
+            semantics = soup.find("semantics")
+            if semantics:
+                # Concatenate the string representation of all children except <annotation>
+                inner_parts = [
+                    str(child)
+                    for child in semantics.contents
+                    if getattr(child, "name", None) != "annotation"
+                ]
                 return ''.join(inner_parts)
             else:
-                return ET.tostring(root, encoding='unicode')
+                return str(soup)
         except Exception as e:
-            print("Error parsing MathML:", e)
+            print("Error parsing MathML with BeautifulSoup:", e)
             print(mathml)
             return mathml
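The rewritten extract_inner above can be exercised on its own; a minimal sketch, with an illustrative MathML string (not taken from the repo):

from bs4 import BeautifulSoup

sample = (
    '<math xmlns="http://www.w3.org/1998/Math/MathML"><semantics>'
    "<mrow><mi>x</mi><mo>&lt;</mo><mn>2</mn></mrow>"
    '<annotation encoding="application/x-tex">x &lt; 2</annotation>'
    "</semantics></math>"
)

soup = BeautifulSoup(sample, "xml")  # the "xml" parser needs lxml installed
semantics = soup.find("semantics")
inner = "".join(
    str(child)
    for child in semantics.contents
    if getattr(child, "name", None) != "annotation"
)
print(inner)  # the <mrow>...</mrow> markup, with the <annotation> element dropped

Per the comment in the commit, this parse-and-reserialize path is meant to handle HTML entities on its own, which is why the separate html.unescape pass in render_equation became unnecessary.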
@@ -12,6 +12,7 @@ import time
 import io
 import tarfile
 import requests
+from tqdm import tqdm
 
 def download_and_extract_source(paper_id, data_dir):
     source_url = f"https://export.arxiv.org/src/{paper_id}"
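
The helper's body is not shown in this diff; a plausible sketch of what it does, given the imports and the arXiv source URL (the details here are assumptions, not the actual implementation):

import io
import os
import tarfile

import requests

def download_and_extract_source(paper_id, data_dir):
    source_url = f"https://export.arxiv.org/src/{paper_id}"
    resp = requests.get(source_url, timeout=60)
    if resp.status_code != 200:
        return False
    try:
        # arXiv source downloads are typically gzipped tarballs
        with tarfile.open(fileobj=io.BytesIO(resp.content), mode="r:*") as tar:
            tar.extractall(os.path.join(data_dir, paper_id))
        return True
    except tarfile.TarError:
        return False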
@@ -97,7 +98,7 @@ def main():
     print(f"Found {len(paper_ids)} papers.")
 
     # For each paper, only keep the files if both the tex extraction and pdf download succeed.
-    for paper_id in paper_ids:
+    for paper_id in tqdm(paper_ids):
         tex_success = download_and_extract_source(paper_id, args.data_dir)
         if not tex_success:
             print(f"Skipping PDF download for {paper_id} because tex extraction failed.")
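The tqdm change above only affects progress reporting: wrapping any iterable in tqdm(...) yields its items unchanged while drawing a progress bar on stderr, e.g.:

from tqdm import tqdm

# the paper ids below are placeholders
for paper_id in tqdm(["2401.00001", "2401.00002"], desc="papers"):
    pass  # per-paper work goes here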
@@ -320,7 +320,6 @@ def main():
     )
     parser.add_argument("--math_data", required=True, help="Path to math_data folder")
     parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data")
-    parser.add_argument("--output_file", default="math_tests.jsonl", help="Output file for math tests in JSONL format")
     parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document")
     parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers")
     parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text")
@@ -338,8 +337,9 @@ def main():
     logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered))
 
     # Remove output file if it exists to start fresh
-    if os.path.exists(args.output_file):
-        os.remove(args.output_file)
+    output_file = os.path.join(args.math_data, "math_tests.jsonl")
+    if os.path.exists(output_file):
+        os.remove(output_file)
 
     all_math_tests = []
@@ -355,12 +355,12 @@ def main():
             tests = future.result()
             all_math_tests.extend(tests)
             # Incrementally save tests as each candidate file finishes processing.
-            save_tests(all_math_tests, args.output_file)
+            save_tests(all_math_tests, output_file)
         except Exception as e:
             logging.error("Error processing %s: %s", candidate_file, e)
 
     logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered))
-    logging.info("Results incrementally saved to %s", args.output_file)
+    logging.info("Results incrementally saved to %s", output_file)
 
 
 if __name__ == "__main__":
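
save_tests itself is not shown in this commit; given that it is called with the full accumulated list after each future completes, a plausible sketch (an assumption, not the actual implementation):

import json

def save_tests(tests, path):
    # Rewrite the whole JSONL file from the accumulated list, so an
    # interrupted run still leaves a valid, up-to-date output file.
    with open(path, "w") as f:
        for test in tests:
            f.write(json.dumps(test) + "\n")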