mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-27 09:27:55 +00:00
Fixes for math mining
This commit is contained in:
parent
09fd299242
commit
d0b9b5b7a8
@ -12,6 +12,7 @@ Requirements:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import html
|
import html
|
||||||
import hashlib
|
import hashlib
|
||||||
import pathlib
|
import pathlib
|
||||||
@ -21,7 +22,8 @@ import shutil
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List
|
from typing import List
|
||||||
import unittest
|
import unittest
|
||||||
import xml.etree.ElementTree as ET
|
import html.entities
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
from playwright.sync_api import sync_playwright, Error as PlaywrightError
|
from playwright.sync_api import sync_playwright, Error as PlaywrightError
|
||||||
|
|
||||||
@ -239,7 +241,7 @@ def render_equation(
|
|||||||
|
|
||||||
# Build the result as a RenderedEquation dataclass
|
# Build the result as a RenderedEquation dataclass
|
||||||
rendered_eq = RenderedEquation(
|
rendered_eq = RenderedEquation(
|
||||||
mathml=html.unescape(mathml),
|
mathml=mathml,
|
||||||
spans=[
|
spans=[
|
||||||
SpanInfo(
|
SpanInfo(
|
||||||
text=s["text"],
|
text=s["text"],
|
||||||
@ -286,30 +288,26 @@ def compare_rendered_equations(reference: RenderedEquation, hypothesis: Rendered
|
|||||||
for the hypothesis neighbor – otherwise, the candidate must have the same text as the hypothesis neighbor.
|
for the hypothesis neighbor – otherwise, the candidate must have the same text as the hypothesis neighbor.
|
||||||
The algorithm uses backtracking to explore all possible assignments.
|
The algorithm uses backtracking to explore all possible assignments.
|
||||||
"""
|
"""
|
||||||
import xml.etree.ElementTree as ET
|
from bs4 import BeautifulSoup
|
||||||
import re
|
|
||||||
|
|
||||||
def strip_namespaces(elem: ET.Element) -> ET.Element:
|
|
||||||
for sub in elem.iter():
|
|
||||||
if '}' in sub.tag:
|
|
||||||
sub.tag = sub.tag.split('}', 1)[1]
|
|
||||||
return elem
|
|
||||||
|
|
||||||
def extract_inner(mathml: str) -> str:
|
def extract_inner(mathml: str) -> str:
|
||||||
try:
|
try:
|
||||||
root = ET.fromstring(mathml)
|
# Use the "xml" parser so that BeautifulSoup parses MathML correctly,
|
||||||
root = strip_namespaces(root)
|
# handling HTML entities along the way.
|
||||||
semantics = root.find('semantics')
|
soup = BeautifulSoup(mathml, "xml")
|
||||||
if semantics is not None:
|
semantics = soup.find("semantics")
|
||||||
inner_parts = []
|
if semantics:
|
||||||
for child in semantics:
|
# Concatenate the string representation of all children except <annotation>
|
||||||
if child.tag != 'annotation':
|
inner_parts = [
|
||||||
inner_parts.append(ET.tostring(child, encoding='unicode'))
|
str(child)
|
||||||
|
for child in semantics.contents
|
||||||
|
if getattr(child, "name", None) != "annotation"
|
||||||
|
]
|
||||||
return ''.join(inner_parts)
|
return ''.join(inner_parts)
|
||||||
else:
|
else:
|
||||||
return ET.tostring(root, encoding='unicode')
|
return str(soup)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Error parsing MathML:", e)
|
print("Error parsing MathML with BeautifulSoup:", e)
|
||||||
print(mathml)
|
print(mathml)
|
||||||
return mathml
|
return mathml
|
||||||
|
|
||||||
|
@ -12,6 +12,7 @@ import time
|
|||||||
import io
|
import io
|
||||||
import tarfile
|
import tarfile
|
||||||
import requests
|
import requests
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
def download_and_extract_source(paper_id, data_dir):
|
def download_and_extract_source(paper_id, data_dir):
|
||||||
source_url = f"https://export.arxiv.org/src/{paper_id}"
|
source_url = f"https://export.arxiv.org/src/{paper_id}"
|
||||||
@ -97,7 +98,7 @@ def main():
|
|||||||
print(f"Found {len(paper_ids)} papers.")
|
print(f"Found {len(paper_ids)} papers.")
|
||||||
|
|
||||||
# For each paper, only keep the files if both the tex extraction and pdf download succeed.
|
# For each paper, only keep the files if both the tex extraction and pdf download succeed.
|
||||||
for paper_id in paper_ids:
|
for paper_id in tqdm(paper_ids):
|
||||||
tex_success = download_and_extract_source(paper_id, args.data_dir)
|
tex_success = download_and_extract_source(paper_id, args.data_dir)
|
||||||
if not tex_success:
|
if not tex_success:
|
||||||
print(f"Skipping PDF download for {paper_id} because tex extraction failed.")
|
print(f"Skipping PDF download for {paper_id} because tex extraction failed.")
|
||||||
|
@ -320,7 +320,6 @@ def main():
|
|||||||
)
|
)
|
||||||
parser.add_argument("--math_data", required=True, help="Path to math_data folder")
|
parser.add_argument("--math_data", required=True, help="Path to math_data folder")
|
||||||
parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data")
|
parser.add_argument("--candidate", required=True, help="Candidate folder name inside math_data")
|
||||||
parser.add_argument("--output_file", default="math_tests.jsonl", help="Output file for math tests in JSONL format")
|
|
||||||
parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document")
|
parser.add_argument("--max_pages", type=int, default=3, help="Maximum distinct pages to process per TeX document")
|
||||||
parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers")
|
parser.add_argument("--parallel", type=int, default=8, help="Maximum process pool workers")
|
||||||
parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text")
|
parser.add_argument("--sim_threshold", type=float, default=0.7, help="Similarity threshold for matching candidate text")
|
||||||
@ -338,8 +337,9 @@ def main():
|
|||||||
logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered))
|
logging.info("After filtering, %d candidate files will be processed.", len(candidate_files_filtered))
|
||||||
|
|
||||||
# Remove output file if it exists to start fresh
|
# Remove output file if it exists to start fresh
|
||||||
if os.path.exists(args.output_file):
|
output_file = os.path.join(args.math_data, "math_tests.jsonl")
|
||||||
os.remove(args.output_file)
|
if os.path.exists(output_file):
|
||||||
|
os.remove(output_file)
|
||||||
|
|
||||||
all_math_tests = []
|
all_math_tests = []
|
||||||
|
|
||||||
@ -355,12 +355,12 @@ def main():
|
|||||||
tests = future.result()
|
tests = future.result()
|
||||||
all_math_tests.extend(tests)
|
all_math_tests.extend(tests)
|
||||||
# Incrementally save tests as each candidate file finishes processing.
|
# Incrementally save tests as each candidate file finishes processing.
|
||||||
save_tests(all_math_tests, args.output_file)
|
save_tests(all_math_tests, output_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error("Error processing %s: %s", candidate_file, e)
|
logging.error("Error processing %s: %s", candidate_file, e)
|
||||||
|
|
||||||
logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered))
|
logging.info("Found %d valid math equations from %d candidate files.", len(all_math_tests), len(candidate_files_filtered))
|
||||||
logging.info("Results incrementally saved to %s", args.output_file)
|
logging.info("Results incrementally saved to %s", output_file)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
x
Reference in New Issue
Block a user