mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-03 03:25:22 +00:00
Fix for some math equations stuff
This commit is contained in:
parent
d36357f3db
commit
5c6225b227
@ -875,14 +875,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
||||
if num_order_tests > 5:
|
||||
break
|
||||
|
||||
# Step 4: Generate Math tests for LaTeX equations
|
||||
# Get only the body content as a string to search for math patterns
|
||||
body = soup.find('body')
|
||||
if body:
|
||||
body_html = str(body)
|
||||
else:
|
||||
# If no body tag, use the whole soup
|
||||
body_html = str(soup)
|
||||
# Step 4: Generate Math tests for LaTeX equations from the markdown
|
||||
|
||||
# Define math patterns to search for
|
||||
math_patterns = [
|
||||
@ -893,7 +886,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
||||
|
||||
math_equations = []
|
||||
for pattern, flags in math_patterns:
|
||||
matches = re.findall(pattern, body_html, flags)
|
||||
matches = re.findall(pattern, markdown_content, flags)
|
||||
for match in matches:
|
||||
# Clean up the match - remove extra whitespace and newlines
|
||||
equation = match.strip()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user