mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-14 01:19:29 +00:00
Fix for some math equations stuff
This commit is contained in:
parent
d36357f3db
commit
5c6225b227
@ -875,14 +875,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
|||||||
if num_order_tests > 5:
|
if num_order_tests > 5:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Step 4: Generate Math tests for LaTeX equations
|
# Step 4: Generate Math tests for LaTeX equations from the markdown
|
||||||
# Get only the body content as a string to search for math patterns
|
|
||||||
body = soup.find('body')
|
|
||||||
if body:
|
|
||||||
body_html = str(body)
|
|
||||||
else:
|
|
||||||
# If no body tag, use the whole soup
|
|
||||||
body_html = str(soup)
|
|
||||||
|
|
||||||
# Define math patterns to search for
|
# Define math patterns to search for
|
||||||
math_patterns = [
|
math_patterns = [
|
||||||
@ -893,7 +886,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
|
|||||||
|
|
||||||
math_equations = []
|
math_equations = []
|
||||||
for pattern, flags in math_patterns:
|
for pattern, flags in math_patterns:
|
||||||
matches = re.findall(pattern, body_html, flags)
|
matches = re.findall(pattern, markdown_content, flags)
|
||||||
for match in matches:
|
for match in matches:
|
||||||
# Clean up the match - remove extra whitespace and newlines
|
# Clean up the match - remove extra whitespace and newlines
|
||||||
equation = match.strip()
|
equation = match.strip()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user