"""Unit tests for the helpers imported from olmocr.bench.synth.mine_html_templates.

Covers math-test generation (generate_tests_from_html), page metadata
extraction (extract_html_metadata) and markdown conversion
(html_to_markdown_with_frontmatter).
"""

import re
import unittest
from unittest.mock import MagicMock, patch

from bs4 import BeautifulSoup

from olmocr.bench.synth.mine_html_templates import (
    generate_tests_from_html,
    html_to_markdown_with_frontmatter,
    extract_html_metadata,
    PreserveTablesConverter,
)
from olmocr.bench.tests import TestType

# NOTE(review): the HTML fixtures below contain no visible markup (e.g. no
# <html lang="...">, <table>, <header> or image elements) even though several
# assertions depend on such markup -- presumably the tags were lost somewhere;
# confirm against the original fixtures before relying on these tests.


class TestMathExtraction(unittest.TestCase):
    """Test the math extraction functionality in mine_html_templates.py"""

    @staticmethod
    def _math_tests(html_content):
        """Return the generated tests of type "math" for page 1 of "test_pdf"."""
        tests = generate_tests_from_html(html_content, "test_pdf", 1)
        return [t for t in tests if t.get("type") == "math"]

    def test_math_extraction_from_html(self):
        """Test that math equations are properly extracted from HTML content"""
        html_content = """

Some text with inline math \\(x = 2\\) here.

Display math: \\[E = mc^2\\]

Another inline: \\(\\alpha + \\beta = \\gamma\\)

Complex display: \\[\\int_0^\\infty e^{-x} dx = 1\\]

"""
        # Generate tests from HTML and keep only the math ones
        math_tests = self._math_tests(html_content)

        # Check that we extracted math equations
        self.assertGreater(len(math_tests), 0, "Should extract at least one math equation")

        # Check that specific equations were extracted
        math_contents = [t["math"] for t in math_tests]
        self.assertIn("x = 2", math_contents)
        self.assertIn("E = mc^2", math_contents)
        self.assertIn("\\alpha + \\beta = \\gamma", math_contents)
        self.assertIn("\\int_0^\\infty e^{-x} dx = 1", math_contents)

    def test_math_extraction_with_multiline(self):
        """Test extraction of multiline math equations"""
        html_content = """

Multiline equation: \\[ e_i = \\frac{e_i + \\varphi(e_i)}{2} + \\frac{e_i - \\varphi(e_i)}{2}, \\quad \\text{for } i \\in \\mathbb{N}. \\]

"""
        math_tests = self._math_tests(html_content)

        # Check multiline equation is captured
        self.assertGreater(len(math_tests), 0)

        # Check that the multiline content is preserved (without excessive newlines)
        found_multiline = any(
            "\\frac{e_i + \\varphi(e_i)}{2}" in test["math"] and "\\mathbb{N}" in test["math"]
            for test in math_tests
        )
        self.assertTrue(found_multiline, "Should extract multiline equation correctly")

    def test_math_extraction_deduplication(self):
        """Test that duplicate math equations are deduplicated"""
        html_content = """

First occurrence: \\[x^2 + y^2 = z^2\\]

Second occurrence: \\[x^2 + y^2 = z^2\\]

Third occurrence: \\[x^2 + y^2 = z^2\\]

"""
        math_tests = self._math_tests(html_content)

        # Count how many times the equation appears
        equation_count = sum(1 for t in math_tests if "x^2 + y^2 = z^2" in t["math"])

        # Should only appear once due to deduplication
        self.assertEqual(equation_count, 1, "Duplicate equations should be deduplicated")

    def test_math_extraction_patterns(self):
        """Test different math delimiter patterns"""
        html_content = """

Pattern 1: \\(inline1\\)

Pattern 2: \\[display1\\]

Pattern 3: $$display2$$

"""
        math_contents = [t["math"] for t in self._math_tests(html_content)]

        # Check all patterns are captured
        self.assertIn("inline1", math_contents)
        self.assertIn("display1", math_contents)
        self.assertIn("display2", math_contents)

    def test_math_extraction_minimum_length(self):
        """Test that very short equations are filtered out"""
        html_content = """

Short: \\(x\\)

Also short: \\[y\\]

Long enough: \\(x=1\\)

"""
        math_contents = [t["math"] for t in self._math_tests(html_content)]

        # Short equations (length <= 2) should be filtered out
        self.assertNotIn("x", math_contents)
        self.assertNotIn("y", math_contents)

        # Longer equation should be included
        self.assertIn("x=1", math_contents)

    def test_math_validation_passes(self):
        """Test that valid math tests pass validation against markdown"""
        html_content = """

Test equation: \\[E = mc^2\\]

"""
        # Mock the validation to always pass for math tests
        with patch('olmocr.bench.synth.mine_html_templates.load_single_test') as mock_load:
            mock_test = MagicMock()
            mock_test.run.return_value = (True, None)
            mock_load.return_value = mock_test

            math_tests = self._math_tests(html_content)

            # Verify math test was created
            self.assertGreater(len(math_tests), 0)

            # Verify test has correct structure
            for test in math_tests:
                self.assertEqual(test["type"], "math")
                self.assertIn("math", test)
                self.assertEqual(test["max_diffs"], 0)
                self.assertIn("id", test)
                self.assertIn("pdf", test)
                self.assertEqual(test["page"], 1)

    def test_complex_markdown_example(self):
        """Test with the complex markdown example provided by the user"""
        # Convert markdown to HTML-like structure for testing.
        # The literal spans several physical lines, so it has to be triple-quoted;
        # its text is otherwise left exactly as it was.
        html_content = '''\n\n\n \n \n Automorphisms of Order Two\n \n \n \n \n\n\n
\n

If \\(\\varphi \\in \\text{Aut}(E)\\) with \\(\\varphi^2 = id\\) we observe that

\n \\[e_i = \\frac{e_i + \\varphi(e_i)}{2} + \\frac{e_i - \\varphi(e_i)}{2}, \\quad \\text{for } i \\in \\mathbb{N}.\\]\n \n

Setting \\(a_i = e_i + \\varphi(e_i)/2\\) we have:

\n \n \n
\n \n
\n
Definition 5
\n

Let \\(\\varphi \\in \\text{Aut}(E)\\). We say that \\(\\varphi\\) is of canonical type if \\(\\varphi(e_i) \\in E_{(1)}\\) for all \\(i\\).

\n \n

If \\(\\varphi\\) is an automorphism of order 2 on \\(E\\), we have that \\(\\varphi\\) is of canonical type if and only if \\(a_i \\in E_{(1)}\\) for all \\(i\\). Let us fix a basis \\(\\beta = \\{e_1, e_2, \\ldots, e_n, \\ldots\\}\\) of the vector space \\(L\\) and an automorphism \\(\\varphi \\in \\text{Aut}(E)\\) such that \\(\\varphi^2 = id\\). Then \\(\\varphi\\), as a linear transformation, has eigenvalues \\(\\pm 1\\) and \\(-1\\) only, and moreover, there exists a basis of the vector space \\(E\\) consisting of eigenvectors. (It is well known from elementary Linear Algebra that this fact does not depend on the dimension of the vector space as long as the characteristic of \\(F\\) is different from 2.) Then \\(E = E(1) \\oplus E(-1)\\) where \\(E(t)\\) is the eigenspace for the eigenvalue \\(t\\) of the linear transformation \\(\\varphi\\). One considers the intersections \\(L(t) = L \\cap E(t)\\), \\(t = \\pm 1\\). Changing the basis \\(\\beta\\), if necessary, one may assume that \\(L(t)\\) is the span of \\(\\beta \\cap L(t)\\). Clearly this change of basis gives rise to a homogeneous automorphism of \\(E\\) and we can take the composition of it and then \\(\\varphi\\). We shall assume that such a change of basis has been done.

\n \n

Denote

\n \\[I_\\varphi = \\{n \\in \\mathbb{N} \\mid \\varphi(e_n) = \\pm e_n\\}.\\]\n
\n \n

We shall distinguish the following four possibilities:

\n \n
    \n
  1. \\(I_\\varphi = \\mathbb{N}\\).
  2. \n
  3. \\(I_\\varphi \\neq \\mathbb{N}\\) is infinite.
  4. \n
  5. \\(I_\\varphi\\) is finite and nonempty.
  6. \n
  7. \\(I_\\gamma = \\emptyset\\) for every linear basis \\(\\gamma\\) of \\(L\\).
  8. \n
\n \n

We shall call these automorphisms (and also the corresponding \\(\\mathbb{Z}_2\\)-gradings), automorphisms (or gradings) of type 1, 2, 3, and 4, respectively.

\n \n

The automorphisms of type 1 induce \\(\\mathbb{Z}_2\\)-gradings on \\(E\\) in which all generators of \\(E\\) are homogeneous. Such structures are called homogeneous \\(\\mathbb{Z}_2\\)-gradings on \\(E\\). The corresponding graded identities were completely studied in [22, 24, 29].

\n \n

We conclude this section with the following lemma.

\n \n
\n
Lemma 6
\n

Let \\(\\varphi\\) be an automorphism of order two of \\(E\\). Then \\(\\varphi\\) is of type 4 if and only if, for every \\(v \\in L\\) such that \\(\\varphi(v) = \\pm v\\), one has \\(v = 0\\).

\n \n
\n Proof Assume that \\(\\varphi\\) is of type 4 and let \\(v \\in L\\) with \\(\\varphi(v) = \\pm v\\). If \\(v \\neq 0\\), choose a basis \\(\\gamma\\) of \\(L\\) such that \\(v \\in \\gamma\\). Then \\(I_\\gamma \\neq \\emptyset\\), a contradiction. The converse follows by the same argument.\n \n
\n
\n \n

3    Automorphisms of order two of E

\n \n

From this point on, our goal is to survey recent developments regarding automorphisms of order two and the corresponding \\(\\mathbb{Z}_2\\)-gradings of the infinite-dimensional Grassmann algebra.

\n \n

Let \\(X = \\{e_1, \\ldots, e_n, \\ldots\\}\\). For each map \\(\\lambda : X \\to E\\), we can define the linear transformation \\(\\varphi : E \\to E\\) by

\n \n
\n \\[\\varphi(e_{i_1} \\cdots e_{i_n}) = \\lambda(e_{i_1}) \\cdots \\lambda(e_{i_n}),\\] (1)\n
\n \n

for all \\(n \\in \\mathbb{N}\\).

\n \n

We start with the next lemma.

\n \n
\n
Lemma 7
\n

The linear transformation \\(\\varphi\\) is an endomorphism of \\(E\\) if and only if

\n \\[\\lambda(e_i)\\lambda(e_j) + \\lambda(e_j)\\lambda(e_i) = 0, \\quad \\text{for all } i, j.\\]\n
\n \n \n\n'''
        math_tests = self._math_tests(html_content)

        # This test used to only print the results, so it could never fail;
        # assert on them instead so a regression in extraction is caught.
        self.assertGreater(len(math_tests), 0, "Should extract math equations from the complex example")
        for test in math_tests:
            self.assertTrue(test["math"].strip(), "Extracted equations should not be empty")

    def test_math_extraction_strips_whitespace(self):
        """Test that extracted math equations have whitespace properly stripped"""
        html_content = """

\\[ x = y + z \\]

"""
        math_tests = self._math_tests(html_content)

        self.assertGreater(len(math_tests), 0)
        # The equation should be stripped of leading/trailing whitespace
        first_equation = math_tests[0]["math"]
        self.assertEqual(first_equation.strip(), first_equation)


class TestExtractHtmlMetadata(unittest.TestCase):
    """Test the metadata dictionary returned by extract_html_metadata."""

    def test_extract_metadata_portuguese_document(self):
        """Test metadata extraction from a Portuguese document with mixed content."""
        html_content = """ Test Document
Header content here

Política de Metadados

Este é um documento de teste com texto em português.

Contém múltiplos parágrafos para simular conteúdo real.

Image placeholder 1

Mais texto após a imagem.

"""
        metadata = extract_html_metadata(html_content)

        # Check language extraction
        self.assertEqual(metadata['primary_language'], 'pt')

        # Check rotation values (always fixed)
        self.assertTrue(metadata['is_rotation_valid'])
        self.assertEqual(metadata['rotation_correction'], 0)

        # Check table/diagram detection
        # With 1 image (500 chars) and small text content, image ratio > 50%
        self.assertFalse(metadata['is_table'])
        self.assertTrue(metadata['is_diagram'])  # Image estimate dominates

    def test_extract_metadata_table_heavy_document(self):
        """Test metadata extraction from a document that is mostly tables."""
        html_content = """

Small intro text

Cell 1Cell 2Cell 3
Data AData BData C
More dataMore dataMore data
Even more dataEven more dataEven more data
Lots of dataLots of dataLots of data
Table contentTable contentTable content
Final rowFinal rowFinal row
"""
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'en')
        self.assertTrue(metadata['is_table'])  # Should be True as >50% is table
        self.assertFalse(metadata['is_diagram'])

    def test_extract_metadata_image_heavy_document(self):
        """Test metadata extraction from a document that is mostly images."""
        html_content = """

Brief text

Image 1
Image 2
Image 3
Image 4
Image 5
"""
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'es')
        self.assertFalse(metadata['is_table'])
        self.assertTrue(metadata['is_diagram'])  # Should be True as >50% is images

    def test_extract_metadata_language_with_region(self):
        """Test that language codes with regions (e.g., pt-BR) are shortened."""
        html_content = """

Texto em português brasileiro

"""
        metadata = extract_html_metadata(html_content)

        # Should convert pt-BR to pt
        self.assertEqual(metadata['primary_language'], 'pt')

    def test_extract_metadata_no_html_tag(self):
        """Test extraction when there's no html tag (defaults to 'en')."""
        html_content = """

Content without html tag

"""
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'en')  # Should default to 'en'

    def test_extract_metadata_mixed_content(self):
        """Test a document with mixed content types."""
        html_content = """ Política de Metadados para Livros e Capítulos de Livro UFPA
Biblioteca Central UFPA
LIVRO ABERTO portal do livro aberto da UFPA
SIBI/UFPA

Política de Metadados para Livros e Capítulos de Livro UFPA

Essa política de metadados possui o objetivo de garantir a consistência do trabalho executado no Portal do Livro Aberto. Dessa forma, foi desenvolvido com base no esquema de metadados do Dublin Core com adaptações para a realidade brasileira e local.

METADADOS VALOR REPETITIVO CONDIÇÃO
dc.type Tipo de documento Não Obrigatório
dc.title Título e subtítulo (se houver) Não Obrigatório
dc.title.alternative Título alternativo Sim Opcional
dc.creator Autor Sim Opcional
dc.creator.Lattes URL do currículo Lattes do autor Sim Opcional
dc.creator.ORCID ORCID do autor Sim Opcional
dc.description.affiliation Afiliação do autor Sim Opcional
dc.contributor.organizer Organizador Sim Opcional
dc.contributor.organizerLattes URL do currículo Lattes do organizador Sim Opcional
dc.contributor.organizerORCID ORCID do organizador Sim Opcional
dc.description.affiliationOrganizer Afiliação do organizador Sim Opcional
dc.contributor.coordinator Coordenador Sim Opcional
dc.contributor.coordinatorLattes URL do currículo Lattes do coordenador Sim Opcional
dc.contributor.coordinatorORCID ORCID do coordenador Sim Opcional
dc.contributor.affiliationCoordinator Afiliação do coordenador Sim Opcional
dc.contributor.editor Editor Sim Opcional
dc.contributor.editorLattes URL do currículo Lattes do editor Sim Opcional
dc.contributor.editorORCID ORCID do editor Sim Opcional
dc.description.affiliationEditor Afiliação do editor Sim Opcional
"""
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'pt')
        self.assertTrue(metadata['is_table'])
        self.assertFalse(metadata['is_diagram'])

    def test_extract_metadata_empty_body(self):
        """Test extraction with empty or minimal content."""
        html_content = """ """
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'de')
        self.assertFalse(metadata['is_table'])
        self.assertFalse(metadata['is_diagram'])
        self.assertTrue(metadata['is_rotation_valid'])
        self.assertEqual(metadata['rotation_correction'], 0)


class TestHtmlToMarkdown(unittest.TestCase):
    """Test the markdown produced by html_to_markdown_with_frontmatter."""

    @staticmethod
    def _content_after_frontmatter(markdown):
        """Return the text following the first '---' line after line 0.

        Raises ValueError (failing the calling test) if no such line exists.
        """
        lines = markdown.split('\n')
        # Start searching at index 1: line 0 is the opening '---' delimiter.
        frontmatter_end = lines.index('---', 1)
        return '\n'.join(lines[frontmatter_end + 1:])

    def test_title_tag_excluded_from_markdown(self):
        """Test that title tags from head are not included in markdown output."""
        html_content = """ This Should Not Appear In Markdown

Main Heading

This is the body content that should appear.

"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that the title from head tag is NOT in the markdown
        self.assertNotIn("This Should Not Appear In Markdown", markdown_with_frontmatter)

        # Check that body content IS in the markdown
        self.assertIn("Main Heading", markdown_with_frontmatter)
        self.assertIn("This is the body content that should appear", markdown_with_frontmatter)

        # Check that frontmatter is present
        self.assertTrue(markdown_with_frontmatter.startswith("---"))

    def test_image_with_data_description(self):
        """Test that images are converted with placeholder alt text."""
        html_content = """

Text before image

Placeholder

Text after image

"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that images use the fixed placeholder alt text
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

        # Check that other content is preserved
        self.assertIn("Text before image", markdown_with_frontmatter)
        self.assertIn("Text after image", markdown_with_frontmatter)

    def test_image_without_data_description(self):
        """Test that images without data-description use default alt text."""
        html_content = """
Some placeholder content
"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that default alt text is used
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

    def test_headers_footers_excluded(self):
        """Test that header and footer tags are excluded from markdown."""
        html_content = """

Main Content

This should appear in the markdown.

"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that header/footer content is excluded
        self.assertNotIn("Navigation menu", markdown_with_frontmatter)
        self.assertNotIn("Footer text", markdown_with_frontmatter)

        # Check that main content is included
        self.assertIn("Main Content", markdown_with_frontmatter)
        self.assertIn("This should appear in the markdown", markdown_with_frontmatter)

    def test_no_body_tag_fallback(self):
        """Test that content is still processed when there's no body tag."""
        html_content = """

Content without body tag

This should still be converted.

"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that content is still converted
        self.assertIn("Content without body tag", markdown_with_frontmatter)
        self.assertIn("This should still be converted", markdown_with_frontmatter)

    def test_removes_triple_dashes_from_content(self):
        """Test that --- at the start or end of markdown content is removed."""
        # Test with --- at the beginning
        html_content_start = """

---

Regular content here

"""
        markdown_start = html_to_markdown_with_frontmatter(html_content_start)

        # Check that we have FrontMatter
        self.assertEqual(markdown_start.split('\n')[0], '---')

        # Check that the content doesn't start with --- after the FrontMatter ends
        content_after_frontmatter = self._content_after_frontmatter(markdown_start)
        self.assertFalse(content_after_frontmatter.strip().startswith('---'))

        # Test with --- at the end
        html_content_end = """

Regular content here

---

"""
        markdown_end = html_to_markdown_with_frontmatter(html_content_end)

        # Check that content doesn't end with ---
        self.assertFalse(markdown_end.rstrip().endswith('---\n---'))

        # Test with --- at both beginning and end
        html_content_both = """

---

Middle content

---

"""
        markdown_both = html_to_markdown_with_frontmatter(html_content_both)
        content_both = self._content_after_frontmatter(markdown_both)

        # Content should not start or end with ---
        self.assertFalse(content_both.strip().startswith('---'))
        self.assertFalse(content_both.strip().endswith('---'))

        # But should contain "Middle content"
        self.assertIn("Middle content", content_both)


if __name__ == '__main__':
    unittest.main()