import re import unittest from unittest.mock import MagicMock, patch from bs4 import BeautifulSoup from olmocr.bench.synth.mine_html_templates import ( PreserveTablesConverter, extract_html_metadata, generate_tests_from_html, html_to_markdown_with_frontmatter, ) from olmocr.bench.tests import TestType class TestMineTests(unittest.TestCase): def test_absent_nested(self): html_content = """ New Paradigm for Nuclear Safety

New Paradigm for Nuclear Safety

Thursday, April 25, 2013

Japan Nuclear Safety Institute

Shojiro Matsuura, Chairman

""" tests = generate_tests_from_html(html_content, "0", 1) self.assertEqual(len([test for test in tests if test["type"] == "absent"]), 2) def test_text_basic(self): html_content = """ Bone Morphology Description

The posterior end exhibits a curved process to articulate with the angular. Aside from the process, the rest of the posterior end has slight curvatures for articulation, but is mostly uniform. Ventral border of the bone is mostly straight, with slight curvature (FIG. 20).

Lateral- A spine runs from the anterior-most tip, reduces in height ~3/4 way down toward posterior, and terminates at the center of the posterior notch. A fossa is present on the dorsal side of the spine. The posterior end exhibits more relief than in medial view, with the medial side of the posterior process extending past the posterior notch.

Ontogeny- Anterior tip is sharply pointed in AR12 and AR1 with AR2 being rounded, though this could be due to breakage. Anterior dorsal margin is straight and flat in AR12; AR2 shows little curvature and AR1 shows the most curvature; curving outward dorsally. Dorsal incisure is anteroposteriorly oriented in AR12, in AR2 there is some ventral curvature, and in AR1 there is a posteroventral curvature. Both AR1 and AR3 are curved on the ventral margin while AR12 is mostly straight. Posterior end of AR1 exhibits four undulations, ventral process is not yet extended. A fossa is present dorsal to the ventral process, not seen on AR12 or AR2. In medial view the lateral ridge is visible posteriorly in AR1 and AR2l the ridge does not fully extend anteriorly. In lateral view of the posterior the ventral process is present on AR2, but not fully extended posteriorly. Tip of the anterodorsal process is sharply pointed in AR1 and AR2, rounded in AR12. A short ridge is present on the dorsal edge of the dorsal process of AR1. The short ridge on the posterodorsal process of AR2 is slightly more ventral than in AR1. On AR12 the ridge is long and positioned most ventral. The lateral ridge is closest to the ventral margin in AR1. In AR2 the ridge is positioned more dorsally and in AR12 the ridge terminates and the anterior tip. The section of bone ventral to the lateral ridge appears to thin with age. The posterior notch on AR12 is curved anteriorly and the medial side of the notch extends posteriorly

""" tests = generate_tests_from_html(html_content, "0", 1) self.assertGreater(len(tests), 5) def test_big_headers(self): html_content = """ FORANE 427A Comparative Data

Comparative data

Parameters R-22 FORANE® 427A
Evaporating temperature 2.7 °C 1.1 °C
Condensing temperature 40.3 °C 44.0 °C
Suction temperature 7.1 °C 9.2 °C
Suction pressure 5.4 bar 5.0 bar
Discharge temperature 69.5 °C 71.1 °C
Discharge pressure 15.5 bar 17.1 bar
Cooling power 431 KW 376 KW
Power consumption 122 kW 124 kW
Residual mineral oil - 11%

During this field test, very satisfactory running conditions were reached immediately. The temperature set points were easily achieved with similar energy consumption as compared to R-22 despite a high level of residual mineral oil in the circuit. The performance of the installation continues to satisfy the customer's requirements after more than one year of service.

FORANE® 427A consequently fully satisfies the requirements of the European regulations while enabling existing equipment to continue to perform well without the need for any long and costly plant modifications.

The versatility of FORANE® 427A is also appreciated as it can be used to retrofit low temperature refrigeration equipment as well as air-conditioning installations, resulting in only one retrofit refrigerant for all R-22 units.

Combining environmental friendliness, high performance and simplicity is today a reality with FORANE® 427A !

""" tests = generate_tests_from_html(html_content, "0", 1) self.assertFalse(any(test for test in tests if test["type"] == "absent" and "Comparative data" in test["text"])) def test_page_num(self): html_content = """ Academic Paper - Page 47

Figure 4.3: The COVID-19 pandemic resulted in meaningful increase in the support for other groups' protests among Panamanians.

4.2.2 Demographically-Informed Opinion Assignment

Our model does not endow opinions randomly; instead, we leverage data to assign activists in a more realistic fashion. We use Latinobarómetro survey data from 2020 and 2023, both of which contain the three measurements of support for protest. Then, we explored which demographic groups were more likely to be activists; these are young adults and individuals at either extreme of the financial spectrum. We use this insight to influence the assignment of opinions: our logistic equations make it so that individuals with these characteristics are more likely to be labeled as activists as the probabilistic endowment happens. The code ensures that the proportion of activists overall remains exactly as desired and that there are activists who do not belong to these identified groups

4.2.3 Identity Factored into Social Influence

The similarity formula for Panama is built as follows, taking in nine demographic factors stored as node attributes. These are gender, age, nationality, financial status, highest level of education, level of employment, geographical region, party affiliation, and ethnicity (respectively encoded as gend, age, nation, fin, edu, emp, region, paff, and ethni). Each one of these factors has an associated weight; in this model, all factors were weighted as 0.10, except for level of education and financial status which received 0.15. Our code establishes logical rules to compare the two individuals on each dimension and return a factor by which to multiply the weight. These factors can be absolute or relative, based on the demographic dimension in question. For example, the logical conditions for gender returns 1 if same or 0 if different, while age returns a float value between 0 and 1 according to how close in age the individuals are. Once the pairwise similarity

""" tests = generate_tests_from_html(html_content, "0", 1) self.assertEqual(len([test for test in tests if test["type"] == "absent"]), 1) def test_div_footer(self): html_content = """ Being Thai: A Narrow Identity in a Wide World

hard to create a political and cultural narrative that appeals to old ideas about being Thai at this moment of perceived vulnerability.

The Concept of "Thainess"

Thainess is a political notion that originally evolved to support an authoritarian government and was then re-shaped for periods of more democratic rule.13 Thailand has long oscillated between dictatorship and democracy and, in either case, a sense of the "Thai style" (baeb Thai) is commonly invoked. Under these conditions a military coup may be thought to "advance Thai-style democracy".14 This is obviously fraught with difficulties and requires government agencies, most notably the Ministry of Culture, to work hard on shaping national identity.15 Thailand's geographical and cultural diversity means that there are inevitable deviations. Some of these have been astutely handled, especially in the northeastern provinces where the Lao-speaking minority has been integrated as chao isan. Nowadays it is only at the margins that their "Isan-ness" remains a contested sub-category of Thainess.16 In earlier generations there were more explicit challenges to the suggestion of Isan as Thai.17 Similar defiance has emerged in both the northern provinces18 and in the Malay Muslim majority areas of the far south.19 At various times there have been suggestions, as reported by the anthropologist Nick Tapp, that "Thainess" was disintegrating.20 It is in response to these persistent challenges that Prayuth's military government has sought to create its own revised version of the national ideal.

For the military government the codification of Thailand's core values has created new opportunities to stamp its preferred identity on society. In a key speech soon after he took power in 2014, Prayuth identified disunity as a problem in Thai society that would, in his words, "urgently require inclusive cooperation from people of all levels, gender and age".21 His approach was to "define clear core values of Thai people so that we can build a strong nation". These values draw on cultural ideas that have existed for many decades and have enjoyed the favour of previous military rulers. The full list of these twelve values is:

  1. Upholding the three main pillars of the country: the nation, the religion and the monarchy;
  2. Showing honesty, sacrifice and patience, with a positive attitude for the interest of the public;
  3. Practicing filial piety towards parents, guardians and teachers;
  4. Seeking both direct and indirect knowledge and education;
""" tests = generate_tests_from_html(html_content, "0", 1) self.assertEqual(len([test for test in tests if test["type"] == "absent"]), 4) def test_table(self): html_content = """ Distribuição da população na estrutura socioocupacional - Brasil 2000
Alexandre Gori Maia e Waldir José de Quadros ■ 417

Apêndice A - Distribuição da população na estrutura socioocupacional - Brasil 2000

Grupo Ocupacional Classe Ocupacional Superior Médio Baixo Interior Ínfimo Total
N (1.000s) % N (1.000s) % N (1.000s) % N (1.000s) % N (1.000s) % N (1.000s) %
Empregadores A-1 Empregadores (> 10) 608 67,3 185 20,4 86 9,6 16 1,8 8 0,9 903 100
A-2 Empregadores (<= 10) 1.555 36,9 1.107 26,3 1.036 24,7 341 8,1 171 4,1 4.213 100
Total 2.162 42,3 1.292 25,3 1.126 22,0 357 7,0 179 3,5 5.116 100
Profissionais C Profissionais Autônomos 1.643 21,7 1.513 20,0 2.073 27,4 1.225 16,2 1.108 14,7 7.562 100
D Profissionais Assalariados 4.438 13,3 6.030 18,0 11.550 34,5 7.027 21,0 4.389 13,1 33.434 100
Total 6.081 14,8 7.543 18,4 13.623 33,2 8.252 20,1 5.497 13,4 40.995 100
Massa Não-Agrícola F Trabalhadores Autônomos 657 3,5 1.754 9,2 5.561 29,2 5.271 27,7 5.788 30,4 19.030 100
G Trabalhadores Assalariados 282 0,7 1.657 4,3 10.363 27,1 13.002 34,0 12.968 33,9 38.272 100
I Trabalhadores Domésticos 10 0,1 104 1,6 977 14,7 1.810 27,3 3.733 56,3 6.633 100
Total 948 1,5 3.515 5,5 16.901 26,4 20.083 31,4 22.489 35,2 63.936 100
Massa Agrícola H-1 Proprietários Conta Própria 188 2,0 364 3,8 1.387 14,4 1.889 19,7 5.779 60,2 9.608 100
H-2 Trabalhadores Autônomos 5 0,5 14 1,5 72 7,6 152 16,1 703 74,3 946 100
H-3 Trabalhadores Assalariados 17 0,2 58 0,6 794 8,4 2.260 23,9 6.322 66,9 9.451 100
Total 210 1,0 436 2,2 2.253 11,3 4.301 21,5 12.805 64,0 20.005 100
Não-remunerados Não-remunerados Não-Agrícolas 13 6,8 16 8,1 28 14,0 22 10,9 119 60,2 198 100
Não-remunerados Agrícolas 5 0,1 13 0,3 59 1,6 352 9,4 3.302 88,5 3.731 100
Sem Ocupação Com Renda 1.567 6,0 2.330 8,9 5.395 20,7 6.821 26,2 9.964 38,2 26.078 100
Sem Ocupação Sem Renda 8.094 100 8.094 100
Ignorados 177 10,3 202 11,8 364 21,1 337 19,6 640 37,2 1.720 100

Fonte: Censo Demográfico 2000, microdados. IBGE. Elaboração dos autores.

""" tests = generate_tests_from_html(html_content, "0", 1) self.assertTrue(len(tests) > 10) def test_sup(self): html_content = """ A ROSE BY ANY OTHER NAME
2016]
A ROSE BY ANY OTHER NAME
1083

cases were decided within a year of each other (2000 and 2001, respectively). Save the Manatee Club largely consists of a truncated version of the Consolidated-Tomoka analysis, with minor adjustments to conform the opinion to the 1999 amendments. Day Cruise, on the other hand, closely analyzes the 1999 version of section 120.52(8). However, it is Save the Manatee Club that has come to dominate Florida court opinions on rulemaking challenges and not the more detailed Day Cruise analysis.78 The following Sections will discuss the facts of the two cases, examine the differences between their analyses of section 120.52(8), and finally conclude with an opinion on which analysis is better to apply in section 120.52(8) rulemaking challenges.

A. Southwest Florida Water Management District v. Save the Manatee Club, Inc.

After the legislature amended the APA, the First DCA analyzed the statutory language of section 120.52(8) again in Southwest Florida Water Management District v. Save the Manatee Club, Inc.79 Save the Manatee Club concerned the Southwest Florida Water Management District's (the "District's") authority to create exemptions to environmental resource permitting requirements.80 South Shores Partners, Ltd. ("South Shores") applied "for a permit to develop a 720-acre tract of land in Southwest Hillsborough County."81 As part of the development project, South Shores wanted "to build a connecting waterway between the [existing] canal system [on the property] and the [Tampa] Bay."82 The Save the Manatee Club believed that the resulting increase in power boat traffic in this new waterway would "endanger the manatee and its habitat."83

The District has the authority to grant either a general permit or an environmental resource permit to a development project, depending on the type of project involved.84 When granting an environmental resource permit, the District must consider "[t]he impact a proposed development will have on wildlife" as a factor; it does not have to do so when it grants a general permit.85 The District granted South

""" tests = generate_tests_from_html(html_content, "0", 1) superscript_map = { "0": "⁰", "1": "¹", "2": "²", "3": "³", "4": "⁴", "5": "⁵", "6": "⁶", "7": "⁷", "8": "⁸", "9": "⁹", "+": "⁺", "-": "⁻", "=": "⁼", "(": "⁽", ")": "⁾", "n": "ⁿ", "i": "ⁱ", } for test in tests: for sup in superscript_map.values(): self.assertTrue(sup not in test.get("text", "")) self.assertTrue(sup not in test.get("before", "")) self.assertTrue(sup not in test.get("after", "")) def test_katex_autorender(self): """Test that KaTeX math expressions are properly auto-rendered when using the render_pdf_with_playwright function.""" import asyncio import os import tempfile from olmocr.bench.synth.mine_html_templates import render_pdf_with_playwright # Create HTML with LaTeX expressions html_content = """ KaTeX Auto-Render Test

Math Expressions Test

Inline math expression: \(E = mc^2\)

Block math expression:

\[ \\frac{d}{dx}(x^n) = nx^{n-1} \]

Another complex equation:

\[ \int_{a}^{b} f(x) \, dx = F(b) - F(a) \]

""" # Create a temporary file to store the rendered PDF with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file: output_pdf_path = tmp_file.name # Render the HTML to PDF render_success = asyncio.run(render_pdf_with_playwright(html_content=html_content, output_pdf_path=output_pdf_path, png_width=800, png_height=600)) # Check if rendering was successful self.assertTrue(render_success, "PDF rendering should succeed") # Verify PDF was created and has content self.assertTrue(os.path.exists(output_pdf_path), "PDF file should exist") self.assertTrue(os.path.getsize(output_pdf_path) > 0, "PDF file should have content") # The actual validation of KaTeX rendering would require visual inspection or text extraction, # but at minimum we can verify the file was created successfully print(output_pdf_path) def test_line_numbers(self): html_content = """ House Amendment Bill No. CS/CS/SB 7030
HOUSE AMENDMENT
Bill No. CS/CS/SB 7030, 1st Eng. (2019)
Amendment No.
Senate
House
.
1
Representative Jenne offered the following:
2
3
Amendment
4
Remove lines 274-280 and insert:
5
c.3. Pass an initial a psychological evaluation, and
6
subsequent yearly psychological evaluations before each school
7
year, administered by a psychologist licensed under chapter 490
8
and designated by the Department of Law Enforcement and submit
9
the results of such evaluations the evaluation to the sheriff's
10
office. The Department of Law Enforcement is authorized to
11
provide the sheriff's office with mental health and substance
12
abuse data for compliance with this paragraph.
""" tests = generate_tests_from_html(html_content, "0", 1) for test in tests: if test["type"] == "order": self.assertTrue(len([c for c in test["before"] if not c.isdigit()]) > 0) class TestMathExtraction(unittest.TestCase): """Test the math extraction functionality in mine_html_templates.py""" def test_math_extraction_from_html(self): """Test that math equations are properly extracted from HTML content""" html_content = """

Some text with inline math \\(x = 2\\) here.

Display math: \\[E = mc^2\\]

Another inline: \\(\\alpha + \\beta = \\gamma\\)

Complex display: \\[\\int_0^\\infty e^{-x} dx = 1\\]

""" # Generate tests from HTML tests = generate_tests_from_html(html_content, "test_pdf", 1) # Filter math tests math_tests = [t for t in tests if t.get("type") == "math"] # Check that we extracted math equations self.assertTrue(len(math_tests) > 0, "Should extract at least one math equation") # Check that specific equations were extracted math_contents = [t["math"] for t in math_tests] self.assertIn("x = 2", math_contents) self.assertIn("E = mc^2", math_contents) self.assertIn("\\alpha + \\beta = \\gamma", math_contents) self.assertIn("\\int_0^\\infty e^{-x} dx = 1", math_contents) def test_math_extraction_with_multiline(self): """Test extraction of multiline math equations""" html_content = """

Multiline equation: \\[ e_i = \\frac{e_i + \\varphi(e_i)}{2} + \\frac{e_i - \\varphi(e_i)}{2}, \\quad \\text{for } i \\in \\mathbb{N}. \\]

""" tests = generate_tests_from_html(html_content, "test_pdf", 1) math_tests = [t for t in tests if t.get("type") == "math"] # Check multiline equation is captured self.assertTrue(len(math_tests) > 0) # Check that the multiline content is preserved (without excessive newlines) found_multiline = False for test in math_tests: if "\\frac{e_i + \\varphi(e_i)}{2}" in test["math"] and "\\mathbb{N}" in test["math"]: found_multiline = True break self.assertTrue(found_multiline, "Should extract multiline equation correctly") def test_math_extraction_deduplication(self): """Test that duplicate math equations are deduplicated""" html_content = """

First occurrence: \\[x^2 + y^2 = z^2\\]

Second occurrence: \\[x^2 + y^2 = z^2\\]

Third occurrence: \\[x^2 + y^2 = z^2\\]

""" tests = generate_tests_from_html(html_content, "test_pdf", 1) math_tests = [t for t in tests if t.get("type") == "math"] # Count how many times the equation appears equation_count = sum(1 for t in math_tests if "x^2 + y^2 = z^2" in t["math"]) # Should only appear once due to deduplication self.assertEqual(equation_count, 1, "Duplicate equations should be deduplicated") def test_math_extraction_patterns(self): """Test different math delimiter patterns""" html_content = """

Pattern 1: \\(inline1\\)

Pattern 2: \\[display1\\]

Pattern 3: $$display2$$

""" tests = generate_tests_from_html(html_content, "test_pdf", 1) math_tests = [t for t in tests if t.get("type") == "math"] math_contents = [t["math"] for t in math_tests] # Check all patterns are captured self.assertIn("inline1", math_contents) self.assertIn("display1", math_contents) self.assertIn("display2", math_contents) def test_math_extraction_minimum_length(self): """Test that very short equations are filtered out""" html_content = """

Short: \\(x\\)

Also short: \\[y\\]

Long enough: \\(x=1\\)

""" tests = generate_tests_from_html(html_content, "test_pdf", 1) math_tests = [t for t in tests if t.get("type") == "math"] math_contents = [t["math"] for t in math_tests] # Short equations (length <= 2) should be filtered out self.assertNotIn("x", math_contents) self.assertNotIn("y", math_contents) # Longer equation should be included self.assertIn("x=1", math_contents) def test_math_validation_passes(self): """Test that valid math tests pass validation against markdown""" html_content = """

Test equation: \\[E = mc^2\\]

""" # Mock the validation to always pass for math tests with patch("olmocr.bench.synth.mine_html_templates.load_single_test") as mock_load: mock_test = MagicMock() mock_test.run.return_value = (True, None) mock_load.return_value = mock_test tests = generate_tests_from_html(html_content, "test_pdf", 1) math_tests = [t for t in tests if t.get("type") == "math"] # Verify math test was created self.assertTrue(len(math_tests) > 0) # Verify test has correct structure for test in math_tests: self.assertEqual(test["type"], "math") self.assertIn("math", test) self.assertEqual(test["max_diffs"], 0) self.assertIn("id", test) self.assertIn("pdf", test) self.assertEqual(test["page"], 1) def test_complex_markdown_example(self): """Test with the complex markdown example provided by the user""" # Convert markdown to HTML-like structure for testing html_content = '\n\n\n \n \n Automorphisms of Order Two\n \n \n \n \n\n\n
\n

If \\(\\varphi \\in \\text{Aut}(E)\\) with \\(\\varphi^2 = id\\) we observe that

\n \\[e_i = \\frac{e_i + \\varphi(e_i)}{2} + \\frac{e_i - \\varphi(e_i)}{2}, \\quad \\text{for } i \\in \\mathbb{N}.\\]\n \n

Setting \\(a_i = e_i + \\varphi(e_i)/2\\) we have:

\n \n \n
\n \n
\n
Definition 5
\n

Let \\(\\varphi \\in \\text{Aut}(E)\\). We say that \\(\\varphi\\) is of canonical type if \\(\\varphi(e_i) \\in E_{(1)}\\) for all \\(i\\).

\n \n

If \\(\\varphi\\) is an automorphism of order 2 on \\(E\\), we have that \\(\\varphi\\) is of canonical type if and only if \\(a_i \\in E_{(1)}\\) for all \\(i\\). Let us fix a basis \\(\\beta = \\{e_1, e_2, \\ldots, e_n, \\ldots\\}\\) of the vector space \\(L\\) and an automorphism \\(\\varphi \\in \\text{Aut}(E)\\) such that \\(\\varphi^2 = id\\). Then \\(\\varphi\\), as a linear transformation, has eigenvalues \\(\\pm 1\\) and \\(-1\\) only, and moreover, there exists a basis of the vector space \\(E\\) consisting of eigenvectors. (It is well known from elementary Linear Algebra that this fact does not depend on the dimension of the vector space as long as the characteristic of \\(F\\) is different from 2.) Then \\(E = E(1) \\oplus E(-1)\\) where \\(E(t)\\) is the eigenspace for the eigenvalue \\(t\\) of the linear transformation \\(\\varphi\\). One considers the intersections \\(L(t) = L \\cap E(t)\\), \\(t = \\pm 1\\). Changing the basis \\(\\beta\\), if necessary, one may assume that \\(L(t)\\) is the span of \\(\\beta \\cap L(t)\\). Clearly this change of basis gives rise to a homogeneous automorphism of \\(E\\) and we can take the composition of it and then \\(\\varphi\\). We shall assume that such a change of basis has been done.

\n \n

Denote

\n \\[I_\\varphi = \\{n \\in \\mathbb{N} \\mid \\varphi(e_n) = \\pm e_n\\}.\\]\n
\n \n

We shall distinguish the following four possibilities:

\n \n
    \n
  1. \\(I_\\varphi = \\mathbb{N}\\).
  2. \n
  3. \\(I_\\varphi \\neq \\mathbb{N}\\) is infinite.
  4. \n
  5. \\(I_\\varphi\\) is finite and nonempty.
  6. \n
  7. \\(I_\\gamma = \\emptyset\\) for every linear basis \\(\\gamma\\) of \\(L\\).
  8. \n
\n \n

We shall call these automorphisms (and also the corresponding \\(\\mathbb{Z}_2\\)-gradings), automorphisms (or gradings) of type 1, 2, 3, and 4, respectively.

\n \n

The automorphisms of type 1 induce \\(\\mathbb{Z}_2\\)-gradings on \\(E\\) in which all generators of \\(E\\) are homogeneous. Such structures are called homogeneous \\(\\mathbb{Z}_2\\)-gradings on \\(E\\). The corresponding graded identities were completely studied in [22, 24, 29].

\n \n

We conclude this section with the following lemma.

\n \n
\n
Lemma 6
\n

Let \\(\\varphi\\) be an automorphism of order two of \\(E\\). Then \\(\\varphi\\) is of type 4 if and only if, for every \\(v \\in L\\) such that \\(\\varphi(v) = \\pm v\\), one has \\(v = 0\\).

\n \n
\n Proof Assume that \\(\\varphi\\) is of type 4 and let \\(v \\in L\\) with \\(\\varphi(v) = \\pm v\\). If \\(v \\neq 0\\), choose a basis \\(\\gamma\\) of \\(L\\) such that \\(v \\in \\gamma\\). Then \\(I_\\gamma \\neq \\emptyset\\), a contradiction. The converse follows by the same argument.\n \n
\n
\n \n

3    Automorphisms of order two of E

\n \n

From this point on, our goal is to survey recent developments regarding automorphisms of order two and the corresponding \\(\\mathbb{Z}_2\\)-gradings of the infinite-dimensional Grassmann algebra.

\n \n

Let \\(X = \\{e_1, \\ldots, e_n, \\ldots\\}\\). For each map \\(\\lambda : X \\to E\\), we can define the linear transformation \\(\\varphi : E \\to E\\) by

\n \n
\n \\[\\varphi(e_{i_1} \\cdots e_{i_n}) = \\lambda(e_{i_1}) \\cdots \\lambda(e_{i_n}),\\] (1)\n
\n \n

for all \\(n \\in \\mathbb{N}\\).

\n \n

We start with the next lemma.

\n \n
\n
Lemma 7
\n

The linear transformation \\(\\varphi\\) is an endomorphism of \\(E\\) if and only if

\n \\[\\lambda(e_i)\\lambda(e_j) + \\lambda(e_j)\\lambda(e_i) = 0, \\quad \\text{for all } i, j.\\]\n
\n \n \n\n' tests = generate_tests_from_html(html_content, "test_pdf", 1) math_tests = [t for t in tests if t.get("type") == "math"] for test in math_tests: print(test) def test_math_extraction_strips_whitespace(self): """Test that extracted math equations have whitespace properly stripped""" html_content = """

\\[ x = y + z \\]

""" tests = generate_tests_from_html(html_content, "test_pdf", 1) math_tests = [t for t in tests if t.get("type") == "math"] self.assertTrue(len(math_tests) > 0) # The equation should be stripped of leading/trailing whitespace self.assertEqual(math_tests[0]["math"].strip(), math_tests[0]["math"]) class TestExtractHtmlMetadata(unittest.TestCase): def test_extract_metadata_portuguese_document(self): """Test metadata extraction from a Portuguese document with mixed content.""" html_content = """ Test Document
Header content here

Política de Metadados

Este é um documento de teste com texto em português.

Contém múltiplos parágrafos para simular conteúdo real.

Image placeholder 1

Mais texto após a imagem.

""" metadata = extract_html_metadata(html_content) # Check language extraction self.assertEqual(metadata["primary_language"], "pt") # Check rotation values (always fixed) self.assertTrue(metadata["is_rotation_valid"]) self.assertEqual(metadata["rotation_correction"], 0) # Check table/diagram detection # With 1 image (500 chars) and small text content, image ratio > 50% self.assertFalse(metadata["is_table"]) self.assertTrue(metadata["is_diagram"]) # Image estimate dominates def test_extract_metadata_table_heavy_document(self): """Test metadata extraction from a document that is mostly tables.""" html_content = """

Small intro text

Cell 1Cell 2Cell 3
Data AData BData C
More dataMore dataMore data
Even more dataEven more dataEven more data
Lots of dataLots of dataLots of data
Table contentTable contentTable content
Final rowFinal rowFinal row
""" metadata = extract_html_metadata(html_content) self.assertEqual(metadata["primary_language"], "en") self.assertTrue(metadata["is_table"]) # Should be True as >50% is table self.assertFalse(metadata["is_diagram"]) def test_extract_metadata_image_heavy_document(self): """Test metadata extraction from a document that is mostly images.""" html_content = """

Brief text

Image 1
Image 2
Image 3
Image 4
Image 5
""" metadata = extract_html_metadata(html_content) self.assertEqual(metadata["primary_language"], "es") self.assertFalse(metadata["is_table"]) self.assertTrue(metadata["is_diagram"]) # Should be True as >50% is images def test_extract_metadata_language_with_region(self): """Test that language codes with regions (e.g., pt-BR) are shortened.""" html_content = """

Texto em português brasileiro

""" metadata = extract_html_metadata(html_content) # Should convert pt-BR to pt self.assertEqual(metadata["primary_language"], "pt") def test_extract_metadata_no_html_tag(self): """Test extraction when there's no html tag (defaults to 'en').""" html_content = """

Content without html tag

""" metadata = extract_html_metadata(html_content) self.assertEqual(metadata["primary_language"], "en") # Should default to 'en' def test_extract_metadata_mixed_content(self): """Test a document with mixed content types.""" html_content = """ Política de Metadados para Livros e Capítulos de Livro UFPA
Biblioteca Central UFPA
LIVRO ABERTO portal do livro aberto da UFPA
SIBI/UFPA

Política de Metadados para Livros e Capítulos de Livro UFPA

Essa política de metadados possui o objetivo de garantir a consistência do trabalho executado no Portal do Livro Aberto. Dessa forma, foi desenvolvido com base no esquema de metadados do Dublin Core com adaptações para a realidade brasileira e local.

METADADOS VALOR REPETITIVO CONDIÇÃO
dc.type Tipo de documento Não Obrigatório
dc.title Título e subtítulo (se houver) Não Obrigatório
dc.title.alternative Título alternativo Sim Opcional
dc.creator Autor Sim Opcional
dc.creator.Lattes URL do currículo Lattes do autor Sim Opcional
dc.creator.ORCID ORCID do autor Sim Opcional
dc.description.affiliation Afiliação do autor Sim Opcional
dc.contributor.organizer Organizador Sim Opcional
dc.contributor.organizerLattes URL do currículo Lattes do organizador Sim Opcional
dc.contributor.organizerORCID ORCID do organizador Sim Opcional
dc.description.affiliationOrganizer Afiliação do organizador Sim Opcional
dc.contributor.coordinator Coordenador Sim Opcional
dc.contributor.coordinatorLattes URL do currículo Lattes do coordenador Sim Opcional
dc.contributor.coordinatorORCID ORCID do coordenador Sim Opcional
dc.contributor.affiliationCoordinator Afiliação do coordenador Sim Opcional
dc.contributor.editor Editor Sim Opcional
dc.contributor.editorLattes URL do currículo Lattes do editor Sim Opcional
dc.contributor.editorORCID ORCID do editor Sim Opcional
dc.description.affiliationEditor Afiliação do editor Sim Opcional
""" metadata = extract_html_metadata(html_content) self.assertEqual(metadata["primary_language"], "pt") self.assertTrue(metadata["is_table"]) self.assertFalse(metadata["is_diagram"]) def test_extract_metadata_empty_body(self): """Test extraction with empty or minimal content.""" html_content = """ """ metadata = extract_html_metadata(html_content) self.assertEqual(metadata["primary_language"], "de") self.assertFalse(metadata["is_table"]) self.assertFalse(metadata["is_diagram"]) self.assertTrue(metadata["is_rotation_valid"]) self.assertEqual(metadata["rotation_correction"], 0) class TestHtmlToMarkdown(unittest.TestCase): def test_title_tag_excluded_from_markdown(self): """Test that title tags from head are not included in markdown output.""" html_content = """ This Should Not Appear In Markdown

Main Heading

This is the body content that should appear.

""" markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content) # Check that the title from head tag is NOT in the markdown self.assertNotIn("This Should Not Appear In Markdown", markdown_with_frontmatter) # Check that body content IS in the markdown self.assertIn("Main Heading", markdown_with_frontmatter) self.assertIn("This is the body content that should appear", markdown_with_frontmatter) # Check that frontmatter is present self.assertTrue(markdown_with_frontmatter.startswith("---")) def test_image_with_data_description(self): """Test that images are converted with placeholder alt text.""" html_content = """

Text before image

Placeholder

Text after image

""" markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content) # Check that images use the fixed placeholder alt text self.assertIn("![Image Placeholder]", markdown_with_frontmatter) # Check that other content is preserved self.assertIn("Text before image", markdown_with_frontmatter) self.assertIn("Text after image", markdown_with_frontmatter) def test_image_without_data_description(self): """Test that images without data-description use default alt text.""" html_content = """
Some placeholder content
""" markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content) # Check that default alt text is used self.assertIn("![Image Placeholder]", markdown_with_frontmatter) def test_headers_footers_excluded(self): """Test that header and footer tags are excluded from markdown.""" html_content = """

Main Content

This should appear in the markdown.

""" markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content) # Check that header/footer content is excluded self.assertNotIn("Navigation menu", markdown_with_frontmatter) self.assertNotIn("Footer text", markdown_with_frontmatter) # Check that main content is included self.assertIn("Main Content", markdown_with_frontmatter) self.assertIn("This should appear in the markdown", markdown_with_frontmatter) def test_no_body_tag_fallback(self): """Test that content is still processed when there's no body tag.""" html_content = """

Content without body tag

This should still be converted.

""" markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content) # Check that content is still converted self.assertIn("Content without body tag", markdown_with_frontmatter) self.assertIn("This should still be converted", markdown_with_frontmatter) def test_removes_triple_dashes_from_content(self): """Test that --- at the start or end of markdown content is removed.""" # Test with --- at the beginning html_content_start = """

---

Regular content here

""" markdown_start = html_to_markdown_with_frontmatter(html_content_start) lines = markdown_start.split("\n") # Check that we have FrontMatter self.assertEqual(lines[0], "---") # Check that the content doesn't start with --- after the FrontMatter ends frontmatter_end = next(i for i in range(1, len(lines)) if lines[i] == "---") content_after_frontmatter = "\n".join(lines[frontmatter_end + 1 :]) self.assertFalse(content_after_frontmatter.strip().startswith("---")) # Test with --- at the end html_content_end = """

Regular content here

---

""" markdown_end = html_to_markdown_with_frontmatter(html_content_end) # Check that content doesn't end with --- self.assertFalse(markdown_end.rstrip().endswith("---\n---")) # Test with --- at both beginning and end html_content_both = """

---

Middle content

---

""" markdown_both = html_to_markdown_with_frontmatter(html_content_both) lines_both = markdown_both.split("\n") frontmatter_end_both = next(i for i in range(1, len(lines_both)) if lines_both[i] == "---") content_both = "\n".join(lines_both[frontmatter_end_both + 1 :]) # Content should not start or end with --- self.assertFalse(content_both.strip().startswith("---")) self.assertFalse(content_both.strip().endswith("---")) # But should contain "Middle content" self.assertIn("Middle content", content_both) class TestSuperscriptSubscriptConversion(unittest.TestCase): """Test superscript and subscript conversion to Unicode in html_to_markdown_with_frontmatter""" def test_basic_superscripts(self): """Test basic superscript conversion""" html = """

x2 + y3 = z4

109 is a billion

""" result = html_to_markdown_with_frontmatter(html) # Check that superscripts are converted to Unicode self.assertIn("x²", result) self.assertIn("y³", result) self.assertIn("z⁴", result) self.assertIn("10⁹", result) # Should not contain HTML sup tags in markdown self.assertNotIn("", result) self.assertNotIn("", result) def test_basic_subscripts(self): """Test basic subscript conversion""" html = """

H2O is water

CO2 is carbon dioxide

Xn represents the nth element

""" result = html_to_markdown_with_frontmatter(html) # Check that subscripts are converted to Unicode self.assertIn("H₂O", result) self.assertIn("CO₂", result) self.assertIn("Xₙ", result) # Should not contain HTML sub tags in markdown self.assertNotIn("", result) self.assertNotIn("", result) def test_mixed_super_and_subscripts(self): """Test mixed superscripts and subscripts""" html = """

The formula is x2 + H2O+

Chemical: Ca2+ and SO42-

""" result = html_to_markdown_with_frontmatter(html) # Check mixed conversions self.assertIn("x²", result) self.assertIn("H₂O⁺", result) self.assertIn("Ca²⁺", result) self.assertIn("SO₄²⁻", result) def test_special_characters(self): """Test special character conversions""" html = """

Math: (x+y)n and f(x)

Ion: OH- and H+

Index: ai and bi

""" result = html_to_markdown_with_frontmatter(html) # Check special character conversions self.assertIn("(x+y)ⁿ", result) self.assertIn("f₍ₓ₎", result) self.assertIn("OH⁻", result) self.assertIn("H⁺", result) # subscript i might not be in map, so check either form self.assertTrue("aᵢ" in result or "ai" in result or "ai" in result) self.assertIn("bⁱ", result) def test_in_table(self): """Test superscripts/subscripts within HTML tables""" html = """
Chemical Formula
Water H2O
Sulfate ion SO42-
""" result = html_to_markdown_with_frontmatter(html) # Tables should be preserved as HTML but superscripts/subscripts should still be converted self.assertIn("", result) # Check if conversions happened in table cells self.assertTrue("H₂O" in result or "2" in result) self.assertTrue("SO₄²⁻" in result or "42-" in result) def test_nested_elements(self): """Test superscripts/subscripts in nested HTML elements""" html = """

In physics: E = mc2

""" result = html_to_markdown_with_frontmatter(html) # Check conversions in nested structures self.assertIn("mc²", result) self.assertTrue("x¹" in result or "x1" in result) self.assertTrue("x₂" in result or "x2" in result) def test_frontmatter_preserved(self): """Test that frontmatter is still generated correctly""" html = """

Test with x2

Data
""" result = html_to_markdown_with_frontmatter(html) # Check frontmatter exists self.assertTrue(result.startswith("---")) self.assertIn("primary_language: es", result) self.assertIn("is_table:", result) # Also check the conversion happened self.assertIn("x²", result) def test_unmapped_characters(self): """Test characters not in the mapping""" html = """

Unknown: xabc and yxyz

Mixed: H2SO4 with note*

""" result = html_to_markdown_with_frontmatter(html) # Unmapped characters should be left as-is or handled gracefully self.assertIn("H₂SO₄", result) # Asterisk is not in the map, so it might remain as-is self.assertTrue("note*" in result or "note*" in result or "note^*" in result) def test_empty_super_subscripts(self): """Test empty sup/sub tags""" html = """

Empty tags: x and y

Normal: z2

""" result = html_to_markdown_with_frontmatter(html) # Empty tags should not cause errors self.assertIn("z²", result) # Empty tags should just be removed self.assertIn("x", result) self.assertIn("y", result) def test_complex_math_expression(self): """Test a complex mathematical expression""" html = """

The equation: (x1)2 + (x2)2 = r2

Series: a0 + a1x + a2x2 + ... + anxn

""" result = html_to_markdown_with_frontmatter(html) # Check complex nested expressions self.assertIn("x₁", result) self.assertIn("x₂", result) self.assertIn("r²", result) self.assertIn("a₀", result) self.assertIn("a₁", result) self.assertIn("a₂", result) self.assertIn("aₙ", result) self.assertIn("xⁿ", result) if __name__ == "__main__": unittest.main()