"""Unit tests for ``extract_html_metadata`` and ``html_to_markdown_with_frontmatter``.

Every fixture is an inline HTML document.  The markup carries exactly what the
assertions depend on: the ``lang`` attribute on ``<html>``, ``<table>`` content,
``div.image`` placeholders and ``<header>``/``<footer>`` elements.

NOTE(review): the fixture markup was restored from what the assertions require;
confirm it against the canonical fixtures before relying on exact ratios.
"""

import unittest

from olmocr.bench.synth.mine_html_templates import (
    extract_html_metadata,
    html_to_markdown_with_frontmatter,
)


class TestExtractHtmlMetadata(unittest.TestCase):
    """Checks on the metadata dict returned by ``extract_html_metadata``."""

    def test_extract_metadata_portuguese_document(self):
        """Test metadata extraction from a Portuguese document with mixed content."""
        html_content = """
        <!DOCTYPE html>
        <html lang="pt">
        <head>
            <meta charset="UTF-8">
            <title>Test Document</title>
        </head>
        <body>
            <header>Header content here</header>
            <h1>Política de Metadados</h1>
            <p>Este é um documento de teste com texto em português.</p>
            <p>Contém múltiplos parágrafos para simular conteúdo real.</p>
            <div class="image">Image placeholder 1</div>
            <p>Mais texto após a imagem.</p>
        </body>
        </html>
        """

        metadata = extract_html_metadata(html_content)

        # Check language extraction
        self.assertEqual(metadata['primary_language'], 'pt')

        # Check rotation values (always fixed)
        self.assertTrue(metadata['is_rotation_valid'])
        self.assertEqual(metadata['rotation_correction'], 0)

        # Check table/diagram detection
        # With 1 image (500 chars) and small text content, image ratio > 50%
        self.assertFalse(metadata['is_table'])
        self.assertTrue(metadata['is_diagram'])  # Image estimate dominates

    def test_extract_metadata_table_heavy_document(self):
        """Test metadata extraction from a document that is mostly tables."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>Small intro text</p>
            <table>
                <tr><td>Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr>
                <tr><td>Data A</td><td>Data B</td><td>Data C</td></tr>
                <tr><td>More data</td><td>More data</td><td>More data</td></tr>
                <tr><td>Even more data</td><td>Even more data</td><td>Even more data</td></tr>
                <tr><td>Lots of data</td><td>Lots of data</td><td>Lots of data</td></tr>
                <tr><td>Table content</td><td>Table content</td><td>Table content</td></tr>
                <tr><td>Final row</td><td>Final row</td><td>Final row</td></tr>
            </table>
        </body>
        </html>
        """

        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'en')
        self.assertTrue(metadata['is_table'])  # Should be True as >50% is table
        self.assertFalse(metadata['is_diagram'])

    def test_extract_metadata_image_heavy_document(self):
        """Test metadata extraction from a document that is mostly images."""
        html_content = """
        <!DOCTYPE html>
        <html lang="es">
        <body>
            <p>Brief text</p>
            <div class="image">Image 1</div>
            <div class="image">Image 2</div>
            <div class="image">Image 3</div>
            <div class="image">Image 4</div>
            <div class="image">Image 5</div>
        </body>
        </html>
        """

        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'es')
        self.assertFalse(metadata['is_table'])
        self.assertTrue(metadata['is_diagram'])  # Should be True as >50% is images

    def test_extract_metadata_language_with_region(self):
        """Test that language codes with regions (e.g., pt-BR) are shortened."""
        html_content = """
        <!DOCTYPE html>
        <html lang="pt-BR">
        <body>
            <p>Texto em português brasileiro</p>
        </body>
        </html>
        """

        metadata = extract_html_metadata(html_content)

        # Should convert pt-BR to pt
        self.assertEqual(metadata['primary_language'], 'pt')

    def test_extract_metadata_no_html_tag(self):
        """Test extraction when there's no html tag (defaults to 'en')."""
        # Deliberately no <html> element, so there is no lang attribute to read.
        html_content = """
        <body>
            <p>Content without html tag</p>
        </body>
        """

        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'en')  # Should default to 'en'

    def test_extract_metadata_mixed_content(self):
        """Test a document with mixed content types."""
        html_content = """
        <!DOCTYPE html>
        <html lang="pt">
        <head>
            <meta charset="UTF-8">
            <title>Política de Metadados para Livros e Capítulos de Livro UFPA</title>
        </head>
        <body>
            <header>
                <div>Biblioteca Central UFPA</div>
                <div>LIVRO ABERTO portal do livro aberto da UFPA</div>
                <div>SIBI/UFPA</div>
            </header>

            <h1>Política de Metadados para Livros e Capítulos de Livro UFPA</h1>

            <p>Essa política de metadados possui o objetivo de garantir a consistência do trabalho executado no Portal do Livro Aberto. Dessa forma, foi desenvolvido com base no esquema de metadados do Dublin Core com adaptações para a realidade brasileira e local.</p>

            <table>
                <tr><th>METADADOS</th><th>VALOR</th><th>REPETITIVO</th><th>CONDIÇÃO</th></tr>
                <tr><td>dc.type</td><td>Tipo de documento</td><td>Não</td><td>Obrigatório</td></tr>
                <tr><td>dc.title</td><td>Título e subtítulo (se houver)</td><td>Não</td><td>Obrigatório</td></tr>
                <tr><td>dc.title.alternative</td><td>Título alternativo</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.creator</td><td>Autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.creator.Lattes</td><td>URL do currículo Lattes do autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.creator.ORCID</td><td>ORCID do autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.description.affiliation</td><td>Afiliação do autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.organizer</td><td>Organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.organizerLattes</td><td>URL do currículo Lattes do organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.organizerORCID</td><td>ORCID do organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.description.affiliationOrganizer</td><td>Afiliação do organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.coordinator</td><td>Coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.coordinatorLattes</td><td>URL do currículo Lattes do coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.coordinatorORCID</td><td>ORCID do coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.affiliationCoordinator</td><td>Afiliação do coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.editor</td><td>Editor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.editorLattes</td><td>URL do currículo Lattes do editor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.editorORCID</td><td>ORCID do editor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.description.affiliationEditor</td><td>Afiliação do editor</td><td>Sim</td><td>Opcional</td></tr>
            </table>
        </body>
        </html>
        """

        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'pt')
        self.assertTrue(metadata['is_table'])
        self.assertFalse(metadata['is_diagram'])

    def test_extract_metadata_empty_body(self):
        """Test extraction with empty or minimal content."""
        html_content = """
        <!DOCTYPE html>
        <html lang="de">
        <body>
        </body>
        </html>
        """

        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'de')
        self.assertFalse(metadata['is_table'])
        self.assertFalse(metadata['is_diagram'])
        self.assertTrue(metadata['is_rotation_valid'])
        self.assertEqual(metadata['rotation_correction'], 0)


class TestHtmlToMarkdown(unittest.TestCase):
    """Checks on the string returned by ``html_to_markdown_with_frontmatter``."""

    def _content_after_frontmatter(self, markdown):
        """Return the text that follows the closing ``---`` of the front matter.

        Fails the running test when ``markdown`` does not open with a ``---``
        line or when no later ``---`` line closes the front matter.
        """
        lines = markdown.split('\n')
        self.assertEqual(lines[0], '---')
        try:
            # Start at index 1 so the opening delimiter is not matched again.
            closing = lines.index('---', 1)
        except ValueError:
            self.fail("front matter is never closed by a '---' line")
        return '\n'.join(lines[closing + 1:])

    def test_title_tag_excluded_from_markdown(self):
        """Test that title tags from head are not included in markdown output."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <title>This Should Not Appear In Markdown</title>
        </head>
        <body>
            <h1>Main Heading</h1>
            <p>This is the body content that should appear.</p>
        </body>
        </html>
        """

        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that the title from head tag is NOT in the markdown
        self.assertNotIn("This Should Not Appear In Markdown", markdown_with_frontmatter)

        # Check that body content IS in the markdown
        self.assertIn("Main Heading", markdown_with_frontmatter)
        self.assertIn("This is the body content that should appear", markdown_with_frontmatter)

        # Check that frontmatter is present
        self.assertTrue(markdown_with_frontmatter.startswith("---"))

    def test_image_with_data_description(self):
        """Test that images are converted with placeholder alt text."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>Text before image</p>
            <div class="image" data-description="A description of the image">Placeholder</div>
            <p>Text after image</p>
        </body>
        </html>
        """

        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that images use the fixed placeholder alt text
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

        # Check that other content is preserved
        self.assertIn("Text before image", markdown_with_frontmatter)
        self.assertIn("Text after image", markdown_with_frontmatter)

    def test_image_without_data_description(self):
        """Test that images without data-description use default alt text."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <div class="image">Some placeholder content</div>
        </body>
        </html>
        """

        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that default alt text is used
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

    def test_headers_footers_excluded(self):
        """Test that header and footer tags are excluded from markdown."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <header>
                <nav>Navigation menu</nav>
            </header>
            <h1>Main Content</h1>
            <p>This should appear in the markdown.</p>
            <footer>
                <p>Footer text</p>
            </footer>
        </body>
        </html>
        """

        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that header/footer content is excluded
        self.assertNotIn("Navigation menu", markdown_with_frontmatter)
        self.assertNotIn("Footer text", markdown_with_frontmatter)

        # Check that main content is included
        self.assertIn("Main Content", markdown_with_frontmatter)
        self.assertIn("This should appear in the markdown", markdown_with_frontmatter)

    def test_no_body_tag_fallback(self):
        """Test that content is still processed when there's no body tag."""
        html_content = """
        <div>
            <h1>Content without body tag</h1>
            <p>This should still be converted.</p>
        </div>
        """

        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that content is still converted
        self.assertIn("Content without body tag", markdown_with_frontmatter)
        self.assertIn("This should still be converted", markdown_with_frontmatter)

    def test_removes_triple_dashes_from_content(self):
        """Test that --- at the start or end of markdown content is removed."""
        # Test with --- at the beginning
        html_content_start = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>---</p>
            <p>Regular content here</p>
        </body>
        </html>
        """

        markdown_start = html_to_markdown_with_frontmatter(html_content_start)

        # The helper also checks that the output opens with FrontMatter
        content_start = self._content_after_frontmatter(markdown_start)

        # Check that the content doesn't start with --- after the FrontMatter ends
        self.assertFalse(content_start.strip().startswith('---'))

        # Test with --- at the end
        html_content_end = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>Regular content here</p>
            <p>---</p>
        </body>
        </html>
        """

        markdown_end = html_to_markdown_with_frontmatter(html_content_end)
        content_end = self._content_after_frontmatter(markdown_end)

        # Check the content itself: testing the whole output for a '---\n---'
        # suffix would still pass when a trailing --- follows ordinary text.
        self.assertFalse(content_end.strip().endswith('---'))
        self.assertIn("Regular content here", content_end)

        # Test with --- at both beginning and end
        html_content_both = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>---</p>
            <p>Middle content</p>
            <p>---</p>
        </body>
        </html>
        """

        markdown_both = html_to_markdown_with_frontmatter(html_content_both)
        content_both = self._content_after_frontmatter(markdown_both)

        # Content should not start or end with ---
        self.assertFalse(content_both.strip().startswith('---'))
        self.assertFalse(content_both.strip().endswith('---'))

        # But should contain "Middle content"
        self.assertIn("Middle content", content_both)


if __name__ == '__main__':
    unittest.main()