"""Unit tests for the HTML helpers in ``olmocr.bench.synth.mine_html_templates``.

``TestExtractHtmlMetadata`` checks the metadata dictionary returned by
``extract_html_metadata`` and ``TestHtmlToMarkdown`` checks the text returned
by ``html_to_markdown_with_frontmatter``.

NOTE(review): the HTML fixtures below carry the markup that each assertion
relies on (``<html lang=...>``, ``<table>``, ``<header>``/``<footer>``,
``<title>`` and image placeholders).  Image placeholders are written as
``<div class="image">`` -- presumably the element the helpers look for;
confirm against ``mine_html_templates`` before relying on these tests.
"""

import unittest

from olmocr.bench.synth.mine_html_templates import (
    extract_html_metadata,
    html_to_markdown_with_frontmatter,
)


def _content_after_frontmatter(markdown):
    """Return the text that follows the closing ``---`` line of the FrontMatter.

    The search starts at index 1 so that the opening ``---`` on the first line
    is skipped.  Raises ``ValueError`` when no closing line exists, which
    surfaces as a test error.
    """
    lines = markdown.split('\n')
    closing = lines.index('---', 1)
    return '\n'.join(lines[closing + 1:])


class TestExtractHtmlMetadata(unittest.TestCase):
    """Tests for the metadata dictionary produced by ``extract_html_metadata``."""

    def test_extract_metadata_portuguese_document(self):
        """Test metadata extraction from a Portuguese document with mixed content."""
        html_content = """
        <!DOCTYPE html>
        <html lang="pt">
        <head>
            <meta charset="UTF-8">
            <title>Test Document</title>
        </head>
        <body>
            <header>Header content here</header>
            <h1>Política de Metadados</h1>
            <p>Este é um documento de teste com texto em português.</p>
            <p>Contém múltiplos parágrafos para simular conteúdo real.</p>
            <div class="image">Image placeholder 1</div>
            <p>Mais texto após a imagem.</p>
        </body>
        </html>
        """
        metadata = extract_html_metadata(html_content)

        # Check language extraction
        self.assertEqual(metadata['primary_language'], 'pt')

        # Check rotation values (always fixed)
        self.assertTrue(metadata['is_rotation_valid'])
        self.assertEqual(metadata['rotation_correction'], 0)

        # Check table/diagram detection
        # With 1 image (500 chars) and small text content, image ratio > 50%
        self.assertFalse(metadata['is_table'])
        self.assertTrue(metadata['is_diagram'])  # Image estimate dominates

    def test_extract_metadata_table_heavy_document(self):
        """Test metadata extraction from a document that is mostly tables."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>Small intro text</p>
            <table>
                <tr><td>Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr>
                <tr><td>Data A</td><td>Data B</td><td>Data C</td></tr>
                <tr><td>More data</td><td>More data</td><td>More data</td></tr>
                <tr><td>Even more data</td><td>Even more data</td><td>Even more data</td></tr>
                <tr><td>Lots of data</td><td>Lots of data</td><td>Lots of data</td></tr>
                <tr><td>Table content</td><td>Table content</td><td>Table content</td></tr>
                <tr><td>Final row</td><td>Final row</td><td>Final row</td></tr>
            </table>
        </body>
        </html>
        """
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'en')
        self.assertTrue(metadata['is_table'])  # Should be True as >50% is table
        self.assertFalse(metadata['is_diagram'])

    def test_extract_metadata_image_heavy_document(self):
        """Test metadata extraction from a document that is mostly images."""
        html_content = """
        <!DOCTYPE html>
        <html lang="es">
        <body>
            <p>Brief text</p>
            <div class="image">Image 1</div>
            <div class="image">Image 2</div>
            <div class="image">Image 3</div>
            <div class="image">Image 4</div>
            <div class="image">Image 5</div>
        </body>
        </html>
        """
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'es')
        self.assertFalse(metadata['is_table'])
        self.assertTrue(metadata['is_diagram'])  # Should be True as >50% is images

    def test_extract_metadata_language_with_region(self):
        """Test that language codes with regions (e.g., pt-BR) are shortened."""
        html_content = """
        <!DOCTYPE html>
        <html lang="pt-BR">
        <body>
            <p>Texto em português brasileiro</p>
        </body>
        </html>
        """
        metadata = extract_html_metadata(html_content)

        # Should convert pt-BR to pt
        self.assertEqual(metadata['primary_language'], 'pt')

    def test_extract_metadata_no_html_tag(self):
        """Test extraction when there's no html tag (defaults to 'en')."""
        # Deliberately no <html> element, so there is no lang attribute to read.
        html_content = """
        <div>
            <p>Content without html tag</p>
        </div>
        """
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'en')  # Should default to 'en'

    def test_extract_metadata_mixed_content(self):
        """Test a document with mixed content types."""
        html_content = """
        <!DOCTYPE html>
        <html lang="pt">
        <head>
            <meta charset="UTF-8">
            <title>Política de Metadados para Livros e Capítulos de Livro UFPA</title>
        </head>
        <body>
            <header>
                <div>Biblioteca Central UFPA</div>
                <div>LIVRO ABERTO portal do livro aberto da UFPA</div>
                <div>SIBI/UFPA</div>
            </header>
            <h1>Política de Metadados para Livros e Capítulos de Livro UFPA</h1>
            <p>Essa política de metadados possui o objetivo de garantir a consistência do trabalho executado no Portal do Livro Aberto. Dessa forma, foi desenvolvido com base no esquema de metadados do Dublin Core com adaptações para a realidade brasileira e local.</p>
            <table>
                <tr><th>METADADOS</th><th>VALOR</th><th>REPETITIVO</th><th>CONDIÇÃO</th></tr>
                <tr><td>dc.type</td><td>Tipo de documento</td><td>Não</td><td>Obrigatório</td></tr>
                <tr><td>dc.title</td><td>Título e subtítulo (se houver)</td><td>Não</td><td>Obrigatório</td></tr>
                <tr><td>dc.title.alternative</td><td>Título alternativo</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.creator</td><td>Autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.creator.Lattes</td><td>URL do currículo Lattes do autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.creator.ORCID</td><td>ORCID do autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.description.affiliation</td><td>Afiliação do autor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.organizer</td><td>Organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.organizerLattes</td><td>URL do currículo Lattes do organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.organizerORCID</td><td>ORCID do organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.description.affiliationOrganizer</td><td>Afiliação do organizador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.coordinator</td><td>Coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.coordinatorLattes</td><td>URL do currículo Lattes do coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.coordinatorORCID</td><td>ORCID do coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.affiliationCoordinator</td><td>Afiliação do coordenador</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.editor</td><td>Editor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.editorLattes</td><td>URL do currículo Lattes do editor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.contributor.editorORCID</td><td>ORCID do editor</td><td>Sim</td><td>Opcional</td></tr>
                <tr><td>dc.description.affiliationEditor</td><td>Afiliação do editor</td><td>Sim</td><td>Opcional</td></tr>
            </table>
        </body>
        </html>
        """
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'pt')
        self.assertTrue(metadata['is_table'])
        self.assertFalse(metadata['is_diagram'])

    def test_extract_metadata_empty_body(self):
        """Test extraction with empty or minimal content."""
        html_content = """
        <!DOCTYPE html>
        <html lang="de">
        <body>
        </body>
        </html>
        """
        metadata = extract_html_metadata(html_content)

        self.assertEqual(metadata['primary_language'], 'de')
        self.assertFalse(metadata['is_table'])
        self.assertFalse(metadata['is_diagram'])
        self.assertTrue(metadata['is_rotation_valid'])
        self.assertEqual(metadata['rotation_correction'], 0)


class TestHtmlToMarkdown(unittest.TestCase):
    """Tests for the markdown text produced by ``html_to_markdown_with_frontmatter``."""

    def test_title_tag_excluded_from_markdown(self):
        """Test that title tags from head are not included in markdown output."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <title>This Should Not Appear In Markdown</title>
        </head>
        <body>
            <h1>Main Heading</h1>
            <p>This is the body content that should appear.</p>
        </body>
        </html>
        """
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that the title from head tag is NOT in the markdown
        self.assertNotIn("This Should Not Appear In Markdown", markdown_with_frontmatter)

        # Check that body content IS in the markdown
        self.assertIn("Main Heading", markdown_with_frontmatter)
        self.assertIn("This is the body content that should appear", markdown_with_frontmatter)

        # Check that frontmatter is present
        self.assertTrue(markdown_with_frontmatter.startswith("---"))

    def test_image_with_data_description(self):
        """Test that images are converted with placeholder alt text."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>Text before image</p>
            <div class="image" data-description="A chart showing quarterly results">Placeholder</div>
            <p>Text after image</p>
        </body>
        </html>
        """
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that images use the fixed placeholder alt text
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

        # Check that other content is preserved
        self.assertIn("Text before image", markdown_with_frontmatter)
        self.assertIn("Text after image", markdown_with_frontmatter)

    def test_image_without_data_description(self):
        """Test that images without data-description use default alt text."""
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <div class="image">Some placeholder content</div>
        </body>
        </html>
        """
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that default alt text is used
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

    def test_headers_footers_excluded(self):
        """Test that header and footer tags are excluded from markdown."""
        # The header/footer text must be present in the input; otherwise the
        # assertNotIn checks below would pass vacuously.
        html_content = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <header>
                <nav>Navigation menu</nav>
            </header>
            <main>
                <h1>Main Content</h1>
                <p>This should appear in the markdown.</p>
            </main>
            <footer>
                <p>Footer text</p>
            </footer>
        </body>
        </html>
        """
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that header/footer content is excluded
        self.assertNotIn("Navigation menu", markdown_with_frontmatter)
        self.assertNotIn("Footer text", markdown_with_frontmatter)

        # Check that main content is included
        self.assertIn("Main Content", markdown_with_frontmatter)
        self.assertIn("This should appear in the markdown", markdown_with_frontmatter)

    def test_no_body_tag_fallback(self):
        """Test that content is still processed when there's no body tag."""
        # Deliberately no <body> element.
        html_content = """
        <div>
            <h1>Content without body tag</h1>
            <p>This should still be converted.</p>
        </div>
        """
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Check that content is still converted
        self.assertIn("Content without body tag", markdown_with_frontmatter)
        self.assertIn("This should still be converted", markdown_with_frontmatter)

    def test_removes_triple_dashes_from_content(self):
        """Test that --- at the start or end of markdown content is removed."""
        # Test with --- at the beginning
        html_content_start = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>---</p>
            <p>Regular content here</p>
        </body>
        </html>
        """
        markdown_start = html_to_markdown_with_frontmatter(html_content_start)

        # Check that we have FrontMatter
        self.assertEqual(markdown_start.split('\n')[0], '---')

        # Check that the content doesn't start with --- after the FrontMatter ends
        content_after_frontmatter = _content_after_frontmatter(markdown_start)
        self.assertFalse(content_after_frontmatter.strip().startswith('---'))

        # Test with --- at the end
        html_content_end = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>Regular content here</p>
            <p>---</p>
        </body>
        </html>
        """
        markdown_end = html_to_markdown_with_frontmatter(html_content_end)

        # Check that content doesn't end with ---
        self.assertFalse(markdown_end.rstrip().endswith('---\n---'))
        # The check above only fires when the body is nothing but dashes, so
        # also inspect the body on its own, as the "both" case below does.
        self.assertFalse(_content_after_frontmatter(markdown_end).strip().endswith('---'))

        # Test with --- at both beginning and end
        html_content_both = """
        <!DOCTYPE html>
        <html lang="en">
        <body>
            <p>---</p>
            <p>Middle content</p>
            <p>---</p>
        </body>
        </html>
        """
        markdown_both = html_to_markdown_with_frontmatter(html_content_both)
        content_both = _content_after_frontmatter(markdown_both)

        # Content should not start or end with ---
        self.assertFalse(content_both.strip().startswith('---'))
        self.assertFalse(content_both.strip().endswith('---'))

        # But should contain "Middle content"
        self.assertIn("Middle content", content_both)


if __name__ == '__main__':
    unittest.main()