import unittest
from olmocr.bench.synth.mine_html_templates import extract_html_metadata, html_to_markdown_with_frontmatter
class TestExtractHtmlMetadata(unittest.TestCase):
    """Checks on the dictionary returned by ``extract_html_metadata``.

    Each case feeds one fixture string to the function and inspects the
    ``primary_language``, ``is_table``, ``is_diagram``, ``is_rotation_valid``
    and ``rotation_correction`` entries of the result.
    """

    # NOTE(review): the fixture strings below contain no visible HTML markup,
    # yet several cases expect a non-default language while
    # ``test_extract_metadata_no_html_tag`` expects the 'en' default when the
    # html tag is absent. The tags may have been lost somewhere upstream --
    # confirm the fixtures against the canonical copy of this file.

    def test_extract_metadata_portuguese_document(self):
        """Portuguese page with one image and a little text."""
        page = """
Test DocumentHeader content here
Política de Metadados
Este é um documento de teste com texto em português.
Contém múltiplos parágrafos para simular conteúdo real.
Image placeholder 1
Mais texto após a imagem.
"""
        info = extract_html_metadata(page)

        # Language comes first.
        self.assertEqual(info["primary_language"], "pt")

        # Rotation fields are expected to be constant.
        self.assertTrue(info["is_rotation_valid"])
        self.assertEqual(info["rotation_correction"], 0)

        # One image (estimated at 500 chars) outweighs the short text, so the
        # image share is above 50% and the page counts as a diagram.
        self.assertFalse(info["is_table"])
        self.assertTrue(info["is_diagram"])

    def test_extract_metadata_table_heavy_document(self):
        """Page whose content is dominated by table cells."""
        page = """
Small intro text
Cell 1
Cell 2
Cell 3
Data A
Data B
Data C
More data
More data
More data
Even more data
Even more data
Even more data
Lots of data
Lots of data
Lots of data
Table content
Table content
Table content
Final row
Final row
Final row
"""
        info = extract_html_metadata(page)

        self.assertEqual(info["primary_language"], "en")
        # More than half of the content is tabular.
        self.assertTrue(info["is_table"])
        self.assertFalse(info["is_diagram"])

    def test_extract_metadata_image_heavy_document(self):
        """Page whose content is dominated by images."""
        page = """
Brief text
Image 1
Image 2
Image 3
Image 4
Image 5
"""
        info = extract_html_metadata(page)

        self.assertEqual(info["primary_language"], "es")
        self.assertFalse(info["is_table"])
        # More than half of the content is images.
        self.assertTrue(info["is_diagram"])

    def test_extract_metadata_language_with_region(self):
        """A regional language code such as pt-BR is reduced to its base code."""
        page = """
Texto em português brasileiro
"""
        info = extract_html_metadata(page)

        # pt-BR is expected to come back as plain pt.
        self.assertEqual(info["primary_language"], "pt")

    def test_extract_metadata_no_html_tag(self):
        """Without an html tag the language falls back to 'en'."""
        page = """
Content without html tag
"""
        info = extract_html_metadata(page)

        # Fallback value when no language can be read.
        self.assertEqual(info["primary_language"], "en")

    def test_extract_metadata_mixed_content(self):
        """Realistic Portuguese page: headings, a paragraph and a long table."""
        page = """
Política de Metadados para Livros e Capítulos de Livro UFPA
Biblioteca Central UFPA
LIVRO ABERTO portal do livro aberto da UFPA
SIBI/UFPA
Política de Metadados para Livros e Capítulos de Livro UFPA
Essa política de metadados possui o objetivo de garantir a consistência do trabalho executado no Portal do Livro Aberto. Dessa forma, foi desenvolvido com base no esquema de metadados do Dublin Core com adaptações para a realidade brasileira e local.
METADADOS
VALOR
REPETITIVO
CONDIÇÃO
dc.type
Tipo de documento
Não
Obrigatório
dc.title
Título e subtítulo (se houver)
Não
Obrigatório
dc.title.alternative
Título alternativo
Sim
Opcional
dc.creator
Autor
Sim
Opcional
dc.creator.Lattes
URL do currículo Lattes do autor
Sim
Opcional
dc.creator.ORCID
ORCID do autor
Sim
Opcional
dc.description.affiliation
Afiliação do autor
Sim
Opcional
dc.contributor.organizer
Organizador
Sim
Opcional
dc.contributor.organizerLattes
URL do currículo Lattes do organizador
Sim
Opcional
dc.contributor.organizerORCID
ORCID do organizador
Sim
Opcional
dc.description.affiliationOrganizer
Afiliação do organizador
Sim
Opcional
dc.contributor.coordinator
Coordenador
Sim
Opcional
dc.contributor.coordinatorLattes
URL do currículo Lattes do coordenador
Sim
Opcional
dc.contributor.coordinatorORCID
ORCID do coordenador
Sim
Opcional
dc.contributor.affiliationCoordinator
Afiliação do coordenador
Sim
Opcional
dc.contributor.editor
Editor
Sim
Opcional
dc.contributor.editorLattes
URL do currículo Lattes do editor
Sim
Opcional
dc.contributor.editorORCID
ORCID do editor
Sim
Opcional
dc.description.affiliationEditor
Afiliação do editor
Sim
Opcional
"""
        info = extract_html_metadata(page)

        self.assertEqual(info["primary_language"], "pt")
        self.assertTrue(info["is_table"])
        self.assertFalse(info["is_diagram"])

    def test_extract_metadata_empty_body(self):
        """Page with essentially no content: both content flags stay off."""
        page = """
"""
        info = extract_html_metadata(page)

        self.assertEqual(info["primary_language"], "de")
        self.assertFalse(info["is_table"])
        self.assertFalse(info["is_diagram"])
        self.assertTrue(info["is_rotation_valid"])
        self.assertEqual(info["rotation_correction"], 0)
class TestHtmlToMarkdown(unittest.TestCase):
    """Checks on the text returned by ``html_to_markdown_with_frontmatter``.

    The cases look at which parts of the input survive into the output, at the
    alt text emitted for images, and at how stray ``---`` lines in the content
    interact with the front-matter delimiters.
    """

    # NOTE(review): the fixture strings below contain no visible HTML markup
    # (no title, img, header, footer or body tags). The tags may have been
    # lost somewhere upstream -- confirm the fixtures against the canonical
    # copy of this file.

    def _body_after_frontmatter(self, document):
        """Return everything in *document* after the closing ``---`` line.

        The first line is treated as the opening front-matter delimiter, so
        the search for the closing delimiter starts at the second line. If no
        closing delimiter exists the test fails with an explicit message
        rather than an unexplained ``StopIteration``.
        """
        lines = document.split("\n")
        for position in range(1, len(lines)):
            if lines[position] == "---":
                return "\n".join(lines[position + 1:])
        self.fail("front matter is never closed by a '---' line")

    def test_title_tag_excluded_from_markdown(self):
        """Test that title tags from head are not included in markdown output."""
        html_content = """
This Should Not Appear In Markdown
Main Heading
This is the body content that should appear.
"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # The title text must be absent from the output.
        self.assertNotIn("This Should Not Appear In Markdown", markdown_with_frontmatter)

        # The body text must be present.
        self.assertIn("Main Heading", markdown_with_frontmatter)
        self.assertIn("This is the body content that should appear", markdown_with_frontmatter)

        # The output must open with a front-matter delimiter.
        self.assertTrue(markdown_with_frontmatter.startswith("---"))

    def test_image_with_data_description(self):
        """Test that images are converted with placeholder alt text."""
        html_content = """
Text before image
Placeholder
Text after image
"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Images are expected to carry the fixed placeholder alt text.
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

        # Surrounding text must survive the conversion.
        self.assertIn("Text before image", markdown_with_frontmatter)
        self.assertIn("Text after image", markdown_with_frontmatter)

    def test_image_without_data_description(self):
        """Test that images without data-description use default alt text."""
        html_content = """
Some placeholder content
"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # The default alt text is the same fixed placeholder.
        self.assertIn("![Image Placeholder]", markdown_with_frontmatter)

    def test_headers_footers_excluded(self):
        """Test that header and footer tags are excluded from markdown."""
        # NOTE(review): as visible here the fixture never contains
        # "Navigation menu" or "Footer text", so the two assertNotIn checks
        # below cannot fail -- confirm the fixture still has its header and
        # footer elements.
        html_content = """
Main Content
This should appear in the markdown.
"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # Header/footer text must be absent.
        self.assertNotIn("Navigation menu", markdown_with_frontmatter)
        self.assertNotIn("Footer text", markdown_with_frontmatter)

        # Main content must be present.
        self.assertIn("Main Content", markdown_with_frontmatter)
        self.assertIn("This should appear in the markdown", markdown_with_frontmatter)

    def test_no_body_tag_fallback(self):
        """Test that content is still processed when there's no body tag."""
        html_content = """
Content without body tag
This should still be converted.
"""
        markdown_with_frontmatter = html_to_markdown_with_frontmatter(html_content)

        # The text must still reach the output.
        self.assertIn("Content without body tag", markdown_with_frontmatter)
        self.assertIn("This should still be converted", markdown_with_frontmatter)

    def test_removes_triple_dashes_from_content(self):
        """Test that --- at the start or end of markdown content is removed."""
        # Case 1: --- at the beginning of the content.
        html_content_start = """
---
Regular content here
"""
        markdown_start = html_to_markdown_with_frontmatter(html_content_start)

        # The document itself must open with the front-matter delimiter.
        self.assertEqual(markdown_start.split("\n")[0], "---")

        # The body that follows the front matter must not begin with ---.
        body_start = self._body_after_frontmatter(markdown_start)
        self.assertFalse(body_start.strip().startswith("---"))

        # Case 2: --- at the end of the content.
        html_content_end = """
Regular content here
---
"""
        markdown_end = html_to_markdown_with_frontmatter(html_content_end)

        # Original check, kept for compatibility: no doubled delimiter at the
        # very end of the document.
        self.assertFalse(markdown_end.rstrip().endswith("---\n---"))

        # The doubled-delimiter check alone would miss a body such as
        # "Regular content here\n---", so inspect the body directly, the same
        # way the combined case below does.
        body_end = self._body_after_frontmatter(markdown_end)
        self.assertFalse(body_end.strip().endswith("---"))

        # Case 3: --- at both the beginning and the end of the content.
        html_content_both = """
---
Middle content
---
"""
        markdown_both = html_to_markdown_with_frontmatter(html_content_both)
        body_both = self._body_after_frontmatter(markdown_both)

        # Neither edge of the body may be a --- line ...
        self.assertFalse(body_both.strip().startswith("---"))
        self.assertFalse(body_both.strip().endswith("---"))

        # ... while the real content is kept.
        self.assertIn("Middle content", body_both)
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()