from typing import Optional, Type
import pytest
from bs4 import BeautifulSoup
from unstructured.documents.ontology import (
Checkbox,
Form,
FormFieldValue,
Image,
OntologyElement,
Page,
RadioButton,
)
from unstructured.partition.html.html_utils import indent_html
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
def _wrap_with_body(html: str) -> str:
    """Return ``html`` interpolated into the wrapper template below.

    NOTE(review): the f-string opens with a single quote on one line and
    closes on the next, which a single-quoted literal cannot do. Presumably
    the wrapper markup that surrounded ``{html}`` was lost — confirm the
    intended template and restore it.
    """
    return f'
{html}'
def remove_all_ids(html_str):
    """Return ``html_str`` re-serialised with every ``id`` attribute removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, every
    tag carrying an ``id`` attribute has it deleted, and the tree is turned
    back into a string.
    """
    document = BeautifulSoup(html_str, "html.parser")
    tags_with_id = [element for element in document.find_all(True) if element.has_attr("id")]
    for element in tags_with_id:
        del element["id"]
    return str(document)
def test_parsing_header_and_footer_into_correct_ontologyelement():
    """The parsed page must expose exactly two children: a header, then a footer.

    Each child is checked for its text and for the HTML tag name it maps to.

    NOTE(review): the fixture literal below shows only the header text and no
    markup, while the assertions also expect a footer — confirm the fixture
    is intact.
    """
    input_html = """
this is a header
"""
    parsed_page = parse_html_to_ontology(input_html)

    top_level = parsed_page.children
    assert len(top_level) == 2
    header, footer = top_level

    expectations = (
        (header, "this is a header", "header"),
        (footer, "this is a footer", "footer"),
    )
    for element, wanted_text, wanted_tag in expectations:
        assert element.text == wanted_text
        assert element.html_tag_name == wanted_tag
def test_wrong_html_parser_causes_paragraph_to_be_nested_in_div():
    # NOTE(review): this body does not read as one coherent test. `input_html`
    # is assigned but never used, a closing parenthesis follows with no visible
    # opener, and `base_html` is read before any assignment in this function.
    # Presumably lines were lost between the two fragments — restore them
    # before relying on this test.
    #
    # This test would fail if the html5lib parser were applied to the input HTML.
    # It would result in Page:
    # instead of Page:
    # language=HTML
    input_html = """
"""
    )
    base_html = indent_html(base_html)
    # TODO (Pluto): Maybe it should be considered as plain text?
    # language=HTML
    expected_html = _wrap_with_body(
        """
"""
    )
    expected_html = indent_html(expected_html)
    # Parse the input, serialise it back to HTML, strip `id` attributes and
    # re-indent so the result can be compared with the expected markup.
    ontology: OntologyElement = parse_html_to_ontology(base_html)
    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
    assert parsed_ontology == expected_html
def test_keyword_only_attributes_are_preserved_during_mapping():
    """Round-trip a wrapped snippet through the ontology and compare the HTML.

    The input is wrapped and re-indented, parsed into an ontology, serialised
    back to HTML with ``id`` attributes stripped, re-indented, and compared
    with the expected markup prepared the same way.

    NOTE(review): both fixture literals contain only a newline here, so the
    comparison does not exercise any attributes — confirm the markup is intact.
    """
    # language=HTML
    source_markup = indent_html(
        _wrap_with_body(
            """
"""
        )
    )
    # language=HTML
    wanted_markup = indent_html(
        _wrap_with_body(
            """
"""
        )
    )

    ontology: OntologyElement = parse_html_to_ontology(source_markup)
    round_tripped = remove_all_ids(ontology.to_html())

    assert indent_html(round_tripped) == wanted_markup
def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mapping():
# can be assigned to multiple classes so it is not clear what it is
# thus we assign it to UncategorizedText
# language=HTML
base_html = _wrap_with_body(
"""
"""
)
base_html = indent_html(base_html)
# TODO(Pluto): Maybe tag also should be overwritten? Or just leave it as it is?
# We classify as UncategorizedText but all the text is preserved
# for UnstructuredElement so it make sense now as well
# language=HTML
expected_html = _wrap_with_body(
"""