from bs4 import BeautifulSoup from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page from unstructured.partition.html.html_utils import indent_html from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology def _wrap_with_body(html: str) -> str: return f'{html}' def remove_all_ids(html_str): soup = BeautifulSoup(html_str, "html.parser") for tag in soup.find_all(True): if tag.has_attr("id"): del tag["id"] return str(soup) def test_wrong_html_parser_causes_paragraph_to_be_nested_in_div(): # This test would fail if html5lib parser would be applied on the input HTML. # It would result in Page:

# instead of Page:

# language=HTML input_html = """

Mountain View, California

""" page = parse_html_to_ontology(input_html) assert len(page.children) == 1 narrative_text = page.children[0] assert len(narrative_text.children) == 1 address = narrative_text.children[0] assert address.text == "Mountain View, California" def test_when_class_is_missing_it_can_be_inferred_from_type(): # language=HTML base_html = _wrap_with_body( """
""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_when_class_is_wrong_tag_name_is_overwritten(): # language=HTML base_html = _wrap_with_body( """
""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_when_tag_not_supported_by_ontology_and_wrong_then_consider_them_text(): # language=HTML base_html = _wrap_with_body( """
Some text
""" ) base_html = indent_html(base_html) # TODO (Pluto): Maybe it should be considered as plain text? # language=HTML expected_html = _wrap_with_body( """
Some text
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_div_are_ignored_when_no_attrs(): # language=HTML base_html = _wrap_with_body( """
""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_ids_are_preserved(): # language=HTML base_html = _wrap_with_body( """
""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) page = ontology.children[0] div_obj = page.children[0] assert div_obj.additional_attributes["id"] == "important_div" def test_br_is_not_considered_uncategorized_text(): # language=HTML base_html = _wrap_with_body( """

""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """

""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_text_without_tag_is_marked_as_uncategorized_text_when_there_are_other_elements(): # language=HTML base_html = _wrap_with_body( """
About the same Some text
""" ) # language=HTML expected_html = _wrap_with_body( """

About the same

Some text

""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_keyword_only_attributes_are_preserved_during_mapping(): # language=HTML base_html = _wrap_with_body( """ """ ) # noqa: E501 base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """ """ ) # noqa: E501 expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mapping(): # can be assigned to multiple classes so it is not clear what it is # thus we assign it to UncategorizedText # language=HTML base_html = _wrap_with_body( """
""" ) base_html = indent_html(base_html) # TODO(Pluto): Maybe tag also should be overwritten? Or just leave it as it is? # We classify as UncategorizedText but all the text is preserved # for UnstructuredElement so it make sense now as well # language=HTML expected_html = _wrap_with_body( """
""" ) # noqa: E501 expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_broken_cell_is_not_raising_error(): # language=HTML base_html = _wrap_with_body( """
83.64 GiB Fair Value
""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """
83.64 GiB Fair Value
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_table(): # language=HTML base_html = _wrap_with_body( """
Fair Value1 Fair Value2
""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """
Fair Value1 Fair Value2
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_table_and_time(): # language=HTML base_html = _wrap_with_body( """
Carrying Value
$—
""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """
Carrying Value
$—
""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_malformed_html(): # language=HTML input_html = """ Super Malformed HTML