Super Malformed HTML

from bs4 import BeautifulSoup from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page from unstructured.partition.html.html_utils import indent_html from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology def _wrap_with_body(html: str) -> str: return f'{html}' def remove_all_ids(html_str): soup = BeautifulSoup(html_str, "html.parser") for tag in soup.find_all(True): if tag.has_attr("id"): del tag["id"] return str(soup) def test_wrong_html_parser_causes_paragraph_to_be_nested_in_div(): # This test would fail if html5lib parser would be applied on the input HTML. # It would result in Page:

# instead of Page:

# language=HTML input_html = """

Mountain View, California

""" page = parse_html_to_ontology(input_html) assert len(page.children) == 1 narrative_text = page.children[0] assert len(narrative_text.children) == 1 address = narrative_text.children[0] assert address.text == "Mountain View, California" def test_when_class_is_missing_it_can_be_inferred_from_type(): # language=HTML base_html = _wrap_with_body( """

""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """

Some text

""" ) base_html = indent_html(base_html) # TODO (Pluto): Maybe it should be considered as plain text? # language=HTML expected_html = _wrap_with_body( """

Some text

""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """

""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) page = ontology.children[0] div_obj = page.children[0] assert div_obj.additional_attributes["id"] == "important_div" def test_br_is_not_considered_uncategorized_text(): # language=HTML base_html = _wrap_with_body( """

""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """

""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_text_without_tag_is_marked_as_uncategorized_text_when_there_are_other_elements(): # language=HTML base_html = _wrap_with_body( """

About the same Some text

""" ) # language=HTML expected_html = _wrap_with_body( """

About the same

Some text

""" ) expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_keyword_only_attributes_are_preserved_during_mapping(): # language=HTML base_html = _wrap_with_body( """ """ ) # noqa: E501 base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """ """ ) # noqa: E501 expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mapping(): # can be assigned to multiple classes so it is not clear what it is # thus we assign it to UncategorizedText # language=HTML base_html = _wrap_with_body( """

""" ) base_html = indent_html(base_html) # TODO(Pluto): Maybe tag also should be overwritten? Or just leave it as it is? # We classify as UncategorizedText but all the text is preserved # for UnstructuredElement so it make sense now as well # language=HTML expected_html = _wrap_with_body( """

""" ) # noqa: E501 expected_html = indent_html(expected_html) ontology: OntologyElement = parse_html_to_ontology(base_html) parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html def test_broken_cell_is_not_raising_error(): # language=HTML base_html = _wrap_with_body( """

83.64 GiB	Fair Value

""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """

83.64 GiB	Fair Value

Fair Value1	Fair Value2

""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """

Fair Value1	Fair Value2

Carrying Value
June 30, 2023					$—

""" ) base_html = indent_html(base_html) # language=HTML expected_html = _wrap_with_body( """

Carrying Value
June 30, 2023					$—