from bs4 import BeautifulSoup from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page from unstructured.partition.html.html_utils import indent_html from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology def _wrap_with_body(html: str) -> str: return f'
{html}' def remove_all_ids(html_str): soup = BeautifulSoup(html_str, "html.parser") for tag in soup.find_all(True): if tag.has_attr("id"): del tag["id"] return str(soup) def test_wrong_html_parser_causes_paragraph_to_be_nested_in_div(): # This test would fail if html5lib parser would be applied on the input HTML. # It would result in Page: # instead of Page: # language=HTML input_html = """Mountain View, California
About the same
Some text
83.64 GiB | Fair Value |
---|
83.64 GiB | Fair Value |
---|
Fair Value1 | Fair Value2 |
---|
Fair Value1 | Fair Value2 |
---|
Carrying Value | |||||
---|---|---|---|---|---|
$— |
Carrying Value | |||||
---|---|---|---|---|---|
$— |