diff --git a/CHANGELOG.md b/CHANGELOG.md index b9e042eff..3f956444d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.3-dev1 +## 0.16.3-dev2 ### Enhancements @@ -7,6 +7,7 @@ ### Fixes * **V2 elements without first parent ID can be parsed** +* **Fix missing elements when layout element parsed in V2 ontology** ## 0.16.2 diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index 9cbdf6805..102dd4c6e 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type(): expected_html = _wrap_with_body( """
- +
""" ) @@ -87,7 +87,7 @@ def test_when_class_is_wrong_tag_name_is_overwritten(): expected_html = _wrap_with_body( """
- +
""" ) @@ -535,6 +535,8 @@ def test_malformed_html(): # language=HTML expected_html = """ + +

Unclosed comment

@@ -554,6 +556,7 @@ def test_malformed_html():

Paragraph with invalid characters: � � �

+

""" @@ -563,3 +566,31 @@ def test_malformed_html(): parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html + + +def test_text_is_wrapped_inside_layout_element(): + # language=HTML + base_html = _wrap_with_body( + """ +
+ Text +
+ """ + ) + base_html = indent_html(base_html) + + # language=HTML + expected_html = _wrap_with_body( + """ +
+

Text

+
+ """ + ) + + expected_html = indent_html(expected_html) + + ontology: OntologyElement = parse_html_to_ontology(base_html) + parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) + + assert parsed_ontology == expected_html diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3eb8a850e..16fec7848 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.3-dev1" # pragma: no cover +__version__ = "0.16.3-dev2" # pragma: no cover diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 421585a2d..f2b897e51 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -79,15 +79,17 @@ def ontology_to_unstructured_elements( ), ) ] - + childreen = [] for child in ontology_element.children: - elements_to_return += ontology_to_unstructured_elements( + childreen += ontology_to_unstructured_elements( child, parent_id=ontology_element.id, page_number=page_number, depth=0 if isinstance(ontology_element, Document) else depth + 1, filename=filename, ) + + elements_to_return += childreen else: unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[ ontology_element.__class__.__name__ @@ -98,7 +100,6 @@ def ontology_to_unstructured_elements( BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip() ) # TODO value attribute from form input should be added to the text - unstructured_element = element_class( text=element_text, element_id=ontology_element.id, @@ -255,8 +256,10 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: additional_attributes=escaped_attrs, ) - has_children = (ontology_class != UncategorizedText) and any( - isinstance(content, Tag) for content in soup.contents + has_children = ( + (ontology_class != UncategorizedText) + and any(isinstance(content, Tag) for content in soup.contents) + or ontology_class().elementType == ElementTypeEnum.layout ) if has_children: