diff --git a/CHANGELOG.md b/CHANGELOG.md
index b9e042eff..3f956444d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.3-dev1
+## 0.16.3-dev2
### Enhancements
@@ -7,6 +7,7 @@
### Fixes
* **V2 elements without first parent ID can be parsed**
+* **Fix missing elements when a layout element is parsed in V2 ontology**
## 0.16.2
diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py
index 9cbdf6805..102dd4c6e 100644
--- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py
+++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py
@@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type():
expected_html = _wrap_with_body(
"""
@@ -554,6 +556,7 @@ def test_malformed_html():
Paragraph with invalid characters: � � �
+
"""
@@ -563,3 +566,31 @@ def test_malformed_html():
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
assert parsed_ontology == expected_html
+
+
+def test_text_is_wrapped_inside_layout_element():
+ """Check that an HTML fixture parsed to an ontology renders back to the expected HTML."""
+ # language=HTML
+ base_html = _wrap_with_body(
+ """
+
+ Text
+
+ """
+ )
+ base_html = indent_html(base_html)
+
+ # language=HTML
+ expected_html = _wrap_with_body(
+ """
+
+ """
+ )
+
+ expected_html = indent_html(expected_html)
+
+ # The rendered HTML is passed through remove_all_ids and indent_html before the comparison.
+ ontology: OntologyElement = parse_html_to_ontology(base_html)
+ parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
+
+ assert parsed_ontology == expected_html
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 3eb8a850e..16fec7848 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.3-dev1" # pragma: no cover
+__version__ = "0.16.3-dev2" # pragma: no cover
diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
index 421585a2d..f2b897e51 100644
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@@ -79,15 +79,17 @@ def ontology_to_unstructured_elements(
),
)
]
-
+ child_elements = []  # unstructured elements produced from this element's children
for child in ontology_element.children:
- elements_to_return += ontology_to_unstructured_elements(
+ child_elements += ontology_to_unstructured_elements(
child,
parent_id=ontology_element.id,
page_number=page_number,
depth=0 if isinstance(ontology_element, Document) else depth + 1,
filename=filename,
)
+
+ elements_to_return += child_elements
else:
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[
ontology_element.__class__.__name__
@@ -98,7 +100,6 @@ def ontology_to_unstructured_elements(
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
)
# TODO value attribute from form input should be added to the text
-
unstructured_element = element_class(
text=element_text,
element_id=ontology_element.id,
@@ -255,8 +256,10 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
additional_attributes=escaped_attrs,
)
- has_children = (ontology_class != UncategorizedText) and any(
- isinstance(content, Tag) for content in soup.contents
+ # Layout-type classes are always treated as having children, even without nested tags.
+ has_children = (
+ (ontology_class != UncategorizedText)
+ and any(isinstance(content, Tag) for content in soup.contents)
+ ) or ontology_class().elementType == ElementTypeEnum.layout
if has_children: