diff --git a/CHANGELOG.md b/CHANGELOG.md
index 46171d28c..00ef7c6aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.5-dev0
+## 0.16.5-dev1
### Enhancements
diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
index 4b46ca12e..c69f49f2c 100644
--- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
+++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
@@ -201,6 +201,10 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p
for i in range(len(expected_json_elements)):
assert expected_json_elements[i] == predicted_elements[i]
+ assert (
+ expected_json_elements[i].metadata.text_as_html
+ == predicted_elements[i].metadata.text_as_html
+ )
def test_inline_elements_are_squeezed():
diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py
index f42506c51..a15e5cb50 100644
--- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py
+++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py
@@ -607,6 +607,21 @@ def test_text_in_form_field_value():
assert form_field_value.to_text() == "Random Input Value"
+def test_text_in_form_field_value_with_null_value():
+ # language=HTML
+ input_html = """
+    <div class="Page">
+        <input class="FormFieldValue" value=""/>
+    </div>
+ """
+ page = parse_html_to_ontology(input_html)
+
+ assert len(page.children) == 1
+ form_field_value = page.children[0]
+ assert form_field_value.text == ""
+ assert form_field_value.to_text() == ""
+
+
def test_to_text_when_form_field():
ontology = Page(
children=[
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index d282588e4..7705907c0 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.5-dev0" # pragma: no cover
+__version__ = "0.16.5-dev1" # pragma: no cover
diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py
index aca80599e..75ac93f29 100644
--- a/unstructured/documents/ontology.py
+++ b/unstructured/documents/ontology.py
@@ -93,7 +93,7 @@ class OntologyElement(BaseModel):
if self.children and add_children:
children_text = " ".join(child.to_text().strip() for child in self.children)
return children_text
- return BeautifulSoup(self.to_html()).get_text().strip()
+ return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
@@ -466,8 +466,11 @@ class FormFieldValue(OntologyElement):
allowed_tags: List[str] = Field(["input"], frozen=True)
def to_text(self, add_children=True) -> str:
- text = super().to_text() + self.additional_attributes.get("value", "")
- return text.strip()
+ text = super().to_text()
+ value = self.additional_attributes.get("value", "")
+ if not value:
+ return text
+ return f"{text} {value}".strip()
class Checkbox(OntologyElement):