diff --git a/CHANGELOG.md b/CHANGELOG.md index 46171d28c..00ef7c6aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.5-dev0 +## 0.16.5-dev1 ### Enhancements diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index 4b46ca12e..c69f49f2c 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -201,6 +201,10 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p for i in range(len(expected_json_elements)): assert expected_json_elements[i] == predicted_elements[i] + assert ( + expected_json_elements[i].metadata.text_as_html + == predicted_elements[i].metadata.text_as_html + ) def test_inline_elements_are_squeezed(): diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index f42506c51..a15e5cb50 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -607,6 +607,21 @@ def test_text_in_form_field_value(): assert form_field_value.to_text() == "Random Input Value" +def test_text_in_form_field_value_with_null_value(): + # language=HTML + input_html = """ +
+ +
+ """ + page = parse_html_to_ontology(input_html) + + assert len(page.children) == 1 + form_field_value = page.children[0] + assert form_field_value.text == "" + assert form_field_value.to_text() == "" + + def test_to_text_when_form_field(): ontology = Page( children=[ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d282588e4..7705907c0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.5-dev0" # pragma: no cover +__version__ = "0.16.5-dev1" # pragma: no cover diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index aca80599e..75ac93f29 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -93,7 +93,7 @@ class OntologyElement(BaseModel): if self.children and add_children: children_text = " ".join(child.to_text().strip() for child in self.children) return children_text - return BeautifulSoup(self.to_html()).get_text().strip() + return BeautifulSoup(self.to_html(), "html.parser").get_text().strip() def _construct_attribute_string(self, attributes: dict) -> str: return " ".join( @@ -466,8 +466,11 @@ class FormFieldValue(OntologyElement): allowed_tags: List[str] = Field(["input"], frozen=True) def to_text(self, add_children=True) -> str: - text = super().to_text() + self.additional_attributes.get("value", "") - return text.strip() + text = super().to_text() + value = self.additional_attributes.get("value", "") + if not value: + return text + return f"{text} {value}".strip() class Checkbox(OntologyElement):