diff --git a/CHANGELOG.md b/CHANGELOG.md index 214c73329..5a3deaa12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,11 @@ -## 0.16.4-dev0 +## 0.16.4-dev1 ### Enhancements +* **`value` attribute in `<input/>` element is parsed to `OntologyElement.text` in ontology** +* **`id` and `class` attributes removed from Table subtags in HTML partitioning** +* **cleaned `to_html` and newly introduced `to_text` in `OntologyElement`** + ### Features ### Fixes diff --git a/test_unstructured/documents/unstructured_json_output/example.json b/test_unstructured/documents/unstructured_json_output/example.json index 0a22083e4..f8997df5e 100644 --- a/test_unstructured/documents/unstructured_json_output/example.json +++ b/test_unstructured/documents/unstructured_json_output/example.json @@ -56,7 +56,7 @@ "parent_id": "3a6b156a81764e17be128264241f8136", "text_as_html": "
" }, - "text": "From field name", + "text": "From field name Example value", "type": "UncategorizedText" }, { @@ -78,9 +78,9 @@ "filename": "example.pdf", "page_number": 1, "parent_id": "592422373ed741b68a077e2003f8ed81", - "text_as_html": "
Description Row header
Value description 50 $ (1.32 %)
" + "text_as_html": "
DescriptionRow header
Value description50 $ (1.32 %)
" }, - "text": "Description Row header Value description 50 $ (1.32 %)", + "text": "Description Row header Value description 50 $ (1.32 %)", "type": "Table" }, { diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index 102dd4c6e..8a69722b2 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -356,12 +356,12 @@ def test_broken_cell_is_not_raising_error(): """
- - - + + - @@ -406,12 +406,12 @@ def test_table(): """
+
83.64 GiB + Fair Value
- - - + + - @@ -467,24 +467,20 @@ def test_table_and_time(): """
+
Fair Value1 + Fair Value2
- - - + + - - - + + - @@ -594,3 +590,18 @@ def test_text_is_wrapped_inside_layout_element(): parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html + + +def test_text_in_form_field_value(): + # language=HTML + input_html = """ +
+ +
+ """ + page = parse_html_to_ontology(input_html) + + assert len(page.children) == 1 + form_field_value = page.children[0] + assert form_field_value.text == "" + assert form_field_value.to_text() == "Random Input Value" diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py index 8da0e3b83..b7ef08b57 100644 --- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py +++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py @@ -314,10 +314,7 @@ def test_table(): unstructured_elements, parsed_ontology = _parse_to_unstructured_elements_and_back_to_html( html_as_str ) - expected_html = indent_html(html_as_str, html_parser="html.parser") - parsed_html = indent_html(parsed_ontology.to_html(), html_parser="html.parser") - assert expected_html == parsed_html expected_elements = _page_elements + [ Table( text="Fair Value1 Fair Value2", @@ -325,13 +322,13 @@ def test_table(): element_id="2", metadata=ElementMetadata( text_as_html='
+
Carrying Value
-
June 30, 2023 - - + $— -
' - ' ' - ' ' - ' " + " " + "" - '
' - "Fair Value1 " + "
" + "Fair Value1" "' - "Fair Value2 " + '' + "Fair Value2" "
", parent_id="1", ), diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1b0b57347..88e3a33cd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.4-dev0" # pragma: no cover +__version__ = "0.16.4-dev1" # pragma: no cover diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index 152edc892..ef9cc52e8 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -20,6 +20,7 @@ from copy import copy from enum import Enum from typing import List, Optional +from bs4 import BeautifulSoup from pydantic import BaseModel, Field @@ -75,32 +76,39 @@ class OntologyElement(BaseModel): def to_html(self, add_children=True) -> str: additional_attrs = copy(self.additional_attributes) - if "class" in additional_attrs: - del additional_attrs["class"] - - # TODO(Pluto) Add support for multiple classes - attrs = " ".join( - f'{key}="{value}"' if value else f"{key}" for key, value in additional_attrs.items() - ) + additional_attrs.pop("class", None) + attr_str = self._construct_attribute_string(additional_attrs) class_attr = f'class="{self.css_class_name}"' if self.css_class_name else "" - attr_str = f"{class_attr} {attrs}".strip() - children_html = ( - ("" if not self.children else "".join(child.to_html() for child in self.children)) - if add_children - else "" + combined_attr_str = f"{class_attr} {attr_str}".strip() + + children_html = self._generate_children_html(add_children) + + result_html = self._generate_final_html(combined_attr_str, children_html) + + return result_html + + def to_text(self, add_children=True) -> str: + return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings) + + def _construct_attribute_string(self, attributes: dict) -> str: + return " ".join( + f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items() ) - text = "" if not self.text else self.text + + def 
_generate_children_html(self, add_children: bool) -> str: + if not add_children or not self.children: + return "" + return "".join(child.to_html() for child in self.children) + + def _generate_final_html(self, attr_str: str, children_html: str) -> str: + text = self.text or "" if text or children_html: - # This is either one or another, never both - result_html = ( - f"<{self.html_tag_name} {attr_str}>{text} {children_html}" - ) + return f"<{self.html_tag_name} {attr_str}>{text} {children_html}" else: - result_html = f"<{self.html_tag_name} {attr_str} />" - return result_html + return f"<{self.html_tag_name} {attr_str} />" @property def id(self) -> str | None: @@ -254,6 +262,18 @@ class Table(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["table"], frozen=True) + def to_html(self, add_children=True) -> str: + soup = BeautifulSoup(super().to_html(add_children), "html.parser") + + for tag in soup.find_all(True): + if tag.name != "table": + tag.attrs.pop("class", None) + tag.attrs.pop("id", None) + if tag.name in ["td", "th"]: + tag.string = " ".join(tag.stripped_strings) + + return str(soup) + class TableBody(OntologyElement): description: str = Field("A body of the table", frozen=True) @@ -430,6 +450,15 @@ class Form(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) allowed_tags: List[str] = Field(["form"], frozen=True) + def to_text(self, add_children=True) -> str: + texts = [self.text] if self.text else [] + + if add_children: + for child in self.children: + texts.append(child.to_text(add_children=True)) + + return " ".join(filter(None, texts)).strip() + class FormField(OntologyElement): description: str = Field("A property value of a form", frozen=True) @@ -442,6 +471,9 @@ class FormFieldValue(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) allowed_tags: List[str] = Field(["input"], frozen=True) + def 
to_text(self, add_children=True) -> str: + return super().to_text() + self.additional_attributes.get("value", "") + class Checkbox(OntologyElement): description: str = Field("A small box that can be checked or unchecked", frozen=True) diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index f2b897e51..ff0c354e3 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -96,10 +96,8 @@ def ontology_to_unstructured_elements( ] element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name] html_code_of_ontology_element = ontology_element.to_html() - element_text = ( - BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip() - ) - # TODO value attribute from form input should be added to the text + element_text = ontology_element.to_text() + unstructured_element = element_class( text=element_text, element_id=ontology_element.id,