diff --git a/CHANGELOG.md b/CHANGELOG.md index 81595da9b..f1f2d9ddc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.3-dev1 +## 0.13.3-dev2 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* **Add support for extracting text from tag tails in HTML**. This fix adds the ability to generate separate elements using tag tails. * **Add support for extracting text from `<b>` tags in HTML** Now `partition_html()` can extract text from `<b>` tags inside container tags (like `
`, `
`).
 
 ## 0.13.2
diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
index 48777cf13..550e2180c 100644
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@@ -11,7 +11,6 @@ from lxml import html as lxml_html
 from unstructured.documents import html
 from unstructured.documents.base import Page
 from unstructured.documents.elements import (
-    Address,
     ListItem,
     NarrativeText,
     Table,
@@ -24,9 +23,11 @@ from unstructured.documents.html import (
     SECTION_TAGS,
     TABLE_TAGS,
     TEXT_TAGS,
+    HTMLAddress,
     HTMLDocument,
     HTMLNarrativeText,
     HTMLTable,
+    HTMLText,
     HTMLTitle,
     TagsMixin,
     _parse_HTMLTable_from_table_elem,
@@ -715,13 +716,14 @@ def test_containers_with_text_are_processed():
     html_document = HTMLDocument.from_string(html_str)
 
     assert html_document.elements == [
-        Text(text="Hi All,"),
-        NarrativeText(text="Get excited for our first annual family day!"),
-        Title(text="Best."),
-        Title(text="Dino the Datasaur"),
-        Title(text="Unstructured Technologies"),
-        Title(text="Data Scientist"),
-        Address(text="Doylestown, PA 18901"),
+        HTMLText(text="Hi All,", tag="div"),
+        HTMLNarrativeText(text="Get excited for our first annual family day!", tag="div"),
+        HTMLTitle(text="Best.", tag="div"),
+        HTMLText(text="\n      -- ", tag="div"),
+        HTMLTitle(text="Dino the Datasaur", tag="div"),
+        HTMLTitle(text="Unstructured Technologies", tag="div"),
+        HTMLTitle(text="Data Scientist", tag="div"),
+        HTMLAddress(text="Doylestown, PA 18901", tag="div"),
     ]
 
 
diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py
index a0ce6f2d9..9b696ce54 100644
--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@@ -750,3 +750,22 @@ def test_partition_html_b_tag_parsing():
         "Header 1|Text|Header 2|Param1 = Y|Param2 = 1|Param3 = 2|Param4 = A|"
         "Param5 = A,B,C,D,E|Param6 = 7|Param7 = Five"
     )
+
+
+def test_partition_html_tag_tail_parsing():
+    html_text = """
+        
+        
+        
+ Head +
Nested
+ Tail +
+ + + """ + + elements = partition_html(text=html_text) + element_text = "|".join([str(el).strip() for el in elements]) + + assert element_text == "Head|Nested|Tail" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 06dab11ce..e49d9a259 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.3-dev1" # pragma: no cover +__version__ = "0.13.3-dev2" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 378c73d8b..c066e597b 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -167,32 +167,33 @@ class HTMLDocument(XMLDocument): continue if _is_text_tag(tag_elem): - if _has_break_tags(tag_elem): - flattened_elems = _unfurl_break_tags(tag_elem) - for _tag_elem in flattened_elems: - element = _parse_tag(_tag_elem) - if element is not None: - page.elements.append(element) - - else: - element = _parse_tag(tag_elem) - if element is not None: - page.elements.append(element) - descendanttag_elems = tuple(tag_elem.iterdescendants()) + _page_elements, descendanttag_elems = _process_text_tag(tag_elem) + page.elements.extend(_page_elements) elif _is_container_with_text(tag_elem): - links = _get_links_from_tag(tag_elem) - emphasized_texts = _get_emphasized_texts_from_tag(tag_elem) - # -- having text is guaranteed by `_is_container_with_text()` -- - assert tag_elem.text is not None - element = _text_to_element( - tag_elem.text, - "div", - (), - depth=0, - links=links, - emphasized_texts=emphasized_texts, - ) + tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None + if tag_elem_tail: + _page_elements, descendanttag_elems = _process_text_tag(tag_elem, False) + page.elements.extend(_page_elements) + + # NOTE(christine): generate a separate element using a tag tail + element = _text_to_element( + tag_elem.tail, + tag_elem.tag, + (), + depth=0, + ) + else: + links = _get_links_from_tag(tag_elem) + emphasized_texts = 
_get_emphasized_texts_from_tag(tag_elem) + element = _text_to_element( + tag_elem.text, + tag_elem.tag, + (), + depth=0, + links=links, + emphasized_texts=emphasized_texts, + ) if element is not None: page.elements.append(element) @@ -394,6 +395,7 @@ def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> List[Dict[str, s def _parse_tag( tag_elem: etree._Element, + include_tail_text: bool = True, ) -> Optional[Element]: """Parses `tag_elem` to a Text element if it contains qualifying text. @@ -419,7 +421,7 @@ def _parse_tag( if tag_elem.tag == "script": return None - text = _construct_text(tag_elem) + text = _construct_text(tag_elem, include_tail_text) if not text: return None return _text_to_element( @@ -510,7 +512,9 @@ def _is_container_with_text(tag_elem: etree._Element) -> bool: if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0: return False - if tag_elem.text is None or tag_elem.text.strip() == "": + tag_elem_text = tag_elem.text.strip() if tag_elem.text else None + tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None + if not tag_elem_text and not tag_elem_tail: return False return True @@ -597,6 +601,29 @@ def _is_text_tag( return False +def _process_text_tag( + tag_elem: etree._Element, + include_tail_text: bool = True, +) -> tuple[list[Element], tuple[etree._Element]]: + """Produces a document element from `tag_elem`.""" + + page_elements = [] + if _has_break_tags(tag_elem): + flattened_elems = _unfurl_break_tags(tag_elem) + for _tag_elem in flattened_elems: + element = _parse_tag(_tag_elem, include_tail_text) + if element is not None: + page_elements.append(element) + + else: + element = _parse_tag(tag_elem, include_tail_text) + if element is not None: + page_elements.append(element) + descendant_tag_elems = tuple(tag_elem.iterdescendants()) + + return page_elements, descendant_tag_elems + + def _process_list_item( tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,