diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c9f38ce0..347c97149 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.3.5-dev0 +## 0.3.5-dev1 * Add new pattern to recognize plain text dash bullets * Add test for bullet patterns +* Fix for `partition_html` that allows for processing `div` tags that have both text and child + elements ## 0.3.4 diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index 9ca02052f..7e3a00d20 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -475,6 +475,34 @@ def test_nested_text_tags(): assert len(html_document.pages[0].elements) == 1 +def test_containers_with_text_are_processed(): + html_str = """
Hi All,

+
Get excited for our first annual family day!
+
Best.
+

+ --
+
+
Dino the Datasaur
Unstructured Technologies
Data Scientist +
+

+
+
+
+
+
""" + html_document = HTMLDocument.from_string(html_str) + html_document._read() + + assert html_document.elements == [ + Title(text="Hi All,"), + NarrativeText(text="Get excited for our first annual family day!"), + Title(text="Best."), + Title(text="Dino the Datasaur"), + Title(text="Unstructured Technologies"), + Title(text="Data Scientist"), + ] + + def test_html_grabs_bulleted_text_in_tags(): html_str = """ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 016dbc772..515530bac 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.3.5-dev0" # pragma: no cover +__version__ = "0.3.5-dev1" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index f5e9ad6a3..cd06202fd 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -96,6 +96,11 @@ class HTMLDocument(XMLDocument): page.elements.append(element) descendanttag_elems = tuple(tag_elem.iterdescendants()) + elif _is_container_with_text(tag_elem): + element = _text_to_element(tag_elem.text, "div", ()) + if element is not None: + page.elements.append(element) + elif _is_bulleted_table(tag_elem): bulleted_text = _bulleted_text_from_table(tag_elem) page.elements.extend(bulleted_text) @@ -188,22 +193,46 @@ def _parse_tag( text = _construct_text(tag_elem) if not text: return None + return _text_to_element(text, tag_elem.tag, ancestortags) + + +def _text_to_element(text: str, tag: str, ancestortags: Tuple[str, ...]) -> Optional[Element]: + """Given the text of an element, the tag type and the ancestor tags, produces the appropriate + HTML element.""" if is_bulleted_text(text): if not clean_bullets(text): return None - return HTMLListItem(text=clean_bullets(text), tag=tag_elem.tag, ancestortags=ancestortags) + return HTMLListItem(text=clean_bullets(text), tag=tag, ancestortags=ancestortags) if len(text) < 2: return None - elif is_narrative_tag(text, tag_elem.tag): - return 
HTMLNarrativeText(text, tag=tag_elem.tag, ancestortags=ancestortags) + elif is_narrative_tag(text, tag): + return HTMLNarrativeText(text, tag=tag, ancestortags=ancestortags) elif is_possible_title(text): - return HTMLTitle(text, tag=tag_elem.tag, ancestortags=ancestortags) + return HTMLTitle(text, tag=tag, ancestortags=ancestortags) else: # Something that might end up here is text that's just a number. return None +def _is_container_with_text(tag_elem: etree.Element) -> bool: + """Checks if a tag is a container that also happens to contain text. + Example + ------- +
<div>Hi there,
+ <div>This is my message.</div>
+ <div>Please read my message!</div>
+ </div>
+ """ + if tag_elem.tag != "div" or len(tag_elem) == 0: + return False + + if tag_elem.text is None or tag_elem.text.strip() == "": + return False + + return True + + def is_narrative_tag(text: str, tag: str) -> bool: """Uses tag information to infer whether text is narrative.""" return tag not in HEADING_TAGS and is_possible_narrative_text(text)