diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0c9f38ce0..347c97149 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.3.5-dev0
+## 0.3.5-dev1
* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
+* Fix for `partition_html` that allows for processing `div` tags that have both text and child
+ elements
## 0.3.4
diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
index 9ca02052f..7e3a00d20 100644
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@@ -475,6 +475,34 @@ def test_nested_text_tags():
assert len(html_document.pages[0].elements) == 1
+def test_containers_with_text_are_processed():
+ """Parses html_str with HTMLDocument and checks the exact sequence of Title and
+ NarrativeText elements produced from it."""
+ html_str = """
Hi All,
+
Get excited for our first annual family day!
+
Best.
+
+ --
+
+
Dino the Datasaur
Unstructured Technologies
Data Scientist
+
+
+
+
+
+
+
"""
+ html_document = HTMLDocument.from_string(html_str)
+ html_document._read()
+
+ # Note that the "--" line in the input has no counterpart in the expected elements.
+ assert html_document.elements == [
+ Title(text="Hi All,"),
+ NarrativeText(text="Get excited for our first annual family day!"),
+ Title(text="Best."),
+ Title(text="Dino the Datasaur"),
+ Title(text="Unstructured Technologies"),
+ Title(text="Data Scientist"),
+ ]
+
+
def test_html_grabs_bulleted_text_in_tags():
html_str = """
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 016dbc772..515530bac 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.5-dev0" # pragma: no cover
+__version__ = "0.3.5-dev1" # pragma: no cover
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index f5e9ad6a3..cd06202fd 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -96,6 +96,11 @@ class HTMLDocument(XMLDocument):
page.elements.append(element)
descendanttag_elems = tuple(tag_elem.iterdescendants())
+ elif _is_container_with_text(tag_elem):
+ element = _text_to_element(tag_elem.text, "div", ())
+ if element is not None:
+ page.elements.append(element)
+
elif _is_bulleted_table(tag_elem):
bulleted_text = _bulleted_text_from_table(tag_elem)
page.elements.extend(bulleted_text)
@@ -188,22 +193,46 @@ def _parse_tag(
text = _construct_text(tag_elem)
if not text:
return None
+ return _text_to_element(text, tag_elem.tag, ancestortags)
+
+
+def _text_to_element(text: str, tag: str, ancestortags: Tuple[str, ...]) -> Optional[Element]:
+ """Given the text of an element, the tag type and the ancestor tags, produces the appropriate
+ HTML element.
+
+ The checks run in this order:
+ - bulleted text becomes an HTMLListItem holding the bullet-cleaned text, or None when nothing
+ is left after cleaning the bullets;
+ - text shorter than two characters yields None;
+ - text accepted by is_narrative_tag for this tag becomes an HTMLNarrativeText;
+ - text accepted by is_possible_title becomes an HTMLTitle;
+ - anything else yields None.
+
+ The tag and ancestortags arguments are passed through to the element that is created."""
if is_bulleted_text(text):
if not clean_bullets(text):
return None
- return HTMLListItem(text=clean_bullets(text), tag=tag_elem.tag, ancestortags=ancestortags)
+ return HTMLListItem(text=clean_bullets(text), tag=tag, ancestortags=ancestortags)
if len(text) < 2:
return None
- elif is_narrative_tag(text, tag_elem.tag):
- return HTMLNarrativeText(text, tag=tag_elem.tag, ancestortags=ancestortags)
+ elif is_narrative_tag(text, tag):
+ return HTMLNarrativeText(text, tag=tag, ancestortags=ancestortags)
elif is_possible_title(text):
- return HTMLTitle(text, tag=tag_elem.tag, ancestortags=ancestortags)
+ return HTMLTitle(text, tag=tag, ancestortags=ancestortags)
else:
# Something that might end up here is text that's just a number.
return None
+def _is_container_with_text(tag_elem: etree.Element) -> bool:
+ """Checks if a tag is a container that also happens to contain text.
+
+ Returns True only when the element's tag is "div", it has at least one child (its length is
+ non-zero), and its own .text is neither None nor whitespace-only.
+
+ Example
+ -------
+ <div>Hi there,
+     <div>This is my message.</div>
+     <div>Please read my message!</div>
+ </div>
+ """
+ if tag_elem.tag != "div" or len(tag_elem) == 0:
+ return False
+
+ if tag_elem.text is None or tag_elem.text.strip() == "":
+ return False
+
+ return True
+
+
def is_narrative_tag(text: str, tag: str) -> bool:
"""Uses tag information to infer whether text is narrative.

The result is true only when tag is not listed in HEADING_TAGS and
is_possible_narrative_text(text) holds, so text under a tag listed in HEADING_TAGS is never
treated as narrative."""
return tag not in HEADING_TAGS and is_possible_narrative_text(text)