mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-05 03:23:03 +00:00
fix: partition_html should process container divs that include text (#110)
* check for containers with text * added tests for containers with text * changelog and version bump
This commit is contained in:
parent
6f4d9ad06c
commit
4f6fc29b54
@ -1,7 +1,9 @@
|
||||
## 0.3.5-dev0
|
||||
## 0.3.5-dev1
|
||||
|
||||
* Add new pattern to recognize plain text dash bullets
|
||||
* Add test for bullet patterns
|
||||
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
|
||||
elements
|
||||
|
||||
## 0.3.4
|
||||
|
||||
|
||||
@ -475,6 +475,34 @@ def test_nested_text_tags():
|
||||
assert len(html_document.pages[0].elements) == 1
|
||||
|
||||
|
||||
def test_containers_with_text_are_processed():
    """Text sitting directly inside a container ``div`` is emitted as its own element.

    The fixture nests several ``div`` tags; some of them carry text of their own in addition
    to child elements. Every non-empty piece of text is expected to appear in the parsed
    elements, in document order.
    """
    # NOTE(review): the `=3D` sequences look like quoted-printable residue from an email
    # export -- they are part of the fixture and are kept verbatim; confirm against the source.
    html_str = """<div dir=3D"ltr">Hi All,<div><br></div>
<div>Get excited for our first annual family day!</div>
<div>Best.<br clear=3D"all">
<div><br></div>
-- <br>
<div dir=3D"ltr">
<div dir=3D"ltr">Dino the Datasaur<div>Unstructured Technologies<br><div>Data Scientist
</div>
<div><br></div>
</div>
</div>
</div>
</div>
</div>"""
    html_document = HTMLDocument.from_string(html_str)
    html_document._read()

    assert html_document.elements == [
        Title(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Title(text="Dino the Datasaur"),
        Title(text="Unstructured Technologies"),
        Title(text="Data Scientist"),
    ]
|
||||
|
||||
|
||||
def test_html_grabs_bulleted_text_in_tags():
|
||||
html_str = """<html>
|
||||
<body>
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.3.5-dev0" # pragma: no cover
|
||||
__version__ = "0.3.5-dev1" # pragma: no cover
|
||||
|
||||
@ -96,6 +96,11 @@ class HTMLDocument(XMLDocument):
|
||||
page.elements.append(element)
|
||||
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
||||
|
||||
elif _is_container_with_text(tag_elem):
|
||||
element = _text_to_element(tag_elem.text, "div", ())
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
|
||||
elif _is_bulleted_table(tag_elem):
|
||||
bulleted_text = _bulleted_text_from_table(tag_elem)
|
||||
page.elements.extend(bulleted_text)
|
||||
@ -188,22 +193,46 @@ def _parse_tag(
|
||||
text = _construct_text(tag_elem)
|
||||
if not text:
|
||||
return None
|
||||
return _text_to_element(text, tag_elem.tag, ancestortags)
|
||||
|
||||
|
||||
def _text_to_element(text: str, tag: str, ancestortags: Tuple[str, ...]) -> Optional[Element]:
    """Given the text of an element, the tag type and the ancestor tags, produces the appropriate
    HTML element.

    Parameters
    ----------
    text
        The text content to classify.
    tag
        Name of the tag the text came from. It is stored on the produced element and passed
        to `is_narrative_tag`.
    ancestortags
        Tags of the enclosing elements. They are stored on the produced element.

    Returns
    -------
    An `HTMLListItem` for bulleted text that is still non-empty after `clean_bullets`, an
    `HTMLNarrativeText` when `is_narrative_tag` accepts the text, an `HTMLTitle` when
    `is_possible_title` accepts it, and None in every other case (including text shorter
    than two characters).
    """
    if is_bulleted_text(text):
        # Strip the bullet once and reuse the result for both the emptiness check and the
        # element text.
        cleaned_text = clean_bullets(text)
        if not cleaned_text:
            return None
        # Only `tag` is available here; this function has no `tag_elem` in scope.
        return HTMLListItem(text=cleaned_text, tag=tag, ancestortags=ancestortags)

    if len(text) < 2:
        return None
    if is_narrative_tag(text, tag):
        return HTMLNarrativeText(text, tag=tag, ancestortags=ancestortags)
    if is_possible_title(text):
        return HTMLTitle(text, tag=tag, ancestortags=ancestortags)
    # Something that might end up here is text that's just a number.
    return None
|
||||
|
||||
|
||||
def _is_container_with_text(tag_elem: etree.Element) -> bool:
    """Checks if a tag is a container that also happens to contain text.

    A tag qualifies when it is a ``div``, has at least one child element, and its own
    ``text`` attribute is neither None nor whitespace-only.

    Example
    -------
    <div>Hi there,
        <div>This is my message.</div>
        <div>Please read my message!</div>
    </div>
    """
    # `len()` of an element is its number of direct children; a childless div is not a
    # container.
    if tag_elem.tag != "div" or len(tag_elem) == 0:
        return False

    # `text` is None when the element has no text of its own.
    own_text = tag_elem.text
    return own_text is not None and own_text.strip() != ""
|
||||
|
||||
|
||||
def is_narrative_tag(text: str, tag: str) -> bool:
    """Uses tag information to infer whether text is narrative.

    Text coming from a tag listed in ``HEADING_TAGS`` is never treated as narrative. For
    any other tag the decision is delegated to `is_possible_narrative_text`.
    """
    if tag in HEADING_TAGS:
        return False
    return is_possible_narrative_text(text)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user