mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-08 04:55:36 +00:00
fix: partition_html should process container divs that include text (#110)
* check for containers with text * added tests for containers with text * changelog and version bump
This commit is contained in:
parent
6f4d9ad06c
commit
4f6fc29b54
@ -1,7 +1,9 @@
|
|||||||
## 0.3.5-dev0
|
## 0.3.5-dev1
|
||||||
|
|
||||||
* Add new pattern to recognize plain text dash bullets
|
* Add new pattern to recognize plain text dash bullets
|
||||||
* Add test for bullet patterns
|
* Add test for bullet patterns
|
||||||
|
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
|
||||||
|
elements
|
||||||
|
|
||||||
## 0.3.4
|
## 0.3.4
|
||||||
|
|
||||||
|
|||||||
@ -475,6 +475,34 @@ def test_nested_text_tags():
|
|||||||
assert len(html_document.pages[0].elements) == 1
|
assert len(html_document.pages[0].elements) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_containers_with_text_are_processed():
    """Text sitting directly inside a container ``div`` must produce its own element.

    The fixture nests ``div`` tags that carry both leading text and child elements; the
    parsed document is expected to contain one element per text fragment, in document
    order. The ``=3D`` sequences in the attributes are kept exactly as in the fixture
    (looks like quoted-printable residue from an email body -- TODO confirm).
    """
    email_html = """<div dir=3D"ltr">Hi All,<div><br></div>
<div>Get excited for our first annual family day!</div>
<div>Best.<br clear=3D"all">
<div><br></div>
-- <br>
<div dir=3D"ltr">
<div dir=3D"ltr">Dino the Datasaur<div>Unstructured Technologies<br><div>Data Scientist
</div>
<div><br></div>
</div>
</div>
</div>
</div>
</div>"""
    document = HTMLDocument.from_string(email_html)
    document._read()

    expected_elements = [
        Title(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Title(text="Dino the Datasaur"),
        Title(text="Unstructured Technologies"),
        Title(text="Data Scientist"),
    ]
    assert document.elements == expected_elements
|
||||||
|
|
||||||
|
|
||||||
def test_html_grabs_bulleted_text_in_tags():
|
def test_html_grabs_bulleted_text_in_tags():
|
||||||
html_str = """<html>
|
html_str = """<html>
|
||||||
<body>
|
<body>
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.3.5-dev0" # pragma: no cover
|
__version__ = "0.3.5-dev1" # pragma: no cover
|
||||||
|
|||||||
@ -96,6 +96,11 @@ class HTMLDocument(XMLDocument):
|
|||||||
page.elements.append(element)
|
page.elements.append(element)
|
||||||
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
||||||
|
|
||||||
|
elif _is_container_with_text(tag_elem):
|
||||||
|
element = _text_to_element(tag_elem.text, "div", ())
|
||||||
|
if element is not None:
|
||||||
|
page.elements.append(element)
|
||||||
|
|
||||||
elif _is_bulleted_table(tag_elem):
|
elif _is_bulleted_table(tag_elem):
|
||||||
bulleted_text = _bulleted_text_from_table(tag_elem)
|
bulleted_text = _bulleted_text_from_table(tag_elem)
|
||||||
page.elements.extend(bulleted_text)
|
page.elements.extend(bulleted_text)
|
||||||
@ -188,22 +193,46 @@ def _parse_tag(
|
|||||||
text = _construct_text(tag_elem)
|
text = _construct_text(tag_elem)
|
||||||
if not text:
|
if not text:
|
||||||
return None
|
return None
|
||||||
|
return _text_to_element(text, tag_elem.tag, ancestortags)
|
||||||
|
|
||||||
|
|
||||||
|
def _text_to_element(text: str, tag: str, ancestortags: Tuple[str, ...]) -> Optional[Element]:
    """Given the text of an element, the tag type and the ancestor tags, produces the appropriate
    HTML element.

    Parameters
    ----------
    text
        The text content to classify. It is used as given; no stripping happens here.
        NOTE(review): callers passing raw element text may include surrounding
        whitespace -- confirm whether that should be normalized at the call site.
    tag
        The name of the tag the text came from; stored on the returned element and used
        to decide whether the text can be narrative.
    ancestortags
        The tags of the ancestors of the source element; stored on the returned element.

    Returns
    -------
    An ``HTMLListItem`` for bulleted text, an ``HTMLNarrativeText`` or ``HTMLTitle`` for
    other text that passes the respective check, or ``None`` when the text is empty after
    bullet cleaning, shorter than two characters, or matches none of the checks.
    """
    if is_bulleted_text(text):
        # Clean once and reuse the result for both the emptiness check and the item text.
        cleaned_text = clean_bullets(text)
        if not cleaned_text:
            return None
        return HTMLListItem(text=cleaned_text, tag=tag, ancestortags=ancestortags)

    if len(text) < 2:
        return None
    if is_narrative_tag(text, tag):
        return HTMLNarrativeText(text, tag=tag, ancestortags=ancestortags)
    if is_possible_title(text):
        return HTMLTitle(text, tag=tag, ancestortags=ancestortags)

    # Something that might end up here is text that's just a number.
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _is_container_with_text(tag_elem: etree.Element) -> bool:
    """Checks if a tag is a container that also happens to contain text.

    A tag qualifies when it is a ``div`` that has at least one child element and whose
    own leading text (the text that comes before its first child) is not blank.

    Example
    -------
    <div>Hi there,
        <div>This is my message.</div>
        <div>Please read my message!</div>
    </div>
    """
    # len() gives the number of direct children. Compare it explicitly rather than
    # relying on the element's truth value, which is ambiguous for element objects.
    if tag_elem.tag != "div" or len(tag_elem) == 0:
        return False

    # .text is None when there is no text before the first child.
    leading_text = tag_elem.text
    return leading_text is not None and leading_text.strip() != ""
|
||||||
|
|
||||||
|
|
||||||
def is_narrative_tag(text: str, tag: str) -> bool:
    """Decides from the tag and the text whether the text should be treated as narrative.

    Text found under a heading tag is never narrative; for any other tag the decision is
    delegated to ``is_possible_narrative_text``.
    """
    if tag in HEADING_TAGS:
        return False
    return is_possible_narrative_text(text)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user