fix: partition_html should process container divs that include text (#110)

* check for containers with text

* added tests for containers with text

* changelog and version bump
This commit is contained in:
Matt Robinson 2022-12-21 16:51:04 -05:00 committed by GitHub
parent 6f4d9ad06c
commit 4f6fc29b54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 65 additions and 6 deletions

View File

@ -1,7 +1,9 @@
## 0.3.5-dev0
## 0.3.5-dev1
* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
elements
## 0.3.4

View File

@ -475,6 +475,34 @@ def test_nested_text_tags():
assert len(html_document.pages[0].elements) == 1
def test_containers_with_text_are_processed():
    """Checks that a div holding both its own text and nested child divs yields an element
    for the div's own text in addition to the elements produced from its children."""
    raw_html = """<div dir=3D"ltr">Hi All,<div><br></div>
<div>Get excited for our first annual family day!</div>
<div>Best.<br clear=3D"all">
<div><br></div>
-- <br>
<div dir=3D"ltr">
<div dir=3D"ltr">Dino the Datasaur<div>Unstructured Technologies<br><div>Data Scientist
</div>
<div><br></div>
</div>
</div>
</div>
</div>
</div>"""
    document = HTMLDocument.from_string(raw_html)
    document._read()

    # The greeting and sign-off live directly inside container divs, so they are only
    # picked up when the text of the container itself is processed.
    expected_elements = [
        Title(text="Hi All,"),
        NarrativeText(text="Get excited for our first annual family day!"),
        Title(text="Best."),
        Title(text="Dino the Datasaur"),
        Title(text="Unstructured Technologies"),
        Title(text="Data Scientist"),
    ]
    assert document.elements == expected_elements
def test_html_grabs_bulleted_text_in_tags():
html_str = """<html>
<body>

View File

@ -1 +1 @@
__version__ = "0.3.5-dev0" # pragma: no cover
__version__ = "0.3.5-dev1" # pragma: no cover

View File

@ -96,6 +96,11 @@ class HTMLDocument(XMLDocument):
page.elements.append(element)
descendanttag_elems = tuple(tag_elem.iterdescendants())
elif _is_container_with_text(tag_elem):
element = _text_to_element(tag_elem.text, "div", ())
if element is not None:
page.elements.append(element)
elif _is_bulleted_table(tag_elem):
bulleted_text = _bulleted_text_from_table(tag_elem)
page.elements.extend(bulleted_text)
@ -188,22 +193,46 @@ def _parse_tag(
text = _construct_text(tag_elem)
if not text:
return None
return _text_to_element(text, tag_elem.tag, ancestortags)
def _text_to_element(text: str, tag: str, ancestortags: Tuple[str, ...]) -> Optional[Element]:
    """Given the text of an element, the tag type and the ancestor tags, produces the appropriate
    HTML element.

    Parameters
    ----------
    text
        The text content to classify.
    tag
        The name of the tag the text came from (e.g. "div").
    ancestortags
        The tags of the ancestors of the element the text came from.

    Returns
    -------
    An HTMLListItem for bulleted text, an HTMLNarrativeText for narrative text, an HTMLTitle
    for a possible title, or None when the text is empty after bullet cleaning, is shorter
    than two characters, or matches none of the categories.
    """
    if is_bulleted_text(text):
        # Clean once and reuse the result; a bullet with nothing after it is dropped.
        cleaned_text = clean_bullets(text)
        if not cleaned_text:
            return None
        return HTMLListItem(text=cleaned_text, tag=tag, ancestortags=ancestortags)

    if len(text) < 2:
        return None
    elif is_narrative_tag(text, tag):
        return HTMLNarrativeText(text, tag=tag, ancestortags=ancestortags)
    elif is_possible_title(text):
        return HTMLTitle(text, tag=tag, ancestortags=ancestortags)
    else:
        # Something that might end up here is text that's just a number.
        return None
def _is_container_with_text(tag_elem: etree.Element) -> bool:
    """Reports whether a tag is a div that has child elements and also carries non-blank
    text of its own before the first child.

    Example
    -------
    <div>Hi there,
        <div>This is my message.</div>
        <div>Please read my message!</div>
    </div>
    """
    # Only divs that actually wrap other elements count as containers.
    if not (tag_elem.tag == "div" and len(tag_elem) > 0):
        return False

    # Missing or whitespace-only leading text means there is nothing to extract.
    leading_text = tag_elem.text
    return leading_text is not None and leading_text.strip() != ""
def is_narrative_tag(text: str, tag: str) -> bool:
    """Decides whether text is narrative, taking the originating tag into account: text from
    a heading tag is never narrative; otherwise the narrative-text check on the text decides."""
    if tag in HEADING_TAGS:
        return False
    return is_possible_narrative_text(text)