Fix: partition_html() partially extracts text (#2852)

Closes #2362.

Previously, when an HTML document contained a `div` with a nested tag (e.g. a
`<b>` or `<span>`), the element created from the `div` contained only the
text up to the nested tag; the text following that tag (its "tail") was
dropped. This PR adds support for extracting text from tag tails in HTML.

### Testing
```
html_text = """
<html>
<body>
    <div>
        the Company issues shares at $<div style="display:inline;"><span>5.22</span></div> per share. There is more text
    </div>
</body>
</html>
"""

elements = partition_html(text=html_text)
print(''.join([str(el).strip() for el in elements]))
```

**Expected behavior**
```
the Company issues shares at $5.22per share. There is more text
```
This commit is contained in:
Christine Straub 2024-04-08 12:18:55 -07:00 committed by GitHub
parent 2c7e0289aa
commit 4656b8cbe5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 85 additions and 36 deletions

View File

@ -1,4 +1,4 @@
## 0.13.3-dev1
## 0.13.3-dev2
### Enhancements
@ -6,6 +6,7 @@
### Fixes
* **Add support for extracting text from tag tails in HTML**. This fix adds the ability to generate separate elements from tag tails.
* **Add support for extracting text from `<b>` tags in HTML**. Now `partition_html()` can extract text from `<b>` tags inside container tags (like `<div>`, `<pre>`).
## 0.13.2

View File

@ -11,7 +11,6 @@ from lxml import html as lxml_html
from unstructured.documents import html
from unstructured.documents.base import Page
from unstructured.documents.elements import (
Address,
ListItem,
NarrativeText,
Table,
@ -24,9 +23,11 @@ from unstructured.documents.html import (
SECTION_TAGS,
TABLE_TAGS,
TEXT_TAGS,
HTMLAddress,
HTMLDocument,
HTMLNarrativeText,
HTMLTable,
HTMLText,
HTMLTitle,
TagsMixin,
_parse_HTMLTable_from_table_elem,
@ -715,13 +716,14 @@ def test_containers_with_text_are_processed():
html_document = HTMLDocument.from_string(html_str)
assert html_document.elements == [
Text(text="Hi All,"),
NarrativeText(text="Get excited for our first annual family day!"),
Title(text="Best."),
Title(text="Dino the Datasaur"),
Title(text="Unstructured Technologies"),
Title(text="Data Scientist"),
Address(text="Doylestown, PA 18901"),
HTMLText(text="Hi All,", tag="div"),
HTMLNarrativeText(text="Get excited for our first annual family day!", tag="div"),
HTMLTitle(text="Best.", tag="div"),
HTMLText(text="\n -- ", tag="div"),
HTMLTitle(text="Dino the Datasaur", tag="div"),
HTMLTitle(text="Unstructured Technologies", tag="div"),
HTMLTitle(text="Data Scientist", tag="div"),
HTMLAddress(text="Doylestown, PA 18901", tag="div"),
]

View File

@ -750,3 +750,22 @@ def test_partition_html_b_tag_parsing():
"Header 1|Text|Header 2|Param1 = Y|Param2 = 1|Param3 = 2|Param4 = A|"
"Param5 = A,B,C,D,E|Param6 = 7|Param7 = Five"
)
def test_partition_html_tag_tail_parsing():
    """Text before, inside, and after a nested tag is each emitted as its own element."""
    # -- "Head" is the outer div's text, "Nested" sits in the inner div, and "Tail" is the
    # -- inner div's tail text --
    markup = """
<html>
<body>
<div>
Head
<div><span>Nested</span></div>
Tail
</div>
</body>
</html>
"""

    partitioned = partition_html(text=markup)

    stripped_texts = (str(element).strip() for element in partitioned)
    assert "|".join(stripped_texts) == "Head|Nested|Tail"

View File

@ -1 +1 @@
__version__ = "0.13.3-dev1" # pragma: no cover
__version__ = "0.13.3-dev2" # pragma: no cover

View File

@ -167,32 +167,33 @@ class HTMLDocument(XMLDocument):
continue
if _is_text_tag(tag_elem):
if _has_break_tags(tag_elem):
flattened_elems = _unfurl_break_tags(tag_elem)
for _tag_elem in flattened_elems:
element = _parse_tag(_tag_elem)
if element is not None:
page.elements.append(element)
else:
element = _parse_tag(tag_elem)
if element is not None:
page.elements.append(element)
descendanttag_elems = tuple(tag_elem.iterdescendants())
_page_elements, descendanttag_elems = _process_text_tag(tag_elem)
page.elements.extend(_page_elements)
elif _is_container_with_text(tag_elem):
links = _get_links_from_tag(tag_elem)
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
# -- having text is guaranteed by `_is_container_with_text()` --
assert tag_elem.text is not None
element = _text_to_element(
tag_elem.text,
"div",
(),
depth=0,
links=links,
emphasized_texts=emphasized_texts,
)
tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
if tag_elem_tail:
_page_elements, descendanttag_elems = _process_text_tag(tag_elem, False)
page.elements.extend(_page_elements)
# NOTE(christine): generate a separate element using a tag tail
element = _text_to_element(
tag_elem.tail,
tag_elem.tag,
(),
depth=0,
)
else:
links = _get_links_from_tag(tag_elem)
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
element = _text_to_element(
tag_elem.text,
tag_elem.tag,
(),
depth=0,
links=links,
emphasized_texts=emphasized_texts,
)
if element is not None:
page.elements.append(element)
@ -394,6 +395,7 @@ def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> List[Dict[str, s
def _parse_tag(
tag_elem: etree._Element,
include_tail_text: bool = True,
) -> Optional[Element]:
"""Parses `tag_elem` to a Text element if it contains qualifying text.
@ -419,7 +421,7 @@ def _parse_tag(
if tag_elem.tag == "script":
return None
text = _construct_text(tag_elem)
text = _construct_text(tag_elem, include_tail_text)
if not text:
return None
return _text_to_element(
@ -510,7 +512,9 @@ def _is_container_with_text(tag_elem: etree._Element) -> bool:
if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
return False
if tag_elem.text is None or tag_elem.text.strip() == "":
tag_elem_text = tag_elem.text.strip() if tag_elem.text else None
tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
if not tag_elem_text and not tag_elem_tail:
return False
return True
@ -597,6 +601,29 @@ def _is_text_tag(
return False
def _process_text_tag(
    tag_elem: etree._Element,
    include_tail_text: bool = True,
) -> tuple[list[Element], tuple[etree._Element, ...]]:
    """Produce document elements from `tag_elem` and collect its descendant tags.

    When `_has_break_tags(tag_elem)` is true, the tag is flattened with
    `_unfurl_break_tags()` and each resulting piece is parsed separately; otherwise
    `tag_elem` is parsed as a single unit. Pieces for which `_parse_tag()` returns
    `None` contribute nothing to the result.

    Args:
        tag_elem: The tag to parse.
        include_tail_text: Passed through unchanged to `_parse_tag()`.

    Returns:
        A `(page_elements, descendant_tag_elems)` pair. `page_elements` holds the parsed
        elements in order (possibly empty). `descendant_tag_elems` is a tuple of every
        descendant of `tag_elem` as yielded by `tag_elem.iterdescendants()`.
    """
    # -- a tag without break tags is handled as a one-item sequence so both cases share
    # -- the same parse loop --
    candidates = _unfurl_break_tags(tag_elem) if _has_break_tags(tag_elem) else [tag_elem]

    page_elements: list[Element] = []
    for candidate in candidates:
        element = _parse_tag(candidate, include_tail_text)
        if element is not None:
            page_elements.append(element)

    # -- gathered after parsing, preserving the original statement order --
    descendant_tag_elems = tuple(tag_elem.iterdescendants())
    return page_elements, descendant_tag_elems
def _process_list_item(
tag_elem: etree._Element,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,