mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-17 21:29:05 +00:00
Fix: partition_html() partially extracts text (#2852)
Closes #2362.

Previously, when an HTML document contained a `div` with a nested tag, e.g. a `<b>` or `<span>`, the element created from the `div` contained only the text up to the inline element. This PR adds support for extracting text from tag tails in HTML.

### Testing

```
html_text = """
<html>
<body>
<div>
the Company issues shares at $<div style="display:inline;"><span>5.22</span></div> per share. There is more text
</div>
</body>
</html>
"""

elements = partition_html(text=html_text)
print(''.join([str(el).strip() for el in elements]))
```

**Expected behavior**

```
the Company issues shares at $5.22per share. There is more text
```
This commit is contained in:
parent
2c7e0289aa
commit
4656b8cbe5
@ -1,4 +1,4 @@
|
||||
## 0.13.3-dev1
|
||||
## 0.13.3-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Add support for extracting text from tag tails in HTML**. This fix adds the ability to generate separate elements from tag tails.
|
||||
* **Add support for extracting text from `<b>` tags in HTML**. Now `partition_html()` can extract text from `<b>` tags inside container tags (like `<div>`, `<pre>`).
|
||||
|
||||
## 0.13.2
|
||||
|
@ -11,7 +11,6 @@ from lxml import html as lxml_html
|
||||
from unstructured.documents import html
|
||||
from unstructured.documents.base import Page
|
||||
from unstructured.documents.elements import (
|
||||
Address,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Table,
|
||||
@ -24,9 +23,11 @@ from unstructured.documents.html import (
|
||||
SECTION_TAGS,
|
||||
TABLE_TAGS,
|
||||
TEXT_TAGS,
|
||||
HTMLAddress,
|
||||
HTMLDocument,
|
||||
HTMLNarrativeText,
|
||||
HTMLTable,
|
||||
HTMLText,
|
||||
HTMLTitle,
|
||||
TagsMixin,
|
||||
_parse_HTMLTable_from_table_elem,
|
||||
@ -715,13 +716,14 @@ def test_containers_with_text_are_processed():
|
||||
html_document = HTMLDocument.from_string(html_str)
|
||||
|
||||
assert html_document.elements == [
|
||||
Text(text="Hi All,"),
|
||||
NarrativeText(text="Get excited for our first annual family day!"),
|
||||
Title(text="Best."),
|
||||
Title(text="Dino the Datasaur"),
|
||||
Title(text="Unstructured Technologies"),
|
||||
Title(text="Data Scientist"),
|
||||
Address(text="Doylestown, PA 18901"),
|
||||
HTMLText(text="Hi All,", tag="div"),
|
||||
HTMLNarrativeText(text="Get excited for our first annual family day!", tag="div"),
|
||||
HTMLTitle(text="Best.", tag="div"),
|
||||
HTMLText(text="\n -- ", tag="div"),
|
||||
HTMLTitle(text="Dino the Datasaur", tag="div"),
|
||||
HTMLTitle(text="Unstructured Technologies", tag="div"),
|
||||
HTMLTitle(text="Data Scientist", tag="div"),
|
||||
HTMLAddress(text="Doylestown, PA 18901", tag="div"),
|
||||
]
|
||||
|
||||
|
||||
|
@ -750,3 +750,22 @@ def test_partition_html_b_tag_parsing():
|
||||
"Header 1|Text|Header 2|Param1 = Y|Param2 = 1|Param3 = 2|Param4 = A|"
|
||||
"Param5 = A,B,C,D,E|Param6 = 7|Param7 = Five"
|
||||
)
|
||||
|
||||
|
||||
def test_partition_html_tag_tail_parsing():
    """Text before, inside, and after a nested tag each becomes its own element."""
    # -- "Head" is the outer div's text, "Nested" lives in the inner div/span, and
    # -- "Tail" is the tail text that follows the inner div.
    html_text = """
    <html>
    <body>
    <div>
        Head
        <div><span>Nested</span></div>
        Tail
    </div>
    </body>
    </html>
    """

    partitioned = partition_html(text=html_text)
    stripped_texts = (str(item).strip() for item in partitioned)

    assert "|".join(stripped_texts) == "Head|Nested|Tail"
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.13.3-dev1" # pragma: no cover
|
||||
__version__ = "0.13.3-dev2" # pragma: no cover
|
||||
|
@ -167,32 +167,33 @@ class HTMLDocument(XMLDocument):
|
||||
continue
|
||||
|
||||
if _is_text_tag(tag_elem):
|
||||
if _has_break_tags(tag_elem):
|
||||
flattened_elems = _unfurl_break_tags(tag_elem)
|
||||
for _tag_elem in flattened_elems:
|
||||
element = _parse_tag(_tag_elem)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
|
||||
else:
|
||||
element = _parse_tag(tag_elem)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
||||
_page_elements, descendanttag_elems = _process_text_tag(tag_elem)
|
||||
page.elements.extend(_page_elements)
|
||||
|
||||
elif _is_container_with_text(tag_elem):
|
||||
links = _get_links_from_tag(tag_elem)
|
||||
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
|
||||
# -- having text is guaranteed by `_is_container_with_text()` --
|
||||
assert tag_elem.text is not None
|
||||
element = _text_to_element(
|
||||
tag_elem.text,
|
||||
"div",
|
||||
(),
|
||||
depth=0,
|
||||
links=links,
|
||||
emphasized_texts=emphasized_texts,
|
||||
)
|
||||
tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
|
||||
if tag_elem_tail:
|
||||
_page_elements, descendanttag_elems = _process_text_tag(tag_elem, False)
|
||||
page.elements.extend(_page_elements)
|
||||
|
||||
# NOTE(christine): generate a separate element using a tag tail
|
||||
element = _text_to_element(
|
||||
tag_elem.tail,
|
||||
tag_elem.tag,
|
||||
(),
|
||||
depth=0,
|
||||
)
|
||||
else:
|
||||
links = _get_links_from_tag(tag_elem)
|
||||
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
|
||||
element = _text_to_element(
|
||||
tag_elem.text,
|
||||
tag_elem.tag,
|
||||
(),
|
||||
depth=0,
|
||||
links=links,
|
||||
emphasized_texts=emphasized_texts,
|
||||
)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
|
||||
@ -394,6 +395,7 @@ def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> List[Dict[str, s
|
||||
|
||||
def _parse_tag(
|
||||
tag_elem: etree._Element,
|
||||
include_tail_text: bool = True,
|
||||
) -> Optional[Element]:
|
||||
"""Parses `tag_elem` to a Text element if it contains qualifying text.
|
||||
|
||||
@ -419,7 +421,7 @@ def _parse_tag(
|
||||
|
||||
if tag_elem.tag == "script":
|
||||
return None
|
||||
text = _construct_text(tag_elem)
|
||||
text = _construct_text(tag_elem, include_tail_text)
|
||||
if not text:
|
||||
return None
|
||||
return _text_to_element(
|
||||
@ -510,7 +512,9 @@ def _is_container_with_text(tag_elem: etree._Element) -> bool:
|
||||
if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
|
||||
return False
|
||||
|
||||
if tag_elem.text is None or tag_elem.text.strip() == "":
|
||||
tag_elem_text = tag_elem.text.strip() if tag_elem.text else None
|
||||
tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
|
||||
if not tag_elem_text and not tag_elem_tail:
|
||||
return False
|
||||
|
||||
return True
|
||||
@ -597,6 +601,29 @@ def _is_text_tag(
|
||||
return False
|
||||
|
||||
|
||||
def _process_text_tag(
    tag_elem: etree._Element,
    include_tail_text: bool = True,
) -> tuple[list[Element], tuple[etree._Element, ...]]:
    """Parse `tag_elem` into document elements and collect its descendant tags.

    When `_has_break_tags()` reports break tags in `tag_elem`, each piece returned by
    `_unfurl_break_tags()` is parsed separately; otherwise `tag_elem` is parsed as a whole.
    Any piece for which `_parse_tag()` returns `None` contributes no element.

    `include_tail_text` is forwarded unchanged to `_parse_tag()`.

    Returns a pair `(page_elements, descendant_tag_elems)`: the elements produced, in
    document order, and a tuple of every descendant of `tag_elem`.
    """
    # -- a tag with break tags is parsed piece-by-piece, any other tag in one go --
    candidate_elems = _unfurl_break_tags(tag_elem) if _has_break_tags(tag_elem) else [tag_elem]

    parsed = (_parse_tag(candidate, include_tail_text) for candidate in candidate_elems)
    page_elements = [element for element in parsed if element is not None]

    # -- collected after parsing, same as before; presumably the caller uses these to
    # -- avoid processing the descendants a second time (verify against caller) --
    descendant_tag_elems = tuple(tag_elem.iterdescendants())

    return page_elements, descendant_tag_elems
|
||||
|
||||
|
||||
def _process_list_item(
|
||||
tag_elem: etree._Element,
|
||||
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
|
||||
|
Loading…
x
Reference in New Issue
Block a user