rfctr(html): improve SNR in HTMLDocument (#3162)

**Summary**
Remove dead code and organize helpers of HTMLDocument in preparation for
improvements and bug-fixes to follow.
This commit is contained in:
Steve Canny 2024-06-06 14:21:33 -07:00 committed by GitHub
parent 8378ddaa3b
commit a883fc9df2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 399 additions and 542 deletions

View File

@ -1,4 +1,4 @@
## 0.14.5-dev5 ## 0.14.5-dev6
### Enhancements ### Enhancements

View File

@ -1,4 +1,5 @@
# pyright: reportPrivateUsage=false # pyright: reportPrivateUsage=false
# pyright: reportUnusedFunction=false
"""Test suite for `unstructured.documents.html` module.""" """Test suite for `unstructured.documents.html` module."""
@ -27,15 +28,14 @@ from unstructured.documents.elements import (
Text, Text,
Title, Title,
) )
from unstructured.documents.html import ( from unstructured.documents.html import HTMLDocument, _parse_HTMLTable_from_table_elem
from unstructured.documents.html_elements import (
HTMLAddress, HTMLAddress,
HTMLDocument,
HTMLNarrativeText, HTMLNarrativeText,
HTMLTable, HTMLTable,
HTMLText, HTMLText,
HTMLTitle, HTMLTitle,
TagsMixin, TagsMixin,
_parse_HTMLTable_from_table_elem,
) )
TAGS = ( TAGS = (
@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path):
f.write( f.write(
"<html>\n" "<html>\n"
" <body>\n" " <body>\n"
" <header>\n"
" <p>Here is a header. We want to ignore anything that is in this section.</p>\n"
" </header>\n"
" <h1>A Great and Glorious Section</h1>\n" " <h1>A Great and Glorious Section</h1>\n"
" <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n" " <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n"
" <p></p>\n" " <p></p>\n"
@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path):
" <table>\n" " <table>\n"
" <tbody>\n" " <tbody>\n"
" <tr>\n" " <tr>\n"
" <td><p>Skip me because I'm in a table</p></td>\n" " <td><p>I'm in a table</p></td>\n"
" </tr>\n" " </tr>\n"
" </tbody>\n" " </tbody>\n"
" </table>\n" " </table>\n"
" <hr>\n" " <hr>\n"
" <h2>A New Beginning</h2>\n" " <h2>A New Beginning</h2>\n"
" <div>Here is the start of a new page.</div>\n" " <div>Here is the start of a new page.</div>\n"
" <footer>\n"
" <p>Here is a footer. We want to ignore anything that is in this section</p>\n"
" </footer>\n"
" <div>\n"
" <p>Let's ignore anything after the footer too since it's probably garbage.</p>\n"
" </div>\n"
" </body>\n" " </body>\n"
"</html>\n" "</html>\n"
) )
html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners( html_document = HTMLDocument.from_file(filename)
skip_headers_and_footers=True, skip_table=True
)
assert len(html_document.pages) == 2 assert len(html_document.pages) == 2
assert all(isinstance(p, Page) for p in html_document.pages) assert all(isinstance(p, Page) for p in html_document.pages)
# -- # --
page_one = html_document.pages[0] p = html_document.pages[0]
assert len(page_one.elements) == 4 assert len(p.elements) == 5
assert page_one.elements == [ assert p.elements == [
Title("A Great and Glorious Section"), Title("A Great and Glorious Section"),
NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"), NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
Title("Another Magnificent Title"), Title("Another Magnificent Title"),
NarrativeText("The prior element is a title based on its capitalization patterns!"), NarrativeText("The prior element is a title based on its capitalization patterns!"),
Table("I'm in a table"),
] ]
# -- # --
page_two = html_document.pages[1] p = html_document.pages[1]
assert len(page_two.elements) == 2 assert len(p.elements) == 2
assert page_two.elements == [ assert p.elements == [
Title("A New Beginning"), Title("A New Beginning"),
NarrativeText("Here is the start of a new page."), NarrativeText("Here is the start of a new page."),
] ]
@ -348,64 +338,6 @@ def test_HTMLDocument_can_construct_from_existing_pages():
assert html_document.pages == [page] assert html_document.pages == [page]
# -- HTMLDocument.doc_after_cleaners() -----------------------------------------------------------
def test_include_headers_and_footers(sample_doc: HTMLDocument):
html_document = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
assert len(html_document.pages[1].elements) == 3
def test_read_without_skipping_table(is_possible_narrative_text_: Mock):
is_possible_narrative_text_.return_value = True
document = HTMLDocument.from_string(
"<html>\n"
" <body>\n"
" <table>\n"
" <tbody>\n"
" <tr>\n"
" <td><p>Hi there! I am Matt!</p></td>\n"
" </tr>\n"
" </tbody>\n"
" </table>\n"
" </body>\n"
"</html>\n"
).doc_after_cleaners(skip_table=False)
assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!")
def test_include_table_text(sample_doc: HTMLDocument):
html_document = sample_doc.doc_after_cleaners(skip_table=False)
assert len(html_document.pages[0].elements) == 2
def test_tag_types_table(sample_doc: HTMLDocument):
html_document = sample_doc.doc_after_cleaners(skip_table=True)
assert len(html_document.pages[0].elements) == 2
def test_cleaner_raises_on_non_element_elements(sample_doc: HTMLDocument, pages_prop_: Mock):
page = Page(0)
page.elements = [
"this should def not be a string" # pyright: ignore[reportAttributeAccessIssue]
]
pages_prop_.return_value = [page]
with pytest.raises(ValueError):
sample_doc.doc_after_cleaners()
def test_cleaner_can_filter_out_tables_in_place():
doc = HTMLDocument.from_string(
"<table><tbody><tr><td>A table thing.</td></tr></tbody></table>\n"
"<p>A non-table thing</p>\n"
)
assert len(doc.elements) == 2
doc.doc_after_cleaners(skip_table=True, inplace=True)
assert len(doc.elements) == 1
# -- HTMLDocument.elements ----------------------------------------------------------------------- # -- HTMLDocument.elements -----------------------------------------------------------------------
@ -429,19 +361,16 @@ def test_parses_tags_correctly():
def test_nested_text_tags(): def test_nested_text_tags():
tag1, tag2 = [tag for tag in html.TEXT_TAGS if tag not in html.TABLE_TAGS][:2] html_document = HTMLDocument.from_string(
html_str = ( "<body>\n"
f"<body>\n" " <p>\n"
f" <{tag1}>\n" " <a>\n"
f" <{tag2}>\n" " There is some text here.\n"
f" There is some text here.\n" " </a>\n"
f" </{tag2}>\n" " </p>\n"
f" </{tag1}>\n" "</body>\n"
f"</body>\n"
) )
html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)
assert len(html_document.pages[0].elements) == 1 assert len(html_document.pages[0].elements) == 1
@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str):
assert len(html_document.pages) == 0 assert len(html_document.pages) == 0
# -- has_table_ancestor() ------------------------------------------------------------------------
def test_has_table_ancestor():
title = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"])
assert html.has_table_ancestor(title)
def test_has_no_table_ancestor():
title = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"])
assert not html.has_table_ancestor(title)
# -- _bulleted_text_from_table() ----------------------------------------------------------------- # -- _bulleted_text_from_table() -----------------------------------------------------------------
@ -856,8 +772,8 @@ def test_parse_nothing():
assert parsed_el is None assert parsed_el is None
def test_parse_not_anything(is_narrative_tag_: Mock, is_possible_title_: Mock): def test_parse_not_anything(_is_narrative_tag_: Mock, is_possible_title_: Mock): # noqa: PT019
is_narrative_tag_.return_value = False _is_narrative_tag_.return_value = False
is_possible_title_.return_value = False is_possible_title_.return_value = False
doc = """<p>This is nothing</p>""" doc = """<p>This is nothing</p>"""
document_tree = etree.fromstring(doc, etree.HTMLParser()) document_tree = etree.fromstring(doc, etree.HTMLParser())
@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text():
document_tree = etree.fromstring(doc, etree.HTMLParser()) document_tree = etree.fromstring(doc, etree.HTMLParser())
el = document_tree.find(".//div") el = document_tree.find(".//div")
assert el is not None assert el is not None
assert html.is_list_item_tag(el) is True assert html._is_list_item_tag(el) is True
parsed_el, _ = html._process_list_item(el) parsed_el, _ = html._process_list_item(el)
assert parsed_el is None assert parsed_el is None
@ -1071,8 +987,8 @@ class Describe_parse_HTMLTable_from_table_elem:
@pytest.fixture @pytest.fixture
def is_narrative_tag_(request: FixtureRequest): def _is_narrative_tag_(request: FixtureRequest):
return function_mock(request, "unstructured.documents.html.is_narrative_tag") return function_mock(request, "unstructured.documents.html._is_narrative_tag")
@pytest.fixture @pytest.fixture

View File

@ -28,7 +28,7 @@ from unstructured.documents.elements import (
TableChunk, TableChunk,
Title, Title,
) )
from unstructured.documents.html import HTMLTable, TagsMixin from unstructured.documents.html_elements import HTMLTable, TagsMixin
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
# -- document-source (filename, file, text, url) ------------------------------------------------- # -- document-source (filename, file, text, url) -------------------------------------------------

View File

@ -1 +1 @@
__version__ = "0.14.5-dev5" # pragma: no cover __version__ = "0.14.5-dev6" # pragma: no cover

View File

@ -2,26 +2,21 @@
from __future__ import annotations from __future__ import annotations
from typing import Any, Callable, Final, Iterator, Optional, Sequence, cast from typing import Final, Iterator, Optional, cast
from lxml import etree from lxml import etree
from unstructured.cleaners.core import ( from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
clean_bullets,
replace_unicode_quotes,
)
from unstructured.documents.base import Page from unstructured.documents.base import Page
from unstructured.documents.elements import ( from unstructured.documents.elements import Element, ElementMetadata, Link
Address, from unstructured.documents.html_elements import (
Element, HTMLAddress,
ElementMetadata, HTMLEmailAddress,
EmailAddress, HTMLListItem,
Link, HTMLNarrativeText,
ListItem, HTMLTable,
NarrativeText, HTMLText,
Table, HTMLTitle,
Text,
Title,
) )
from unstructured.documents.xml import VALID_PARSERS, XMLDocument from unstructured.documents.xml import VALID_PARSERS, XMLDocument
from unstructured.logger import logger from unstructured.logger import logger
@ -47,82 +42,6 @@ HEADER_OR_FOOTER_TAGS: Final[list[str]] = ["header", "footer"]
SECTION_TAGS: Final[list[str]] = ["div", "pre"] SECTION_TAGS: Final[list[str]] = ["div", "pre"]
# -- HTML-specific document-elements and methods -------------------------------------------------
class TagsMixin:
"""Mixin that allows a class to retain tag information."""
def __init__(
self,
*args: Any,
tag: Optional[str] = None,
ancestortags: Sequence[str] = (),
links: Sequence[Link] = [],
emphasized_texts: Sequence[dict[str, str]] = [],
text_as_html: Optional[str] = None,
**kwargs: Any,
):
if tag is None:
raise TypeError("tag argument must be passed and not None")
else:
self.tag = tag
self.ancestortags = ancestortags
self.links = links
self.emphasized_texts = emphasized_texts
self.text_as_html = text_as_html
super().__init__(*args, **kwargs)
class HTMLText(TagsMixin, Text):
"""Text with tag information."""
class HTMLAddress(TagsMixin, Address):
"""Address with tag information."""
class HTMLEmailAddress(TagsMixin, EmailAddress):
"""EmailAddress with tag information"""
class HTMLTitle(TagsMixin, Title):
"""Title with tag information."""
class HTMLNarrativeText(TagsMixin, NarrativeText):
"""NarrativeText with tag information."""
class HTMLListItem(TagsMixin, ListItem):
"""NarrativeText with tag information."""
class HTMLTable(TagsMixin, Table):
"""NarrativeText with tag information"""
def has_table_ancestor(element: TagsMixin) -> bool:
"""Checks to see if an element has ancestors that are table elements. If so, we consider
it to be a table element rather than a section of narrative text."""
return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags)
def in_header_or_footer(element: TagsMixin) -> bool:
"""Checks to see if an element is contained within a header or a footer tag."""
if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags):
return True
return False
def is_table(element: TagsMixin) -> bool:
"""Checks to see if an element is a table"""
return element.tag in TABLE_TAGS
# -- HTML element-tree processing ----------------------------------------------------------------
class HTMLDocument(XMLDocument): class HTMLDocument(XMLDocument):
"""Class for handling HTML documents. Uses rules based parsing to identify sections """Class for handling HTML documents. Uses rules based parsing to identify sections
of interest within the document.""" of interest within the document."""
@ -200,7 +119,7 @@ class HTMLDocument(XMLDocument):
page.elements.extend(bulleted_text) page.elements.extend(bulleted_text)
descendanttag_elems = tuple(tag_elem.iterdescendants()) descendanttag_elems = tuple(tag_elem.iterdescendants())
elif is_list_item_tag(tag_elem): elif _is_list_item_tag(tag_elem):
element, next_element = _process_list_item(tag_elem) element, next_element = _process_list_item(tag_elem)
if element is not None: if element is not None:
page.elements.append(element) page.elements.append(element)
@ -228,67 +147,170 @@ class HTMLDocument(XMLDocument):
return pages return pages
def doc_after_cleaners(
self,
skip_headers_and_footers: bool = False,
skip_table: bool = False,
inplace: bool = False,
) -> HTMLDocument:
"""Filters elements returning new instance based on the criteria specified.
Note that the number of pages can change in the case that all elements on a page are # -- candidate HTMLDocument methods --------------------------------------------------------------
filtered out.
Parameters
----------
skip_table:
If True, skips table element
skip_headers_and_footers:
If True, ignores any content that is within <header> or <footer> tags
inplace:
If True, document is modified in place and returned.
"""
excluders: list[Callable[[TagsMixin], bool]] = []
if skip_headers_and_footers:
excluders.append(in_header_or_footer)
if skip_table:
excluders.append(is_table)
pages: list[Page] = [] def _find_articles(root: etree._Element, assemble_articles: bool = True) -> list[etree._Element]:
page_number = 0 """Parse articles from `root` of an HTML document.
new_page = Page(number=page_number)
for page in self.pages: Each `<article>` element in the HTML becomes its own "sub-document" (article). If no article
elements: list[Element] = [] elements are present, the entire document (`root`) is returned as the single document article.
for el in page.elements: """
if not isinstance(el, TagsMixin): if assemble_articles is False:
raise ValueError( return [root]
f"elements of class {self.__class__} should be of type HTMLTitle "
f"HTMLNarrativeText, or HTMLListItem but " articles = root.findall(".//article")
f"object has an element of type {type(el)}", if len(articles) == 0:
) # NOTE(robinson) - ref: https://schema.org/Article
if not any(excluder(el) for excluder in excluders): articles = root.findall(".//div[@itemprop='articleBody']")
elements.append(el) return [root] if len(articles) == 0 else articles
if skip_headers_and_footers and "footer" in tuple(el.ancestortags) + (el.tag,):
break
if elements: def _find_main(root: etree._Element) -> etree._Element:
new_page.elements = elements """The first <main> tag under `root` if it exists, othewise `root`."""
pages.append(new_page) main_tag_elem = root.find(".//main")
page_number += 1 return main_tag_elem if main_tag_elem is not None else root
new_page = Page(number=page_number)
if inplace:
self._pages = pages # -- tag classifiers -----------------------------------------------------------------------------
self._elements = None
return self
else: def _is_bulleted_table(table_elem: etree._Element) -> bool:
out = self.__class__.from_pages(pages) """True when all text in `table_elem` is bulleted text.
if not isinstance(out, HTMLDocument):
# NOTE(robinson) - Skipping for test coverage because this condition is impossible. A table-row containing no text is not considered, but at least one bulleted-text item must be
# Added type check because from_pages is a method on Document. Without the type present. A table with no text in any row is not a bulleted table.
# check, mypy complains about returning Document instead of HTMLDocument """
raise ValueError( if table_elem.tag != "table":
f"Unexpected class: {self.__class__.__name__}", return False
) # pragma: no cover
return out trs = table_elem.findall(".//tr")
tr_texts = [_construct_text(tr) for tr in trs]
# -- a table with no text is not a bulleted table --
if all(not text for text in tr_texts):
return False
# -- all non-empty rows must contain bulleted text --
if any(text and not is_bulleted_text(text) for text in tr_texts):
return False
return True
def _is_container_with_text(tag_elem: etree._Element) -> bool:
"""Checks if a tag is a container that also happens to contain text.
Example
-------
<div>Hi there,
<div>This is my message.</div>
<div>Please read my message!</div>
</div>
"""
if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
return False
tag_elem_text = tag_elem.text.strip() if tag_elem.text else None
tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
if not tag_elem_text and not tag_elem_tail:
return False
return True
def _is_list_item_tag(tag_elem: etree._Element) -> bool:
"""True when `tag_elem` contains bulleted text."""
return tag_elem.tag in LIST_ITEM_TAGS or (
tag_elem.tag in SECTION_TAGS and is_bulleted_text(_construct_text(tag_elem))
)
def _is_text_tag(
tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN
) -> bool:
"""True when `tag_element` potentially contains narrative text."""
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
# it could be the text representation of a giant div
# Exclude empty tags from tag_elem
empty_elems_len = len([el for el in tag_elem if el.tag in EMPTY_TAGS])
if len(tag_elem) > max_predecessor_len + empty_elems_len:
return False
if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
return True
# NOTE(robinson) - This indicates that a div tag has no children. If that's the
# case and the tag has text, its potential a text tag
children = list(tag_elem)
if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
return True
if _has_adjacent_bulleted_spans(tag_elem, children):
return True
return False
# -- tag processors ------------------------------------------------------------------------------
def _bulleted_text_from_table(table: etree._Element) -> list[Element]:
"""Extracts bulletized narrative text from the `<table>` element in `table`.
NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted. I.e., _read()
will drop non-bullet narrative text in the table.
"""
bulleted_text: list[Element] = []
rows = table.findall(".//tr")
for row in rows:
text = _construct_text(row)
if is_bulleted_text(text):
bulleted_text.append(HTMLListItem(text=clean_bullets(text), tag=row.tag))
return bulleted_text
def _construct_text(tag_elem: etree._Element, include_tail_text: bool = True) -> str:
"""Extract "clean"" text from `tag_elem`."""
text = "".join(str(t) for t in tag_elem.itertext() if t)
if include_tail_text and tag_elem.tail:
text = text + tag_elem.tail
text = replace_unicode_quotes(text)
return text.strip()
def _get_bullet_descendants(
element: Optional[etree._Element], next_element: Optional[etree._Element]
) -> tuple[etree._Element, ...]:
"""Helper for list-item processing.
Gathers the descendants of `next_element` so they can be marked visited.
"""
return () if element is None or next_element is None else tuple(next_element.iterdescendants())
def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> list[dict[str, str]]:
"""Emphasized text within and below `tag_element`.
Emphasis is indicated by `<strong>`, `<em>`, `<span>`, `<b>`, `<i>` tags.
"""
emphasized_texts: list[dict[str, str]] = []
tags_to_track = ["strong", "em", "span", "b", "i"]
if tag_elem.tag in tags_to_track:
text = _construct_text(tag_elem, False)
if text:
emphasized_texts.append({"text": text, "tag": tag_elem.tag})
for descendant_tag_elem in tag_elem.iterdescendants(*tags_to_track):
text = _construct_text(descendant_tag_elem, False)
if text:
emphasized_texts.append({"text": text, "tag": descendant_tag_elem.tag})
return emphasized_texts
def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]: def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]:
@ -313,27 +335,23 @@ def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]:
return links return links
def _is_bulleted_table(table_elem: etree._Element) -> bool: def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: list[etree._Element]) -> bool:
"""True when all text in `table_elem` is bulleted text. """True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.
A table-row containing no text is not considered, but at least one bulleted-text item must be A bulleted span is one beginning with a bullet. If there are two or more adjacent to each other
present. A table with no text in any row is not a bulleted table. they are treated as a single bulleted text element.
""" """
if table_elem.tag != "table": if tag_elem.tag in SECTION_TAGS:
return False all_spans = all(child.tag == "span" for child in children)
_is_bulleted = children[0].text is not None and is_bulleted_text(children[0].text)
if all_spans and _is_bulleted:
return True
return False
trs = table_elem.findall(".//tr")
tr_texts = [_construct_text(tr) for tr in trs]
# -- a table with no text is not a bulleted table -- def _has_break_tags(tag_elem: etree._Element) -> bool:
if all(not text for text in tr_texts): """True when `tab_elem` contains a `<br>` descendant."""
return False return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants())
# -- all non-empty rows must contain bulleted text --
if any(text and not is_bulleted_text(text) for text in tr_texts):
return False
return True
def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Element]: def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Element]:
@ -378,27 +396,6 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
) )
def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> list[dict[str, str]]:
"""Emphasized text within and below `tag_element`.
Emphasis is indicated by `<strong>`, `<em>`, `<span>`, `<b>`, `<i>` tags.
"""
emphasized_texts: list[dict[str, str]] = []
tags_to_track = ["strong", "em", "span", "b", "i"]
if tag_elem.tag in tags_to_track:
text = _construct_text(tag_elem, False)
if text:
emphasized_texts.append({"text": text, "tag": tag_elem.tag})
for descendant_tag_elem in tag_elem.iterdescendants(*tags_to_track):
text = _construct_text(descendant_tag_elem, False)
if text:
emphasized_texts.append({"text": text, "tag": descendant_tag_elem.tag})
return emphasized_texts
def _parse_tag( def _parse_tag(
tag_elem: etree._Element, tag_elem: etree._Element,
include_tail_text: bool = True, include_tail_text: bool = True,
@ -440,193 +437,6 @@ def _parse_tag(
) )
def _text_to_element(
text: str,
tag: str,
ancestortags: tuple[str, ...],
depth: int,
links: list[Link] = [],
emphasized_texts: list[dict[str, str]] = [],
) -> Optional[Element]:
"""Produce a document-element of the appropriate sub-type for `text`."""
if is_bulleted_text(text):
if not clean_bullets(text):
return None
return HTMLListItem(
text=clean_bullets(text),
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
metadata=ElementMetadata(category_depth=depth),
)
elif is_us_city_state_zip(text):
return HTMLAddress(
text=text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
elif is_email_address(text):
return HTMLEmailAddress(
text=text,
tag=tag,
links=links,
emphasized_texts=emphasized_texts,
)
if len(text) < 2:
return None
elif is_narrative_tag(text, tag):
return HTMLNarrativeText(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
elif is_heading_tag(tag) or is_possible_title(text):
return HTMLTitle(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
metadata=ElementMetadata(category_depth=depth),
)
else:
return HTMLText(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
def _is_container_with_text(tag_elem: etree._Element) -> bool:
"""Checks if a tag is a container that also happens to contain text.
Example
-------
<div>Hi there,
<div>This is my message.</div>
<div>Please read my message!</div>
</div>
"""
if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
return False
tag_elem_text = tag_elem.text.strip() if tag_elem.text else None
tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
if not tag_elem_text and not tag_elem_tail:
return False
return True
def is_narrative_tag(text: str, tag: str) -> bool:
"""Uses tag information to infer whether text is narrative."""
return tag not in HEADING_TAGS and is_possible_narrative_text(text)
def is_heading_tag(tag: str) -> bool:
"""Uses tag information to infer whether text is a heading."""
return tag in HEADING_TAGS
def _construct_text(tag_elem: etree._Element, include_tail_text: bool = True) -> str:
"""Extract "clean"" text from `tag_elem`."""
text = "".join(str(t) for t in tag_elem.itertext() if t)
if include_tail_text and tag_elem.tail:
text = text + tag_elem.tail
text = replace_unicode_quotes(text)
return text.strip()
def _has_break_tags(tag_elem: etree._Element) -> bool:
"""True when `tab_elem` contains a `<br>` descendant."""
return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants())
def _unfurl_break_tags(tag_elem: etree._Element) -> list[etree._Element]:
"""Sequence of `tag_elem` and its children with `<br>` elements removed.
NOTE that these are "loose" `etree._Element` instances that are NOT linked to the original HTML
element-tree, so methods like `.getchildren()`, `.find()` etc. will happily produce empty
results.
"""
unfurled: list[etree._Element] = []
if tag_elem.text:
_tag_elem = etree.Element(tag_elem.tag)
_tag_elem.text = tag_elem.text
unfurled.append(_tag_elem)
for child in tag_elem:
if not _has_break_tags(child):
unfurled.append(child)
else:
if child.text:
_tag_elem = etree.Element(child.tag)
_tag_elem.text = child.text
unfurled.append(_tag_elem)
unfurled.extend(_unfurl_break_tags(child))
return unfurled
def _is_text_tag(
tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN
) -> bool:
"""True when `tag_element` potentially contains narrative text."""
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
# it could be the text representation of a giant div
# Exclude empty tags from tag_elem
empty_elems_len = len([el for el in tag_elem if el.tag in EMPTY_TAGS])
if len(tag_elem) > max_predecessor_len + empty_elems_len:
return False
if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
return True
# NOTE(robinson) - This indicates that a div tag has no children. If that's the
# case and the tag has text, its potential a text tag
children = list(tag_elem)
if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
return True
if _has_adjacent_bulleted_spans(tag_elem, children):
return True
return False
def _process_text_tag(
tag_elem: etree._Element, include_tail_text: bool = True
) -> tuple[list[Element], tuple[etree._Element, ...]]:
"""Produces a document element from `tag_elem`."""
page_elements: list[Element] = []
if _has_break_tags(tag_elem):
flattened_elems = _unfurl_break_tags(tag_elem)
for _tag_elem in flattened_elems:
element = _parse_tag(_tag_elem, include_tail_text)
if element is not None:
page_elements.append(element)
else:
element = _parse_tag(tag_elem, include_tail_text)
if element is not None:
page_elements.append(element)
descendant_tag_elems = tuple(tag_elem.iterdescendants())
return page_elements, descendant_tag_elems
def _process_list_item( def _process_list_item(
tag_elem: etree._Element, tag_elem: etree._Element,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
@ -671,69 +481,131 @@ def _process_list_item(
return None, None return None, None
def _get_bullet_descendants( def _process_text_tag(
element: Optional[etree._Element], next_element: Optional[etree._Element] tag_elem: etree._Element, include_tail_text: bool = True
) -> tuple[etree._Element, ...]: ) -> tuple[list[Element], tuple[etree._Element, ...]]:
"""Helper for list-item processing. """Produces a document element from `tag_elem`."""
Gathers the descendants of `next_element` so they can be marked visited. page_elements: list[Element] = []
if _has_break_tags(tag_elem):
flattened_elems = _unfurl_break_tags(tag_elem)
for _tag_elem in flattened_elems:
element = _parse_tag(_tag_elem, include_tail_text)
if element is not None:
page_elements.append(element)
else:
element = _parse_tag(tag_elem, include_tail_text)
if element is not None:
page_elements.append(element)
descendant_tag_elems = tuple(tag_elem.iterdescendants())
return page_elements, descendant_tag_elems
def _unfurl_break_tags(tag_elem: etree._Element) -> list[etree._Element]:
"""Sequence of `tag_elem` and its children with `<br>` elements removed.
NOTE that these are "loose" `etree._Element` instances that are NOT linked to the original HTML
element-tree, so methods like `.getchildren()`, `.find()` etc. will happily produce empty
results.
""" """
return () if element is None or next_element is None else tuple(next_element.iterdescendants()) unfurled: list[etree._Element] = []
if tag_elem.text:
_tag_elem = etree.Element(tag_elem.tag)
_tag_elem.text = tag_elem.text
unfurled.append(_tag_elem)
for child in tag_elem:
if not _has_break_tags(child):
unfurled.append(child)
else:
if child.text:
_tag_elem = etree.Element(child.tag)
_tag_elem.text = child.text
unfurled.append(_tag_elem)
unfurled.extend(_unfurl_break_tags(child))
return unfurled
def is_list_item_tag(tag_elem: etree._Element) -> bool: # -- text-element classifier ---------------------------------------------------------------------
"""True when `tag_elem` contains bulleted text."""
return tag_elem.tag in LIST_ITEM_TAGS or (
tag_elem.tag in SECTION_TAGS and is_bulleted_text(_construct_text(tag_elem))
)
def _bulleted_text_from_table(table: etree._Element) -> list[Element]: def _text_to_element(
"""Extracts bulletized narrative text from the `<table>` element in `table`. text: str,
tag: str,
ancestortags: tuple[str, ...],
depth: int,
links: list[Link] = [],
emphasized_texts: list[dict[str, str]] = [],
) -> Optional[Element]:
"""Produce a document-element of the appropriate sub-type for `text`."""
if is_bulleted_text(text):
if not clean_bullets(text):
return None
return HTMLListItem(
text=clean_bullets(text),
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
metadata=ElementMetadata(category_depth=depth),
)
elif is_us_city_state_zip(text):
return HTMLAddress(
text=text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
elif is_email_address(text):
return HTMLEmailAddress(
text=text,
tag=tag,
links=links,
emphasized_texts=emphasized_texts,
)
NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted. I.e., _read() if len(text) < 2:
will drop non-bullet narrative text in the table. return None
""" elif _is_narrative_tag(text, tag):
bulleted_text: list[Element] = [] return HTMLNarrativeText(
rows = table.findall(".//tr") text,
for row in rows: tag=tag,
text = _construct_text(row) ancestortags=ancestortags,
if is_bulleted_text(text): links=links,
bulleted_text.append(HTMLListItem(text=clean_bullets(text), tag=row.tag)) emphasized_texts=emphasized_texts,
return bulleted_text )
elif _is_heading_tag(tag) or is_possible_title(text):
return HTMLTitle(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
metadata=ElementMetadata(category_depth=depth),
)
else:
return HTMLText(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)
def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: list[etree._Element]) -> bool:
    """True when `tag_elem` is a section tag whose children are all bulleted-list spans.

    Concretely, this is True when `tag_elem.tag` is in `SECTION_TAGS`, every element in `children`
    is a `<span>`, and the first child's text is bulleted according to `is_bulleted_text()`.
    Such a run of spans is treated as a single bulleted text element.

    Returns False when `children` is empty.
    """
    # -- no children means no spans; this also guards the `children[0]` access below, which
    # -- would otherwise raise IndexError on an empty list.
    if not children:
        return False
    if tag_elem.tag not in SECTION_TAGS:
        return False

    first_text = children[0].text
    if first_text is None or not is_bulleted_text(first_text):
        return False

    return all(child.tag == "span" for child in children)


# -- HTML-specific text classifiers --------------------------------------------------------------
def _is_heading_tag(tag: str) -> bool:
    """True when `tag` is one of the heading tags in `HEADING_TAGS`."""
    return tag in HEADING_TAGS


def _find_main(root: etree._Element) -> etree._Element:
    """The first `<main>` descendant of `root` when there is one, otherwise `root` itself."""
    main = root.find(".//main")
    if main is None:
        return root
    return main
def _is_narrative_tag(text: str, tag: str) -> bool:
    """True when `tag` is not a heading tag and `text` is possible narrative text."""
    if tag in HEADING_TAGS:
        return False
    return is_possible_narrative_text(text)


def _find_articles(root: etree._Element, assemble_articles: bool = True) -> list[etree._Element]:
    """Split the HTML document under `root` into its articles.

    Every `<article>` element is treated as its own "sub-document". When there are none, `<div>`
    elements marked `itemprop='articleBody'` are used instead. When neither is present, or when
    `assemble_articles` is `False`, `root` alone is returned as the single article.
    """
    if assemble_articles is False:
        return [root]

    # NOTE(robinson) - ref: https://schema.org/Article
    found = root.findall(".//article") or root.findall(".//div[@itemprop='articleBody']")
    return found or [root]

View File

@ -0,0 +1,68 @@
"""Document elements specific to the HTML partitioner."""
from __future__ import annotations
from typing import Any, Dict, Optional, Sequence
from unstructured.documents.elements import (
Address,
EmailAddress,
Link,
ListItem,
NarrativeText,
Table,
Text,
Title,
)
class TagsMixin:
    """Mixin that allows a class to retain tag information.

    Stores the HTML-specific attributes passed as keyword arguments on the instance and forwards
    all remaining positional and keyword arguments to the next class in the MRO.
    """

    def __init__(
        self,
        *args: Any,
        tag: Optional[str] = None,
        ancestortags: Sequence[str] = (),
        links: Optional[Sequence[Link]] = None,
        emphasized_texts: Optional[Sequence[Dict[str, str]]] = None,
        text_as_html: Optional[str] = None,
        **kwargs: Any,
    ):
        """Record tag information, then continue cooperative initialization.

        Args:
            tag: The HTML tag for this element. Required, despite its `None` default.
            ancestortags: Tags of the ancestors of `tag`.
            links: Links associated with the element. A fresh empty list is used when omitted.
            emphasized_texts: Emphasized-text records associated with the element. A fresh empty
                list is used when omitted.
            text_as_html: Optional HTML representation of the element.

        Raises:
            TypeError: When `tag` is omitted or `None`.
        """
        if tag is None:
            raise TypeError("tag argument must be passed and not None")

        self.tag = tag
        self.ancestortags = ancestortags
        # -- `None` defaults give each instance its own list; a `[]` default would be a single
        # -- list object shared by every instance created without these arguments.
        self.links = [] if links is None else links
        self.emphasized_texts = [] if emphasized_texts is None else emphasized_texts
        self.text_as_html = text_as_html
        super().__init__(*args, **kwargs)
class HTMLText(TagsMixin, Text):
    """`Text` element that also retains HTML tag information via `TagsMixin`."""
class HTMLAddress(TagsMixin, Address):
    """`Address` element that also retains HTML tag information via `TagsMixin`."""
class HTMLEmailAddress(TagsMixin, EmailAddress):
    """`EmailAddress` element that also retains HTML tag information via `TagsMixin`."""
class HTMLTitle(TagsMixin, Title):
    """`Title` element that also retains HTML tag information via `TagsMixin`."""
class HTMLNarrativeText(TagsMixin, NarrativeText):
    """`NarrativeText` element that also retains HTML tag information via `TagsMixin`."""
class HTMLListItem(TagsMixin, ListItem):
    """`ListItem` element that also retains HTML tag information via `TagsMixin`."""
class HTMLTable(TagsMixin, Table):
    """`Table` element that also retains HTML tag information via `TagsMixin`."""

View File

@ -6,7 +6,8 @@ import requests
from unstructured.chunking import add_chunking_strategy from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument, TagsMixin from unstructured.documents.html import HTMLDocument
from unstructured.documents.html_elements import TagsMixin
from unstructured.documents.xml import VALID_PARSERS from unstructured.documents.xml import VALID_PARSERS
from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.file_conversion import convert_file_to_html_text from unstructured.file_utils.file_conversion import convert_file_to_html_text