rfctr(html): organize and improve HTMLDocument tests (#3161)

**Summary**
In preparation for further work on HTMLDocument, organize the organic
growth in `documents/tests_html.py` and improve typing and expression.

**Reviewers:** Commits are groomed and review is probably eased by going
commit-by-commit.
This commit is contained in:
Steve Canny 2024-06-06 11:16:02 -07:00 committed by GitHub
parent f1cab248ce
commit 8378ddaa3b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 708 additions and 663 deletions

View File

@ -1,4 +1,4 @@
## 0.14.5-dev4
## 0.14.5-dev5
### Enhancements

File diff suppressed because it is too large Load Diff

View File

@ -1 +1 @@
__version__ = "0.14.5-dev4" # pragma: no cover
__version__ = "0.14.5-dev5" # pragma: no cover

View File

@ -2,15 +2,7 @@
from __future__ import annotations
import sys
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
if sys.version_info < (3, 8):
from typing_extensions import Final
else:
from typing import Final
from typing import Any, Callable, Final, Iterator, Optional, Sequence, cast
from lxml import etree
@ -40,18 +32,19 @@ from unstructured.partition.text_type import (
is_possible_title,
is_us_city_state_zip,
)
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
from unstructured.utils import htmlify_matrix_of_cell_texts
TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "b", "font"]
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
LIST_TAGS: Final[List[str]] = ["ul", "ol", "dl"]
HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"]
TEXTBREAK_TAGS: Final[List[str]] = ["br"]
PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
EMPTY_TAGS: Final[List[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS
HEADER_OR_FOOTER_TAGS: Final[List[str]] = ["header", "footer"]
SECTION_TAGS: Final[List[str]] = ["div", "pre"]
TEXT_TAGS: Final[list[str]] = ["p", "a", "td", "span", "b", "font"]
LIST_ITEM_TAGS: Final[list[str]] = ["li", "dd"]
LIST_TAGS: Final[list[str]] = ["ul", "ol", "dl"]
HEADING_TAGS: Final[list[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
TABLE_TAGS: Final[list[str]] = ["table", "tbody", "td", "tr"]
TEXTBREAK_TAGS: Final[list[str]] = ["br"]
PAGEBREAK_TAGS: Final[list[str]] = ["hr"]
EMPTY_TAGS: Final[list[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS
HEADER_OR_FOOTER_TAGS: Final[list[str]] = ["header", "footer"]
SECTION_TAGS: Final[list[str]] = ["div", "pre"]
# -- HTML-specific document-elements and methods -------------------------------------------------
@ -66,7 +59,7 @@ class TagsMixin:
tag: Optional[str] = None,
ancestortags: Sequence[str] = (),
links: Sequence[Link] = [],
emphasized_texts: Sequence[Dict[str, str]] = [],
emphasized_texts: Sequence[dict[str, str]] = [],
text_as_html: Optional[str] = None,
**kwargs: Any,
):
@ -143,7 +136,7 @@ class HTMLDocument(XMLDocument):
self.assembled_articles = assemble_articles
super().__init__(stylesheet=stylesheet, parser=parser)
def _parse_pages_from_element_tree(self) -> List[Page]:
def _parse_pages_from_element_tree(self) -> list[Page]:
"""Parse HTML elements into pages.
A *page* is a subsequence of the document-elements parsed from the HTML document
@ -154,7 +147,7 @@ class HTMLDocument(XMLDocument):
if self._pages:
return self._pages
logger.info("Reading document ...")
pages: List[Page] = []
pages: list[Page] = []
etree.strip_elements(self.document_tree, ["script", "style"], with_tail=False)
root = _find_main(self.document_tree)
@ -162,7 +155,7 @@ class HTMLDocument(XMLDocument):
page_number = 0
page = Page(number=page_number)
for article in articles:
descendanttag_elems: Tuple[etree._Element, ...] = ()
descendanttag_elems: tuple[etree._Element, ...] = ()
for tag_elem in article.iter():
if tag_elem in descendanttag_elems:
# Prevent repeating something that's been flagged as text as we chase it
@ -180,6 +173,7 @@ class HTMLDocument(XMLDocument):
page.elements.extend(_page_elements)
# NOTE(christine): generate a separate element using a tag tail
assert tag_elem.tail is not None
element = _text_to_element(
tag_elem.tail,
tag_elem.tag,
@ -189,6 +183,7 @@ class HTMLDocument(XMLDocument):
else:
links = _get_links_from_tag(tag_elem)
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
assert tag_elem.text is not None
element = _text_to_element(
tag_elem.text,
tag_elem.tag,
@ -253,17 +248,17 @@ class HTMLDocument(XMLDocument):
inplace:
If True, document is modified in place and returned.
"""
excluders: List[Callable[[TagsMixin], bool]] = []
excluders: list[Callable[[TagsMixin], bool]] = []
if skip_headers_and_footers:
excluders.append(in_header_or_footer)
if skip_table:
excluders.append(is_table)
pages: List[Page] = []
pages: list[Page] = []
page_number = 0
new_page = Page(number=page_number)
for page in self.pages:
elements: List[Element] = []
elements: list[Element] = []
for el in page.elements:
if not isinstance(el, TagsMixin):
raise ValueError(
@ -296,9 +291,9 @@ class HTMLDocument(XMLDocument):
return out
def _get_links_from_tag(tag_elem: etree._Element) -> List[Link]:
def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]:
"""Hyperlinks within and below `tag_elem`."""
links: List[Link] = []
links: list[Link] = []
tag_elem_href = tag_elem.get("href")
if tag_elem_href:
tag_elem_text = _construct_text(tag_elem, False)
@ -351,7 +346,7 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
# -- cell within the table within the cell too.)
trs = cast(
List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
list[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
)
if not trs:
@ -360,12 +355,12 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
"""Generate the text of each cell in `tr`."""
# -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
tds = cast(List[etree._Element], tr.xpath("./td | ./th"))
tds = cast(list[etree._Element], tr.xpath("./td | ./th"))
for td in tds:
# -- a cell can contain other elements like spans etc. so we can't count on the text
# -- being directly below the `<td>` element. `.itertext()` gets all of it recursively.
# -- Filter out whitespace text nodes that result from HTML formatting.
stripped_text_nodes = (t.strip() for t in cast(Iterator[str], td.itertext()))
stripped_text_nodes = (t.strip() for t in td.itertext())
yield " ".join(t for t in stripped_text_nodes if t)
table_data = [list(iter_cell_texts(tr)) for tr in trs]
@ -383,12 +378,12 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
)
def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> List[Dict[str, str]]:
def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> list[dict[str, str]]:
"""Emphasized text within and below `tag_element`.
Emphasis is indicated by `<strong>`, `<em>`, `<span>`, `<b>`, `<i>` tags.
"""
emphasized_texts: List[Dict[str, str]] = []
emphasized_texts: list[dict[str, str]] = []
tags_to_track = ["strong", "em", "span", "b", "i"]
if tag_elem.tag in tags_to_track:
@ -414,7 +409,7 @@ def _parse_tag(
the document tree again. In the future we might want to keep descendants too, but we don't have
a use for them at the moment.
"""
ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
ancestortags: tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
links = _get_links_from_tag(tag_elem)
emphasized_texts = _get_emphasized_texts_from_tag(tag_elem)
@ -448,10 +443,10 @@ def _parse_tag(
def _text_to_element(
text: str,
tag: str,
ancestortags: Tuple[str, ...],
ancestortags: tuple[str, ...],
depth: int,
links: List[Link] = [],
emphasized_texts: List[Dict[str, str]] = [],
links: list[Link] = [],
emphasized_texts: list[dict[str, str]] = [],
) -> Optional[Element]:
"""Produce a document-element of the appropriate sub-type for `text`."""
if is_bulleted_text(text):
@ -557,22 +552,21 @@ def _has_break_tags(tag_elem: etree._Element) -> bool:
return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants())
def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]:
def _unfurl_break_tags(tag_elem: etree._Element) -> list[etree._Element]:
"""Sequence of `tag_elem` and its children with `<br>` elements removed.
NOTE that these are "loose" `etree._Element` instances that are NOT linked to the original HTML
element-tree, so methods like `.getchildren()`, `.find()` etc. will happily produce empty
results.
"""
unfurled: List[etree._Element] = []
unfurled: list[etree._Element] = []
if tag_elem.text:
_tag_elem = etree.Element(tag_elem.tag)
_tag_elem.text = tag_elem.text
unfurled.append(_tag_elem)
children = tag_elem.getchildren()
for child in children:
for child in tag_elem:
if not _has_break_tags(child):
unfurled.append(child)
else:
@ -586,14 +580,13 @@ def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]:
def _is_text_tag(
tag_elem: etree._Element,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN
) -> bool:
"""True when `tag_element` potentially contains narrative text."""
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
# it could be the text representation of a giant div
# Exclude empty tags from tag_elem
empty_elems_len = len([el for el in tag_elem.getchildren() if el.tag in EMPTY_TAGS])
empty_elems_len = len([el for el in tag_elem if el.tag in EMPTY_TAGS])
if len(tag_elem) > max_predecessor_len + empty_elems_len:
return False
@ -602,7 +595,7 @@ def _is_text_tag(
# NOTE(robinson) - This indicates that a div tag has no children. If that's the
# case and the tag has text, its potential a text tag
children = tag_elem.getchildren()
children = list(tag_elem)
if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
return True
@ -613,12 +606,11 @@ def _is_text_tag(
def _process_text_tag(
tag_elem: etree._Element,
include_tail_text: bool = True,
) -> tuple[list[Element], tuple[etree._Element]]:
tag_elem: etree._Element, include_tail_text: bool = True
) -> tuple[list[Element], tuple[etree._Element, ...]]:
"""Produces a document element from `tag_elem`."""
page_elements = []
page_elements: list[Element] = []
if _has_break_tags(tag_elem):
flattened_elems = _unfurl_break_tags(tag_elem)
for _tag_elem in flattened_elems:
@ -638,7 +630,7 @@ def _process_text_tag(
def _process_list_item(
tag_elem: etree._Element,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
) -> Tuple[Optional[Element], Optional[etree._Element]]:
) -> tuple[Optional[Element], Optional[etree._Element]]:
"""Produces an `HTMLListItem` document element from `tag_elem`.
When `tag_elem` contains bulleted text, the relevant bulleted text is extracted. Also returns
@ -670,9 +662,7 @@ def _process_list_item(
next_text = _construct_text(next_element)
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
# it could be the text representation of a giant div
empty_elems_len = len(
[el for el in tag_elem.getchildren() if el.tag in EMPTY_TAGS],
)
empty_elems_len = len([el for el in tag_elem if el.tag in EMPTY_TAGS])
if len(tag_elem) > max_predecessor_len + empty_elems_len:
return None, None
if next_text:
@ -683,7 +673,7 @@ def _process_list_item(
def _get_bullet_descendants(
element: Optional[etree._Element], next_element: Optional[etree._Element]
) -> Tuple[etree._Element, ...]:
) -> tuple[etree._Element, ...]:
"""Helper for list-item processing.
Gathers the descendants of `next_element` so they can be marked visited.
@ -698,13 +688,13 @@ def is_list_item_tag(tag_elem: etree._Element) -> bool:
)
def _bulleted_text_from_table(table: etree._Element) -> List[Element]:
def _bulleted_text_from_table(table: etree._Element) -> list[Element]:
"""Extracts bulletized narrative text from the `<table>` element in `table`.
NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted. I.e., _read()
will drop non-bullet narrative text in the table.
"""
bulleted_text: List[Element] = []
bulleted_text: list[Element] = []
rows = table.findall(".//tr")
for row in rows:
text = _construct_text(row)
@ -713,7 +703,7 @@ def _bulleted_text_from_table(table: etree._Element) -> List[Element]:
return bulleted_text
def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: List[etree._Element]) -> bool:
def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: list[etree._Element]) -> bool:
"""True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.
A bulleted span is one beginning with a bullet. If there are two or more adjacent to each other
@ -733,7 +723,7 @@ def _find_main(root: etree._Element) -> etree._Element:
return main_tag_elem if main_tag_elem is not None else root
def _find_articles(root: etree._Element, assemble_articles: bool = True) -> List[etree._Element]:
def _find_articles(root: etree._Element, assemble_articles: bool = True) -> list[etree._Element]:
"""Parse articles from `root` of an HTML document.
Each `<article>` element in the HTML becomes its own "sub-document" (article). If no article

View File

@ -41,7 +41,7 @@ class XMLDocument(Document):
self.stylesheet = stylesheet
self.parser = parser
self.document_tree = None
self.document_tree: etree._Element = None
super().__init__()
def _parse_pages_from_element_tree(self) -> List[Page]:

View File

@ -39,6 +39,8 @@ if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.html import HTMLDocument
HIERARCHY_RULE_SET = {
"Title": [
"Text",
@ -541,7 +543,7 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
# unstructured.documents.html, which imports this module so we can't import the class for type
# hints. Moreover, those two types of documents have different lists of attributes
def document_to_element_list(
document: "DocumentLayout",
document: "DocumentLayout | HTMLDocument",
sortable: bool = False,
include_page_breaks: bool = False,
last_modification_date: Optional[str] = None,

View File

@ -1,12 +1,12 @@
from __future__ import annotations
from typing import IO, TYPE_CHECKING, Any, Optional
from typing import IO, Any, Optional, cast
import requests
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.html import HTMLDocument
from unstructured.documents.html import HTMLDocument, TagsMixin
from unstructured.documents.xml import VALID_PARSERS
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.file_conversion import convert_file_to_html_text
@ -22,9 +22,6 @@ from unstructured.partition.common import (
)
from unstructured.partition.lang import apply_lang_metadata
if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout
@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
@ -130,7 +127,8 @@ def partition_html(
assemble_articles=html_assemble_articles,
)
elif url is not None:
else:
assert url is not None
response = requests.get(url, headers=headers, verify=ssl_verify)
if not response.ok:
raise ValueError(f"Error status code on GET of provided URL: {response.status_code}")
@ -142,7 +140,7 @@ def partition_html(
document = HTMLDocument.from_string(response.text, parser=parser)
if skip_headers_and_footers:
document = filter_footer_and_header(document)
document = _filter_footer_and_header(document)
elements = list(
apply_lang_metadata(
@ -236,12 +234,12 @@ def convert_and_partition_html(
)
def filter_footer_and_header(document: "DocumentLayout") -> "DocumentLayout":
def _filter_footer_and_header(document: HTMLDocument) -> HTMLDocument:
for page in document.pages:
page.elements = list(
filter(
lambda el: "footer" not in el.ancestortags and "header" not in el.ancestortags,
page.elements,
),
)
page.elements = [
e
for e in page.elements
if "header" not in cast(TagsMixin, e).ancestortags
and "footer" not in cast(TagsMixin, e).ancestortags
]
return document