From a883fc9df2cce0ef752b060c348bd8a562f99557 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Thu, 6 Jun 2024 14:21:33 -0700 Subject: [PATCH] rfctr(html): improve SNR in HTMLDocument (#3162) **Summary** Remove dead code and organize helpers of HTMLDocument in preparation for improvements and bug-fixes to follow --- CHANGELOG.md | 2 +- test_unstructured/documents/test_html.py | 134 +---- test_unstructured/partition/test_html.py | 2 +- unstructured/__version__.py | 2 +- unstructured/documents/html.py | 730 ++++++++++------------- unstructured/documents/html_elements.py | 68 +++ unstructured/partition/html.py | 3 +- 7 files changed, 399 insertions(+), 542 deletions(-) create mode 100644 unstructured/documents/html_elements.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 18162421a..ffbd87343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.5-dev5 +## 0.14.5-dev6 ### Enhancements diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index 377672272..1baae57ad 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -1,4 +1,5 @@ # pyright: reportPrivateUsage=false +# pyright: reportUnusedFunction=false """Test suite for `unstructured.documents.html` module.""" @@ -27,15 +28,14 @@ from unstructured.documents.elements import ( Text, Title, ) -from unstructured.documents.html import ( +from unstructured.documents.html import HTMLDocument, _parse_HTMLTable_from_table_elem +from unstructured.documents.html_elements import ( HTMLAddress, - HTMLDocument, HTMLNarrativeText, HTMLTable, HTMLText, HTMLTitle, TagsMixin, - _parse_HTMLTable_from_table_elem, ) TAGS = ( @@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path): f.write( "\n" " \n" - "
\n" - "

Here is a header. We want to ignore anything that is in this section.

\n" - "
\n" "

A Great and Glorious Section

\n" "

Dear Leader is the best. He is such a wonderful engineer!

\n" "

\n" @@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path): " \n" " \n" " \n" - " \n" + " \n" " \n" " \n" "

Skip me because I'm in a table

I'm in a table

\n" "
\n" "

A New Beginning

\n" "
Here is the start of a new page.
\n" - " \n" - "
\n" - "

Let's ignore anything after the footer too since it's probably garbage.

\n" - "
\n" " \n" "\n" ) - html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners( - skip_headers_and_footers=True, skip_table=True - ) + html_document = HTMLDocument.from_file(filename) assert len(html_document.pages) == 2 assert all(isinstance(p, Page) for p in html_document.pages) # -- - page_one = html_document.pages[0] - assert len(page_one.elements) == 4 - assert page_one.elements == [ + p = html_document.pages[0] + assert len(p.elements) == 5 + assert p.elements == [ Title("A Great and Glorious Section"), NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"), Title("Another Magnificent Title"), NarrativeText("The prior element is a title based on its capitalization patterns!"), + Table("I'm in a table"), ] # -- - page_two = html_document.pages[1] - assert len(page_two.elements) == 2 - assert page_two.elements == [ + p = html_document.pages[1] + assert len(p.elements) == 2 + assert p.elements == [ Title("A New Beginning"), NarrativeText("Here is the start of a new page."), ] @@ -348,64 +338,6 @@ def test_HTMLDocument_can_construct_from_existing_pages(): assert html_document.pages == [page] -# -- HTMLDocument.doc_after_cleaners() ----------------------------------------------------------- - - -def test_include_headers_and_footers(sample_doc: HTMLDocument): - html_document = sample_doc.doc_after_cleaners(skip_headers_and_footers=False) - assert len(html_document.pages[1].elements) == 3 - - -def test_read_without_skipping_table(is_possible_narrative_text_: Mock): - is_possible_narrative_text_.return_value = True - document = HTMLDocument.from_string( - "\n" - " \n" - " \n" - " \n" - " \n" - " \n" - " \n" - " \n" - "

Hi there! I am Matt!

\n" - " \n" - "\n" - ).doc_after_cleaners(skip_table=False) - assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!") - - -def test_include_table_text(sample_doc: HTMLDocument): - html_document = sample_doc.doc_after_cleaners(skip_table=False) - assert len(html_document.pages[0].elements) == 2 - - -def test_tag_types_table(sample_doc: HTMLDocument): - html_document = sample_doc.doc_after_cleaners(skip_table=True) - assert len(html_document.pages[0].elements) == 2 - - -def test_cleaner_raises_on_non_element_elements(sample_doc: HTMLDocument, pages_prop_: Mock): - page = Page(0) - page.elements = [ - "this should def not be a string" # pyright: ignore[reportAttributeAccessIssue] - ] - pages_prop_.return_value = [page] - with pytest.raises(ValueError): - sample_doc.doc_after_cleaners() - - -def test_cleaner_can_filter_out_tables_in_place(): - doc = HTMLDocument.from_string( - "
A table thing.
\n" - "

A non-table thing

\n" - ) - assert len(doc.elements) == 2 - - doc.doc_after_cleaners(skip_table=True, inplace=True) - - assert len(doc.elements) == 1 - - # -- HTMLDocument.elements ----------------------------------------------------------------------- @@ -429,19 +361,16 @@ def test_parses_tags_correctly(): def test_nested_text_tags(): - tag1, tag2 = [tag for tag in html.TEXT_TAGS if tag not in html.TABLE_TAGS][:2] - html_str = ( - f"\n" - f" <{tag1}>\n" - f" <{tag2}>\n" - f" There is some text here.\n" - f" \n" - f" \n" - f"\n" + html_document = HTMLDocument.from_string( + "\n" + "

\n" + " \n" + " There is some text here.\n" + " \n" + "

\n" + "\n" ) - html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False) - assert len(html_document.pages[0].elements) == 1 @@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str): assert len(html_document.pages) == 0 -# -- has_table_ancestor() ------------------------------------------------------------------------ - - -def test_has_table_ancestor(): - title = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"]) - assert html.has_table_ancestor(title) - - -def test_has_no_table_ancestor(): - title = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"]) - assert not html.has_table_ancestor(title) - - # -- _bulleted_text_from_table() ----------------------------------------------------------------- @@ -856,8 +772,8 @@ def test_parse_nothing(): assert parsed_el is None -def test_parse_not_anything(is_narrative_tag_: Mock, is_possible_title_: Mock): - is_narrative_tag_.return_value = False +def test_parse_not_anything(_is_narrative_tag_: Mock, is_possible_title_: Mock): # noqa: PT019 + _is_narrative_tag_.return_value = False is_possible_title_.return_value = False doc = """

This is nothing

""" document_tree = etree.fromstring(doc, etree.HTMLParser()) @@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text(): document_tree = etree.fromstring(doc, etree.HTMLParser()) el = document_tree.find(".//div") assert el is not None - assert html.is_list_item_tag(el) is True + assert html._is_list_item_tag(el) is True parsed_el, _ = html._process_list_item(el) assert parsed_el is None @@ -1071,8 +987,8 @@ class Describe_parse_HTMLTable_from_table_elem: @pytest.fixture -def is_narrative_tag_(request: FixtureRequest): - return function_mock(request, "unstructured.documents.html.is_narrative_tag") +def _is_narrative_tag_(request: FixtureRequest): + return function_mock(request, "unstructured.documents.html._is_narrative_tag") @pytest.fixture diff --git a/test_unstructured/partition/test_html.py b/test_unstructured/partition/test_html.py index e21e66414..2e353d6f6 100644 --- a/test_unstructured/partition/test_html.py +++ b/test_unstructured/partition/test_html.py @@ -28,7 +28,7 @@ from unstructured.documents.elements import ( TableChunk, Title, ) -from unstructured.documents.html import HTMLTable, TagsMixin +from unstructured.documents.html_elements import HTMLTable, TagsMixin from unstructured.partition.html import partition_html # -- document-source (filename, file, text, url) ------------------------------------------------- diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 195c73cca..40be1c9bb 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.5-dev5" # pragma: no cover +__version__ = "0.14.5-dev6" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 284742981..1c868acae 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -2,26 +2,21 @@ from __future__ import annotations -from typing import Any, Callable, Final, Iterator, Optional, Sequence, cast +from typing import Final, Iterator, Optional, cast from lxml import etree -from unstructured.cleaners.core import ( - clean_bullets, - replace_unicode_quotes, -) +from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes from unstructured.documents.base import Page -from unstructured.documents.elements import ( - Address, - Element, - ElementMetadata, - EmailAddress, - Link, - ListItem, - NarrativeText, - Table, - Text, - Title, +from unstructured.documents.elements import Element, ElementMetadata, Link +from unstructured.documents.html_elements import ( + HTMLAddress, + HTMLEmailAddress, + HTMLListItem, + HTMLNarrativeText, + HTMLTable, + HTMLText, + HTMLTitle, ) from unstructured.documents.xml import VALID_PARSERS, XMLDocument from unstructured.logger import logger @@ -47,82 +42,6 @@ HEADER_OR_FOOTER_TAGS: Final[list[str]] = ["header", "footer"] SECTION_TAGS: Final[list[str]] = ["div", "pre"] -# -- HTML-specific document-elements and methods ------------------------------------------------- - - -class TagsMixin: - """Mixin that allows a class to retain tag information.""" - - def __init__( - self, - *args: Any, - tag: Optional[str] = None, - ancestortags: Sequence[str] = (), - links: Sequence[Link] = [], - emphasized_texts: Sequence[dict[str, str]] = [], - text_as_html: Optional[str] = None, - **kwargs: Any, - ): - if tag is None: - raise TypeError("tag argument must be passed and not None") - else: - self.tag = tag - self.ancestortags = ancestortags - self.links = links - self.emphasized_texts = emphasized_texts - self.text_as_html = text_as_html - super().__init__(*args, **kwargs) - - -class HTMLText(TagsMixin, Text): - """Text with tag information.""" - - -class HTMLAddress(TagsMixin, Address): - """Address with tag information.""" - - -class HTMLEmailAddress(TagsMixin, EmailAddress): - """EmailAddress with tag information""" - - -class HTMLTitle(TagsMixin, Title): - """Title with tag information.""" - - -class HTMLNarrativeText(TagsMixin, NarrativeText): - """NarrativeText with tag information.""" - - -class HTMLListItem(TagsMixin, ListItem): - """NarrativeText with tag information.""" - - -class HTMLTable(TagsMixin, Table): - """NarrativeText with tag information""" - - -def has_table_ancestor(element: TagsMixin) -> bool: - """Checks to see if an element has ancestors that are table elements. If so, we consider - it to be a table element rather than a section of narrative text.""" - return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags) - - -def in_header_or_footer(element: TagsMixin) -> bool: - """Checks to see if an element is contained within a header or a footer tag.""" - if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags): - return True - return False - - -def is_table(element: TagsMixin) -> bool: - """Checks to see if an element is a table""" - return element.tag in TABLE_TAGS - - -# -- HTML element-tree processing ---------------------------------------------------------------- - - class HTMLDocument(XMLDocument): """Class for handling HTML documents. Uses rules based parsing to identify sections of interest within the document.""" @@ -200,7 +119,7 @@ class HTMLDocument(XMLDocument): page.elements.extend(bulleted_text) descendanttag_elems = tuple(tag_elem.iterdescendants()) - elif is_list_item_tag(tag_elem): + elif _is_list_item_tag(tag_elem): element, next_element = _process_list_item(tag_elem) if element is not None: page.elements.append(element) @@ -228,67 +147,170 @@ class HTMLDocument(XMLDocument): return pages - def doc_after_cleaners( - self, - skip_headers_and_footers: bool = False, - skip_table: bool = False, - inplace: bool = False, - ) -> HTMLDocument: - """Filters elements returning new instance based on the criteria specified. - Note that the number of pages can change in the case that all elements on a page are - filtered out. +# -- candidate HTMLDocument methods -------------------------------------------------------------- - Parameters - ---------- - skip_table: - If True, skips table element - skip_headers_and_footers: - If True, ignores any content that is within
or