rfctr(html): improve SNR in HTMLDocument (#3162)

**Summary** Remove dead code and organize helpers of HTMLDocument in preparation for improvements and bug-fixes to follow
2025-12-04 11:10:22 +00:00 · 2024-06-06 14:21:33 -07:00 · 2024-06-06 14:21:33 -07:00 · a883fc9df2
commit a883fc9df2
parent 8378ddaa3b
7 changed files with 399 additions and 542 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.14.5-dev5
+## 0.14.5-dev6

 ### Enhancements

--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@ -1,4 +1,5 @@
 # pyright: reportPrivateUsage=false
+# pyright: reportUnusedFunction=false

 """Test suite for `unstructured.documents.html` module."""

@ -27,15 +28,14 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.documents.html import (
+from unstructured.documents.html import HTMLDocument, _parse_HTMLTable_from_table_elem
+from unstructured.documents.html_elements import (
    HTMLAddress,
-    HTMLDocument,
    HTMLNarrativeText,
    HTMLTable,
    HTMLText,
    HTMLTitle,
    TagsMixin,
-    _parse_HTMLTable_from_table_elem,
 )

 TAGS = (
@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path):
        f.write(
            "<html>\n"
            "  <body>\n"
-            "    <header>\n"
-            "      <p>Here is a header. We want to ignore anything that is in this section.</p>\n"
-            "    </header>\n"
            "    <h1>A Great and Glorious Section</h1>\n"
            "    <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n"
            "    <p></p>\n"
@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path):
            "    <table>\n"
            "      <tbody>\n"
            "        <tr>\n"
-            "          <td><p>Skip me because I'm in a table</p></td>\n"
+            "          <td><p>I'm in a table</p></td>\n"
            "        </tr>\n"
            "      </tbody>\n"
            "    </table>\n"
            "    <hr>\n"
            "    <h2>A New Beginning</h2>\n"
            "    <div>Here is the start of a new page.</div>\n"
-            "    <footer>\n"
-            "      <p>Here is a footer. We want to ignore anything that is in this section</p>\n"
-            "    </footer>\n"
-            "    <div>\n"
-            "      <p>Let's ignore anything after the footer too since it's probably garbage.</p>\n"
-            "    </div>\n"
            "  </body>\n"
            "</html>\n"
        )

-    html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
-        skip_headers_and_footers=True, skip_table=True
-    )
+    html_document = HTMLDocument.from_file(filename)

    assert len(html_document.pages) == 2
    assert all(isinstance(p, Page) for p in html_document.pages)
    # --
-    page_one = html_document.pages[0]
-    assert len(page_one.elements) == 4
-    assert page_one.elements == [
+    p = html_document.pages[0]
+    assert len(p.elements) == 5
+    assert p.elements == [
        Title("A Great and Glorious Section"),
        NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
        Title("Another Magnificent Title"),
        NarrativeText("The prior element is a title based on its capitalization patterns!"),
+        Table("I'm in a table"),
    ]
    # --
-    page_two = html_document.pages[1]
-    assert len(page_two.elements) == 2
-    assert page_two.elements == [
+    p = html_document.pages[1]
+    assert len(p.elements) == 2
+    assert p.elements == [
        Title("A New Beginning"),
        NarrativeText("Here is the start of a new page."),
    ]
@ -348,64 +338,6 @@ def test_HTMLDocument_can_construct_from_existing_pages():
    assert html_document.pages == [page]


-# -- HTMLDocument.doc_after_cleaners() -----------------------------------------------------------
-
-
-def test_include_headers_and_footers(sample_doc: HTMLDocument):
-    html_document = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
-    assert len(html_document.pages[1].elements) == 3
-
-
-def test_read_without_skipping_table(is_possible_narrative_text_: Mock):
-    is_possible_narrative_text_.return_value = True
-    document = HTMLDocument.from_string(
-        "<html>\n"
-        "  <body>\n"
-        "    <table>\n"
-        "      <tbody>\n"
-        "        <tr>\n"
-        "          <td><p>Hi there! I am Matt!</p></td>\n"
-        "        </tr>\n"
-        "      </tbody>\n"
-        "    </table>\n"
-        "  </body>\n"
-        "</html>\n"
-    ).doc_after_cleaners(skip_table=False)
-    assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!")
-
-
-def test_include_table_text(sample_doc: HTMLDocument):
-    html_document = sample_doc.doc_after_cleaners(skip_table=False)
-    assert len(html_document.pages[0].elements) == 2
-
-
-def test_tag_types_table(sample_doc: HTMLDocument):
-    html_document = sample_doc.doc_after_cleaners(skip_table=True)
-    assert len(html_document.pages[0].elements) == 2
-
-
-def test_cleaner_raises_on_non_element_elements(sample_doc: HTMLDocument, pages_prop_: Mock):
-    page = Page(0)
-    page.elements = [
-        "this should def not be a string"  # pyright: ignore[reportAttributeAccessIssue]
-    ]
-    pages_prop_.return_value = [page]
-    with pytest.raises(ValueError):
-        sample_doc.doc_after_cleaners()
-
-
-def test_cleaner_can_filter_out_tables_in_place():
-    doc = HTMLDocument.from_string(
-        "<table><tbody><tr><td>A table thing.</td></tr></tbody></table>\n"
-        "<p>A non-table thing</p>\n"
-    )
-    assert len(doc.elements) == 2
-
-    doc.doc_after_cleaners(skip_table=True, inplace=True)
-
-    assert len(doc.elements) == 1
-
-
 # -- HTMLDocument.elements -----------------------------------------------------------------------


@ -429,19 +361,16 @@ def test_parses_tags_correctly():


 def test_nested_text_tags():
-    tag1, tag2 = [tag for tag in html.TEXT_TAGS if tag not in html.TABLE_TAGS][:2]
-    html_str = (
-        f"<body>\n"
-        f"    <{tag1}>\n"
-        f"        <{tag2}>\n"
-        f"            There is some text here.\n"
-        f"        </{tag2}>\n"
-        f"    </{tag1}>\n"
-        f"</body>\n"
+    html_document = HTMLDocument.from_string(
+        "<body>\n"
+        "  <p>\n"
+        "    <a>\n"
+        "      There is some text here.\n"
+        "    </a>\n"
+        "  </p>\n"
+        "</body>\n"
    )

-    html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)
-
    assert len(html_document.pages[0].elements) == 1


@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str):
    assert len(html_document.pages) == 0


-# -- has_table_ancestor() ------------------------------------------------------------------------
-
-
-def test_has_table_ancestor():
-    title = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"])
-    assert html.has_table_ancestor(title)
-
-
-def test_has_no_table_ancestor():
-    title = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"])
-    assert not html.has_table_ancestor(title)
-
-
 # -- _bulleted_text_from_table() -----------------------------------------------------------------


@ -856,8 +772,8 @@ def test_parse_nothing():
    assert parsed_el is None


-def test_parse_not_anything(is_narrative_tag_: Mock, is_possible_title_: Mock):
-    is_narrative_tag_.return_value = False
+def test_parse_not_anything(_is_narrative_tag_: Mock, is_possible_title_: Mock):  # noqa: PT019
+    _is_narrative_tag_.return_value = False
    is_possible_title_.return_value = False
    doc = """<p>This is nothing</p>"""
    document_tree = etree.fromstring(doc, etree.HTMLParser())
@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text():
    document_tree = etree.fromstring(doc, etree.HTMLParser())
    el = document_tree.find(".//div")
    assert el is not None
-    assert html.is_list_item_tag(el) is True
+    assert html._is_list_item_tag(el) is True
    parsed_el, _ = html._process_list_item(el)
    assert parsed_el is None

@ -1071,8 +987,8 @@ class Describe_parse_HTMLTable_from_table_elem:


@pytest.fixture
-def is_narrative_tag_(request: FixtureRequest):
-    return function_mock(request, "unstructured.documents.html.is_narrative_tag")
+def _is_narrative_tag_(request: FixtureRequest):
+    return function_mock(request, "unstructured.documents.html._is_narrative_tag")


@pytest.fixture
--- a/test_unstructured/partition/test_html.py
+++ b/test_unstructured/partition/test_html.py
@ -28,7 +28,7 @@ from unstructured.documents.elements import (
    TableChunk,
    Title,
 )
-from unstructured.documents.html import HTMLTable, TagsMixin
+from unstructured.documents.html_elements import HTMLTable, TagsMixin
 from unstructured.partition.html import partition_html

 # -- document-source (filename, file, text, url) -------------------------------------------------
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.14.5-dev5"  # pragma: no cover
+__version__ = "0.14.5-dev6"  # pragma: no cover
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@ -2,26 +2,21 @@

 from __future__ import annotations

-from typing import Any, Callable, Final, Iterator, Optional, Sequence, cast
+from typing import Final, Iterator, Optional, cast

 from lxml import etree

-from unstructured.cleaners.core import (
-    clean_bullets,
-    replace_unicode_quotes,
-)
+from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
 from unstructured.documents.base import Page
-from unstructured.documents.elements import (
-    Address,
-    Element,
-    ElementMetadata,
-    EmailAddress,
-    Link,
-    ListItem,
-    NarrativeText,
-    Table,
-    Text,
-    Title,
+from unstructured.documents.elements import Element, ElementMetadata, Link
+from unstructured.documents.html_elements import (
+    HTMLAddress,
+    HTMLEmailAddress,
+    HTMLListItem,
+    HTMLNarrativeText,
+    HTMLTable,
+    HTMLText,
+    HTMLTitle,
 )
 from unstructured.documents.xml import VALID_PARSERS, XMLDocument
 from unstructured.logger import logger
@ -47,82 +42,6 @@ HEADER_OR_FOOTER_TAGS: Final[list[str]] = ["header", "footer"]
 SECTION_TAGS: Final[list[str]] = ["div", "pre"]


-# -- HTML-specific document-elements and methods -------------------------------------------------
-
-
-class TagsMixin:
-    """Mixin that allows a class to retain tag information."""
-
-    def __init__(
-        self,
-        *args: Any,
-        tag: Optional[str] = None,
-        ancestortags: Sequence[str] = (),
-        links: Sequence[Link] = [],
-        emphasized_texts: Sequence[dict[str, str]] = [],
-        text_as_html: Optional[str] = None,
-        **kwargs: Any,
-    ):
-        if tag is None:
-            raise TypeError("tag argument must be passed and not None")
-        else:
-            self.tag = tag
-        self.ancestortags = ancestortags
-        self.links = links
-        self.emphasized_texts = emphasized_texts
-        self.text_as_html = text_as_html
-        super().__init__(*args, **kwargs)
-
-
-class HTMLText(TagsMixin, Text):
-    """Text with tag information."""
-
-
-class HTMLAddress(TagsMixin, Address):
-    """Address with tag information."""
-
-
-class HTMLEmailAddress(TagsMixin, EmailAddress):
-    """EmailAddress with tag information"""
-
-
-class HTMLTitle(TagsMixin, Title):
-    """Title with tag information."""
-
-
-class HTMLNarrativeText(TagsMixin, NarrativeText):
-    """NarrativeText with tag information."""
-
-
-class HTMLListItem(TagsMixin, ListItem):
-    """NarrativeText with tag information."""
-
-
-class HTMLTable(TagsMixin, Table):
-    """NarrativeText with tag information"""
-
-
-def has_table_ancestor(element: TagsMixin) -> bool:
-    """Checks to see if an element has ancestors that are table elements. If so, we consider
-    it to be a table element rather than a section of narrative text."""
-    return any(ancestor in TABLE_TAGS for ancestor in element.ancestortags)
-
-
-def in_header_or_footer(element: TagsMixin) -> bool:
-    """Checks to see if an element is contained within a header or a footer tag."""
-    if any(ancestor in HEADER_OR_FOOTER_TAGS for ancestor in element.ancestortags):
-        return True
-    return False
-
-
-def is_table(element: TagsMixin) -> bool:
-    """Checks to see if an element is a table"""
-    return element.tag in TABLE_TAGS
-
-
-# -- HTML element-tree processing ----------------------------------------------------------------
-
-
 class HTMLDocument(XMLDocument):
    """Class for handling HTML documents. Uses rules based parsing to identify sections
    of interest within the document."""
@ -200,7 +119,7 @@ class HTMLDocument(XMLDocument):
                    page.elements.extend(bulleted_text)
                    descendanttag_elems = tuple(tag_elem.iterdescendants())

-                elif is_list_item_tag(tag_elem):
+                elif _is_list_item_tag(tag_elem):
                    element, next_element = _process_list_item(tag_elem)
                    if element is not None:
                        page.elements.append(element)
@ -228,67 +147,170 @@ class HTMLDocument(XMLDocument):

        return pages

-    def doc_after_cleaners(
-        self,
-        skip_headers_and_footers: bool = False,
-        skip_table: bool = False,
-        inplace: bool = False,
-    ) -> HTMLDocument:
-        """Filters elements returning new instance based on the criteria specified.

-        Note that the number of pages can change in the case that all elements on a page are
-        filtered out.
+# -- candidate HTMLDocument methods --------------------------------------------------------------

-        Parameters
-        ----------
-        skip_table:
-            If True, skips table element
-        skip_headers_and_footers:
-            If True, ignores any content that is within <header> or <footer> tags
-        inplace:
-            If True, document is modified in place and returned.
-        """
-        excluders: list[Callable[[TagsMixin], bool]] = []
-        if skip_headers_and_footers:
-            excluders.append(in_header_or_footer)
-        if skip_table:
-            excluders.append(is_table)

-        pages: list[Page] = []
-        page_number = 0
-        new_page = Page(number=page_number)
-        for page in self.pages:
-            elements: list[Element] = []
-            for el in page.elements:
-                if not isinstance(el, TagsMixin):
-                    raise ValueError(
-                        f"elements of class {self.__class__} should be of type HTMLTitle "
-                        f"HTMLNarrativeText, or HTMLListItem but "
-                        f"object has an element of type {type(el)}",
-                    )
-                if not any(excluder(el) for excluder in excluders):
-                    elements.append(el)
-                if skip_headers_and_footers and "footer" in tuple(el.ancestortags) + (el.tag,):
-                    break
-            if elements:
-                new_page.elements = elements
-                pages.append(new_page)
-                page_number += 1
-                new_page = Page(number=page_number)
-        if inplace:
-            self._pages = pages
-            self._elements = None
-            return self
-        else:
-            out = self.__class__.from_pages(pages)
-            if not isinstance(out, HTMLDocument):
-                # NOTE(robinson) - Skipping for test coverage because this condition is impossible.
-                # Added type check because from_pages is a method on Document. Without the type
-                # check, mypy complains about returning Document instead of HTMLDocument
-                raise ValueError(
-                    f"Unexpected class: {self.__class__.__name__}",
-                )  # pragma: no cover
-            return out
+def _find_articles(root: etree._Element, assemble_articles: bool = True) -> list[etree._Element]:
+    """Parse articles from `root` of an HTML document.
+
+    Each `<article>` element in the HTML becomes its own "sub-document" (article). If no article
+    elements are present, the entire document (`root`) is returned as the single document article.
+    """
+    if assemble_articles is False:
+        return [root]
+
+    articles = root.findall(".//article")
+    if len(articles) == 0:
+        # NOTE(robinson) - ref: https://schema.org/Article
+        articles = root.findall(".//div[@itemprop='articleBody']")
+    return [root] if len(articles) == 0 else articles
+
+
+def _find_main(root: etree._Element) -> etree._Element:
+    """The first <main> tag under `root` if it exists, othewise `root`."""
+    main_tag_elem = root.find(".//main")
+    return main_tag_elem if main_tag_elem is not None else root
+
+
+# -- tag classifiers -----------------------------------------------------------------------------
+
+
+def _is_bulleted_table(table_elem: etree._Element) -> bool:
+    """True when all text in `table_elem` is bulleted text.
+
+    A table-row containing no text is not considered, but at least one bulleted-text item must be
+    present. A table with no text in any row is not a bulleted table.
+    """
+    if table_elem.tag != "table":
+        return False
+
+    trs = table_elem.findall(".//tr")
+    tr_texts = [_construct_text(tr) for tr in trs]
+
+    # -- a table with no text is not a bulleted table --
+    if all(not text for text in tr_texts):
+        return False
+
+    # -- all non-empty rows must contain bulleted text --
+    if any(text and not is_bulleted_text(text) for text in tr_texts):
+        return False
+
+    return True
+
+
+def _is_container_with_text(tag_elem: etree._Element) -> bool:
+    """Checks if a tag is a container that also happens to contain text.
+
+    Example
+    -------
+    <div>Hi there,
+        <div>This is my message.</div>
+        <div>Please read my message!</div>
+    </div>
+    """
+    if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
+        return False
+
+    tag_elem_text = tag_elem.text.strip() if tag_elem.text else None
+    tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
+    if not tag_elem_text and not tag_elem_tail:
+        return False
+
+    return True
+
+
+def _is_list_item_tag(tag_elem: etree._Element) -> bool:
+    """True when `tag_elem` contains bulleted text."""
+    return tag_elem.tag in LIST_ITEM_TAGS or (
+        tag_elem.tag in SECTION_TAGS and is_bulleted_text(_construct_text(tag_elem))
+    )
+
+
+def _is_text_tag(
+    tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN
+) -> bool:
+    """True when `tag_element` potentially contains narrative text."""
+    # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
+    # it could be the text representation of a giant div
+    # Exclude empty tags from tag_elem
+    empty_elems_len = len([el for el in tag_elem if el.tag in EMPTY_TAGS])
+    if len(tag_elem) > max_predecessor_len + empty_elems_len:
+        return False
+
+    if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
+        return True
+
+    # NOTE(robinson) - This indicates that a div tag has no children. If that's the
+    # case and the tag has text, its potential a text tag
+    children = list(tag_elem)
+    if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
+        return True
+
+    if _has_adjacent_bulleted_spans(tag_elem, children):
+        return True
+
+    return False
+
+
+# -- tag processors ------------------------------------------------------------------------------
+
+
+def _bulleted_text_from_table(table: etree._Element) -> list[Element]:
+    """Extracts bulletized narrative text from the `<table>` element in `table`.
+
+    NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted. I.e., _read()
+    will drop non-bullet narrative text in the table.
+    """
+    bulleted_text: list[Element] = []
+    rows = table.findall(".//tr")
+    for row in rows:
+        text = _construct_text(row)
+        if is_bulleted_text(text):
+            bulleted_text.append(HTMLListItem(text=clean_bullets(text), tag=row.tag))
+    return bulleted_text
+
+
+def _construct_text(tag_elem: etree._Element, include_tail_text: bool = True) -> str:
+    """Extract "clean"" text from `tag_elem`."""
+    text = "".join(str(t) for t in tag_elem.itertext() if t)
+
+    if include_tail_text and tag_elem.tail:
+        text = text + tag_elem.tail
+
+    text = replace_unicode_quotes(text)
+    return text.strip()
+
+
+def _get_bullet_descendants(
+    element: Optional[etree._Element], next_element: Optional[etree._Element]
+) -> tuple[etree._Element, ...]:
+    """Helper for list-item processing.
+
+    Gathers the descendants of `next_element` so they can be marked visited.
+    """
+    return () if element is None or next_element is None else tuple(next_element.iterdescendants())
+
+
+def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> list[dict[str, str]]:
+    """Emphasized text within and below `tag_element`.
+
+    Emphasis is indicated by `<strong>`, `<em>`, `<span>`, `<b>`, `<i>` tags.
+    """
+    emphasized_texts: list[dict[str, str]] = []
+    tags_to_track = ["strong", "em", "span", "b", "i"]
+
+    if tag_elem.tag in tags_to_track:
+        text = _construct_text(tag_elem, False)
+        if text:
+            emphasized_texts.append({"text": text, "tag": tag_elem.tag})
+
+    for descendant_tag_elem in tag_elem.iterdescendants(*tags_to_track):
+        text = _construct_text(descendant_tag_elem, False)
+        if text:
+            emphasized_texts.append({"text": text, "tag": descendant_tag_elem.tag})
+
+    return emphasized_texts


 def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]:
@ -313,27 +335,23 @@ def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]:
    return links


-def _is_bulleted_table(table_elem: etree._Element) -> bool:
-    """True when all text in `table_elem` is bulleted text.
+def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: list[etree._Element]) -> bool:
+    """True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.

-    A table-row containing no text is not considered, but at least one bulleted-text item must be
-    present. A table with no text in any row is not a bulleted table.
+    A bulleted span is one beginning with a bullet. If there are two or more adjacent to each other
+    they are treated as a single bulleted text element.
    """
-    if table_elem.tag != "table":
-        return False
+    if tag_elem.tag in SECTION_TAGS:
+        all_spans = all(child.tag == "span" for child in children)
+        _is_bulleted = children[0].text is not None and is_bulleted_text(children[0].text)
+        if all_spans and _is_bulleted:
+            return True
+    return False

-    trs = table_elem.findall(".//tr")
-    tr_texts = [_construct_text(tr) for tr in trs]

-    # -- a table with no text is not a bulleted table --
-    if all(not text for text in tr_texts):
-        return False
-
-    # -- all non-empty rows must contain bulleted text --
-    if any(text and not is_bulleted_text(text) for text in tr_texts):
-        return False
-
-    return True
+def _has_break_tags(tag_elem: etree._Element) -> bool:
+    """True when `tab_elem` contains a `<br>` descendant."""
+    return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants())


 def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Element]:
@ -378,27 +396,6 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
    )


-def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> list[dict[str, str]]:
-    """Emphasized text within and below `tag_element`.
-
-    Emphasis is indicated by `<strong>`, `<em>`, `<span>`, `<b>`, `<i>` tags.
-    """
-    emphasized_texts: list[dict[str, str]] = []
-    tags_to_track = ["strong", "em", "span", "b", "i"]
-
-    if tag_elem.tag in tags_to_track:
-        text = _construct_text(tag_elem, False)
-        if text:
-            emphasized_texts.append({"text": text, "tag": tag_elem.tag})
-
-    for descendant_tag_elem in tag_elem.iterdescendants(*tags_to_track):
-        text = _construct_text(descendant_tag_elem, False)
-        if text:
-            emphasized_texts.append({"text": text, "tag": descendant_tag_elem.tag})
-
-    return emphasized_texts
-
-
 def _parse_tag(
    tag_elem: etree._Element,
    include_tail_text: bool = True,
@ -440,193 +437,6 @@ def _parse_tag(
    )


-def _text_to_element(
-    text: str,
-    tag: str,
-    ancestortags: tuple[str, ...],
-    depth: int,
-    links: list[Link] = [],
-    emphasized_texts: list[dict[str, str]] = [],
-) -> Optional[Element]:
-    """Produce a document-element of the appropriate sub-type for `text`."""
-    if is_bulleted_text(text):
-        if not clean_bullets(text):
-            return None
-        return HTMLListItem(
-            text=clean_bullets(text),
-            tag=tag,
-            ancestortags=ancestortags,
-            links=links,
-            emphasized_texts=emphasized_texts,
-            metadata=ElementMetadata(category_depth=depth),
-        )
-    elif is_us_city_state_zip(text):
-        return HTMLAddress(
-            text=text,
-            tag=tag,
-            ancestortags=ancestortags,
-            links=links,
-            emphasized_texts=emphasized_texts,
-        )
-    elif is_email_address(text):
-        return HTMLEmailAddress(
-            text=text,
-            tag=tag,
-            links=links,
-            emphasized_texts=emphasized_texts,
-        )
-
-    if len(text) < 2:
-        return None
-    elif is_narrative_tag(text, tag):
-        return HTMLNarrativeText(
-            text,
-            tag=tag,
-            ancestortags=ancestortags,
-            links=links,
-            emphasized_texts=emphasized_texts,
-        )
-    elif is_heading_tag(tag) or is_possible_title(text):
-        return HTMLTitle(
-            text,
-            tag=tag,
-            ancestortags=ancestortags,
-            links=links,
-            emphasized_texts=emphasized_texts,
-            metadata=ElementMetadata(category_depth=depth),
-        )
-    else:
-        return HTMLText(
-            text,
-            tag=tag,
-            ancestortags=ancestortags,
-            links=links,
-            emphasized_texts=emphasized_texts,
-        )
-
-
-def _is_container_with_text(tag_elem: etree._Element) -> bool:
-    """Checks if a tag is a container that also happens to contain text.
-
-    Example
-    -------
-    <div>Hi there,
-        <div>This is my message.</div>
-        <div>Please read my message!</div>
-    </div>
-    """
-    if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0:
-        return False
-
-    tag_elem_text = tag_elem.text.strip() if tag_elem.text else None
-    tag_elem_tail = tag_elem.tail.strip() if tag_elem.tail else None
-    if not tag_elem_text and not tag_elem_tail:
-        return False
-
-    return True
-
-
-def is_narrative_tag(text: str, tag: str) -> bool:
-    """Uses tag information to infer whether text is narrative."""
-    return tag not in HEADING_TAGS and is_possible_narrative_text(text)
-
-
-def is_heading_tag(tag: str) -> bool:
-    """Uses tag information to infer whether text is a heading."""
-    return tag in HEADING_TAGS
-
-
-def _construct_text(tag_elem: etree._Element, include_tail_text: bool = True) -> str:
-    """Extract "clean"" text from `tag_elem`."""
-    text = "".join(str(t) for t in tag_elem.itertext() if t)
-
-    if include_tail_text and tag_elem.tail:
-        text = text + tag_elem.tail
-
-    text = replace_unicode_quotes(text)
-    return text.strip()
-
-
-def _has_break_tags(tag_elem: etree._Element) -> bool:
-    """True when `tab_elem` contains a `<br>` descendant."""
-    return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants())
-
-
-def _unfurl_break_tags(tag_elem: etree._Element) -> list[etree._Element]:
-    """Sequence of `tag_elem` and its children with `<br>` elements removed.
-
-    NOTE that these are "loose" `etree._Element` instances that are NOT linked to the original HTML
-    element-tree, so methods like `.getchildren()`, `.find()` etc. will happily produce empty
-    results.
-    """
-    unfurled: list[etree._Element] = []
-
-    if tag_elem.text:
-        _tag_elem = etree.Element(tag_elem.tag)
-        _tag_elem.text = tag_elem.text
-        unfurled.append(_tag_elem)
-
-    for child in tag_elem:
-        if not _has_break_tags(child):
-            unfurled.append(child)
-        else:
-            if child.text:
-                _tag_elem = etree.Element(child.tag)
-                _tag_elem.text = child.text
-                unfurled.append(_tag_elem)
-            unfurled.extend(_unfurl_break_tags(child))
-
-    return unfurled
-
-
-def _is_text_tag(
-    tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN
-) -> bool:
-    """True when `tag_element` potentially contains narrative text."""
-    # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
-    # it could be the text representation of a giant div
-    # Exclude empty tags from tag_elem
-    empty_elems_len = len([el for el in tag_elem if el.tag in EMPTY_TAGS])
-    if len(tag_elem) > max_predecessor_len + empty_elems_len:
-        return False
-
-    if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
-        return True
-
-    # NOTE(robinson) - This indicates that a div tag has no children. If that's the
-    # case and the tag has text, its potential a text tag
-    children = list(tag_elem)
-    if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0:
-        return True
-
-    if _has_adjacent_bulleted_spans(tag_elem, children):
-        return True
-
-    return False
-
-
-def _process_text_tag(
-    tag_elem: etree._Element, include_tail_text: bool = True
-) -> tuple[list[Element], tuple[etree._Element, ...]]:
-    """Produces a document element from `tag_elem`."""
-
-    page_elements: list[Element] = []
-    if _has_break_tags(tag_elem):
-        flattened_elems = _unfurl_break_tags(tag_elem)
-        for _tag_elem in flattened_elems:
-            element = _parse_tag(_tag_elem, include_tail_text)
-            if element is not None:
-                page_elements.append(element)
-
-    else:
-        element = _parse_tag(tag_elem, include_tail_text)
-        if element is not None:
-            page_elements.append(element)
-    descendant_tag_elems = tuple(tag_elem.iterdescendants())
-
-    return page_elements, descendant_tag_elems
-
-
 def _process_list_item(
    tag_elem: etree._Element,
    max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
@ -671,69 +481,131 @@ def _process_list_item(
    return None, None


-def _get_bullet_descendants(
-    element: Optional[etree._Element], next_element: Optional[etree._Element]
-) -> tuple[etree._Element, ...]:
-    """Helper for list-item processing.
+def _process_text_tag(
+    tag_elem: etree._Element, include_tail_text: bool = True
+) -> tuple[list[Element], tuple[etree._Element, ...]]:
+    """Produces a document element from `tag_elem`."""

-    Gathers the descendants of `next_element` so they can be marked visited.
+    page_elements: list[Element] = []
+    if _has_break_tags(tag_elem):
+        flattened_elems = _unfurl_break_tags(tag_elem)
+        for _tag_elem in flattened_elems:
+            element = _parse_tag(_tag_elem, include_tail_text)
+            if element is not None:
+                page_elements.append(element)
+
+    else:
+        element = _parse_tag(tag_elem, include_tail_text)
+        if element is not None:
+            page_elements.append(element)
+    descendant_tag_elems = tuple(tag_elem.iterdescendants())
+
+    return page_elements, descendant_tag_elems
+
+
+def _unfurl_break_tags(tag_elem: etree._Element) -> list[etree._Element]:
+    """Sequence of `tag_elem` and its children with `<br>` elements removed.
+
+    NOTE that these are "loose" `etree._Element` instances that are NOT linked to the original HTML
+    element-tree, so methods like `.getchildren()`, `.find()` etc. will happily produce empty
+    results.
    """
-    return () if element is None or next_element is None else tuple(next_element.iterdescendants())
+    unfurled: list[etree._Element] = []
+
+    if tag_elem.text:
+        _tag_elem = etree.Element(tag_elem.tag)
+        _tag_elem.text = tag_elem.text
+        unfurled.append(_tag_elem)
+
+    for child in tag_elem:
+        if not _has_break_tags(child):
+            unfurled.append(child)
+        else:
+            if child.text:
+                _tag_elem = etree.Element(child.tag)
+                _tag_elem.text = child.text
+                unfurled.append(_tag_elem)
+            unfurled.extend(_unfurl_break_tags(child))
+
+    return unfurled


-def is_list_item_tag(tag_elem: etree._Element) -> bool:
-    """True when `tag_elem` contains bulleted text."""
-    return tag_elem.tag in LIST_ITEM_TAGS or (
-        tag_elem.tag in SECTION_TAGS and is_bulleted_text(_construct_text(tag_elem))
-    )
+# -- text-element classifier ---------------------------------------------------------------------


-def _bulleted_text_from_table(table: etree._Element) -> list[Element]:
-    """Extracts bulletized narrative text from the `<table>` element in `table`.
+def _text_to_element(
+    text: str,
+    tag: str,
+    ancestortags: tuple[str, ...],
+    depth: int,
+    links: list[Link] = [],
+    emphasized_texts: list[dict[str, str]] = [],
+) -> Optional[Element]:
+    """Produce a document-element of the appropriate sub-type for `text`."""
+    if is_bulleted_text(text):
+        if not clean_bullets(text):
+            return None
+        return HTMLListItem(
+            text=clean_bullets(text),
+            tag=tag,
+            ancestortags=ancestortags,
+            links=links,
+            emphasized_texts=emphasized_texts,
+            metadata=ElementMetadata(category_depth=depth),
+        )
+    elif is_us_city_state_zip(text):
+        return HTMLAddress(
+            text=text,
+            tag=tag,
+            ancestortags=ancestortags,
+            links=links,
+            emphasized_texts=emphasized_texts,
+        )
+    elif is_email_address(text):
+        return HTMLEmailAddress(
+            text=text,
+            tag=tag,
+            links=links,
+            emphasized_texts=emphasized_texts,
+        )

-    NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted. I.e., _read()
-    will drop non-bullet narrative text in the table.
-    """
-    bulleted_text: list[Element] = []
-    rows = table.findall(".//tr")
-    for row in rows:
-        text = _construct_text(row)
-        if is_bulleted_text(text):
-            bulleted_text.append(HTMLListItem(text=clean_bullets(text), tag=row.tag))
-    return bulleted_text
+    if len(text) < 2:
+        return None
+    elif _is_narrative_tag(text, tag):
+        return HTMLNarrativeText(
+            text,
+            tag=tag,
+            ancestortags=ancestortags,
+            links=links,
+            emphasized_texts=emphasized_texts,
+        )
+    elif _is_heading_tag(tag) or is_possible_title(text):
+        return HTMLTitle(
+            text,
+            tag=tag,
+            ancestortags=ancestortags,
+            links=links,
+            emphasized_texts=emphasized_texts,
+            metadata=ElementMetadata(category_depth=depth),
+        )
+    else:
+        return HTMLText(
+            text,
+            tag=tag,
+            ancestortags=ancestortags,
+            links=links,
+            emphasized_texts=emphasized_texts,
+        )


-def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: list[etree._Element]) -> bool:
-    """True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.
-
-    A bulleted span is one beginning with a bullet. If there are two or more adjacent to each other
-    they are treated as a single bulleted text element.
-    """
-    if tag_elem.tag in SECTION_TAGS:
-        all_spans = all(child.tag == "span" for child in children)
-        _is_bulleted = children[0].text is not None and is_bulleted_text(children[0].text)
-        if all_spans and _is_bulleted:
-            return True
-    return False
+# -- HTML-specific text classifiers --------------------------------------------------------------


-def _find_main(root: etree._Element) -> etree._Element:
-    """The first <main> tag under `root` if it exists, othewise `root`."""
-    main_tag_elem = root.find(".//main")
-    return main_tag_elem if main_tag_elem is not None else root
+def _is_heading_tag(tag: str) -> bool:
+    """Uses tag information to infer whether text is a heading."""
+    return tag in HEADING_TAGS


-def _find_articles(root: etree._Element, assemble_articles: bool = True) -> list[etree._Element]:
-    """Parse articles from `root` of an HTML document.
-
-    Each `<article>` element in the HTML becomes its own "sub-document" (article). If no article
-    elements are present, the entire document (`root`) is returned as the single document article.
-    """
-    if assemble_articles is False:
-        return [root]
-
-    articles = root.findall(".//article")
-    if len(articles) == 0:
-        # NOTE(robinson) - ref: https://schema.org/Article
-        articles = root.findall(".//div[@itemprop='articleBody']")
-    return [root] if len(articles) == 0 else articles
+def _is_narrative_tag(text: str, tag: str) -> bool:
+    """Uses tag information to infer whether text is narrative."""
+    return tag not in HEADING_TAGS and is_possible_narrative_text(text)
--- a/unstructured/documents/html_elements.py
+++ b/unstructured/documents/html_elements.py
@ -0,0 +1,68 @@
+"""Document elements specific to the HTML partitioner."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, Sequence
+
+from unstructured.documents.elements import (
+    Address,
+    EmailAddress,
+    Link,
+    ListItem,
+    NarrativeText,
+    Table,
+    Text,
+    Title,
+)
+
+
+class TagsMixin:
+    """Mixin that allows a class to retain tag information."""
+
+    def __init__(
+        self,
+        *args: Any,
+        tag: Optional[str] = None,
+        ancestortags: Sequence[str] = (),
+        links: Sequence[Link] = [],
+        emphasized_texts: Sequence[Dict[str, str]] = [],
+        text_as_html: Optional[str] = None,
+        **kwargs: Any,
+    ):
+        if tag is None:
+            raise TypeError("tag argument must be passed and not None")
+        else:
+            self.tag = tag
+        self.ancestortags = ancestortags
+        self.links = links
+        self.emphasized_texts = emphasized_texts
+        self.text_as_html = text_as_html
+        super().__init__(*args, **kwargs)
+
+
+class HTMLText(TagsMixin, Text):
+    """Text with tag information."""
+
+
+class HTMLAddress(TagsMixin, Address):
+    """Address with tag information."""
+
+
+class HTMLEmailAddress(TagsMixin, EmailAddress):
+    """EmailAddress with tag information"""
+
+
+class HTMLTitle(TagsMixin, Title):
+    """Title with tag information."""
+
+
+class HTMLNarrativeText(TagsMixin, NarrativeText):
+    """NarrativeText with tag information."""
+
+
+class HTMLListItem(TagsMixin, ListItem):
+    """NarrativeText with tag information."""
+
+
+class HTMLTable(TagsMixin, Table):
+    """NarrativeText with tag information"""
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@ -6,7 +6,8 @@ import requests

 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
-from unstructured.documents.html import HTMLDocument, TagsMixin
+from unstructured.documents.html import HTMLDocument
+from unstructured.documents.html_elements import TagsMixin
 from unstructured.documents.xml import VALID_PARSERS
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.file_conversion import convert_file_to_html_text