rfctr(html): improve SNR in HTMLDocument (#3162)

**Summary** Remove dead code and organize the helpers of `HTMLDocument` in preparation for improvements and bug-fixes to follow.
2025-10-21 21:13:47 +00:00 · 2024-06-06 14:21:33 -07:00 · 2024-06-06 14:21:33 -07:00 · a883fc9df2
commit a883fc9df2
parent 8378ddaa3b
7 changed files with 399 additions and 542 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.14.5-dev5
+## 0.14.5-dev6
 ### Enhancements
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@ -1,4 +1,5 @@
 # pyright: reportPrivateUsage=false
 # pyright: reportUnusedFunction=false
 """Test suite for `unstructured.documents.html` module."""
@ -27,15 +28,14 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.documents.html import (
+from unstructured.documents.html import HTMLDocument, _parse_HTMLTable_from_table_elem
 from unstructured.documents.html_elements import (
    HTMLAddress,
    HTMLDocument,
    HTMLNarrativeText,
    HTMLTable,
    HTMLText,
    HTMLTitle,
    TagsMixin,
    _parse_HTMLTable_from_table_elem,
 )
 TAGS = (
@ -287,9 +287,6 @@ def test_read_html_doc(tmp_path: pathlib.Path):
        f.write(
            "<html>\n"
            "  <body>\n"
            "    <header>\n"
            "      <p>Here is a header. We want to ignore anything that is in this section.</p>\n"
            "    </header>\n"
            "    <h1>A Great and Glorious Section</h1>\n"
            "    <p>Dear Leader is the best. He is such a wonderful engineer!</p>\n"
            "    <p></p>\n"
@ -298,42 +295,35 @@ def test_read_html_doc(tmp_path: pathlib.Path):
            "    <table>\n"
            "      <tbody>\n"
            "        <tr>\n"
-            "          <td><p>Skip me because I'm in a table</p></td>\n"
+            "          <td><p>I'm in a table</p></td>\n"
            "        </tr>\n"
            "      </tbody>\n"
            "    </table>\n"
            "    <hr>\n"
            "    <h2>A New Beginning</h2>\n"
            "    <div>Here is the start of a new page.</div>\n"
            "    <footer>\n"
            "      <p>Here is a footer. We want to ignore anything that is in this section</p>\n"
            "    </footer>\n"
            "    <div>\n"
            "      <p>Let's ignore anything after the footer too since it's probably garbage.</p>\n"
            "    </div>\n"
            "  </body>\n"
            "</html>\n"
        )
-    html_document = HTMLDocument.from_file(filename=filename).doc_after_cleaners(
+    html_document = HTMLDocument.from_file(filename)
        skip_headers_and_footers=True, skip_table=True
    )
    assert len(html_document.pages) == 2
    assert all(isinstance(p, Page) for p in html_document.pages)
    # --
-    page_one = html_document.pages[0]
+    p = html_document.pages[0]
-    assert len(page_one.elements) == 4
+    assert len(p.elements) == 5
-    assert page_one.elements == [
+    assert p.elements == [
        Title("A Great and Glorious Section"),
        NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
        Title("Another Magnificent Title"),
        NarrativeText("The prior element is a title based on its capitalization patterns!"),
        Table("I'm in a table"),
    ]
    # --
-    page_two = html_document.pages[1]
+    p = html_document.pages[1]
-    assert len(page_two.elements) == 2
+    assert len(p.elements) == 2
-    assert page_two.elements == [
+    assert p.elements == [
        Title("A New Beginning"),
        NarrativeText("Here is the start of a new page."),
    ]
@ -348,64 +338,6 @@ def test_HTMLDocument_can_construct_from_existing_pages():
    assert html_document.pages == [page]
 # -- HTMLDocument.doc_after_cleaners() -----------------------------------------------------------
 def test_include_headers_and_footers(sample_doc: HTMLDocument):
    html_document = sample_doc.doc_after_cleaners(skip_headers_and_footers=False)
    assert len(html_document.pages[1].elements) == 3
 def test_read_without_skipping_table(is_possible_narrative_text_: Mock):
    is_possible_narrative_text_.return_value = True
    document = HTMLDocument.from_string(
        "<html>\n"
        "  <body>\n"
        "    <table>\n"
        "      <tbody>\n"
        "        <tr>\n"
        "          <td><p>Hi there! I am Matt!</p></td>\n"
        "        </tr>\n"
        "      </tbody>\n"
        "    </table>\n"
        "  </body>\n"
        "</html>\n"
    ).doc_after_cleaners(skip_table=False)
    assert document.pages[0].elements[0] == Table(text="Hi there! I am Matt!")
 def test_include_table_text(sample_doc: HTMLDocument):
    html_document = sample_doc.doc_after_cleaners(skip_table=False)
    assert len(html_document.pages[0].elements) == 2
 def test_tag_types_table(sample_doc: HTMLDocument):
    html_document = sample_doc.doc_after_cleaners(skip_table=True)
    assert len(html_document.pages[0].elements) == 2
 def test_cleaner_raises_on_non_element_elements(sample_doc: HTMLDocument, pages_prop_: Mock):
    page = Page(0)
    page.elements = [
        "this should def not be a string"  # pyright: ignore[reportAttributeAccessIssue]
    ]
    pages_prop_.return_value = [page]
    with pytest.raises(ValueError):
        sample_doc.doc_after_cleaners()
 def test_cleaner_can_filter_out_tables_in_place():
    doc = HTMLDocument.from_string(
        "<table><tbody><tr><td>A table thing.</td></tr></tbody></table>\n"
        "<p>A non-table thing</p>\n"
    )
    assert len(doc.elements) == 2
    doc.doc_after_cleaners(skip_table=True, inplace=True)
    assert len(doc.elements) == 1
 # -- HTMLDocument.elements -----------------------------------------------------------------------
@ -429,19 +361,16 @@ def test_parses_tags_correctly():
 def test_nested_text_tags():
-    tag1, tag2 = [tag for tag in html.TEXT_TAGS if tag not in html.TABLE_TAGS][:2]
+    html_document = HTMLDocument.from_string(
-    html_str = (
+        "<body>\n"
-        f"<body>\n"
+        "  <p>\n"
-        f"    <{tag1}>\n"
+        "    <a>\n"
-        f"        <{tag2}>\n"
+        "      There is some text here.\n"
-        f"            There is some text here.\n"
+        "    </a>\n"
-        f"        </{tag2}>\n"
+        "  </p>\n"
-        f"    </{tag1}>\n"
+        "</body>\n"
        f"</body>\n"
    )
    html_document = HTMLDocument.from_string(html_str).doc_after_cleaners(skip_table=False)
    assert len(html_document.pages[0].elements) == 1
@ -575,19 +504,6 @@ def test_exclude_tag_types(tag: str):
    assert len(html_document.pages) == 0
 # -- has_table_ancestor() ------------------------------------------------------------------------
 def test_has_table_ancestor():
    title = HTMLTitle("I am a Title", tag="td", ancestortags=["html", "body", "table", "tr"])
    assert html.has_table_ancestor(title)
 def test_has_no_table_ancestor():
    title = HTMLTitle("I am a Title", tag="p", ancestortags=["html", "body"])
    assert not html.has_table_ancestor(title)
 # -- _bulleted_text_from_table() -----------------------------------------------------------------
@ -856,8 +772,8 @@ def test_parse_nothing():
    assert parsed_el is None
-def test_parse_not_anything(is_narrative_tag_: Mock, is_possible_title_: Mock):
+def test_parse_not_anything(_is_narrative_tag_: Mock, is_possible_title_: Mock):  # noqa: PT019
-    is_narrative_tag_.return_value = False
+    _is_narrative_tag_.return_value = False
    is_possible_title_.return_value = False
    doc = """<p>This is nothing</p>"""
    document_tree = etree.fromstring(doc, etree.HTMLParser())
@ -942,7 +858,7 @@ def test_process_list_item_returns_none_if_next_has_no_text():
    document_tree = etree.fromstring(doc, etree.HTMLParser())
    el = document_tree.find(".//div")
    assert el is not None
-    assert html.is_list_item_tag(el) is True
+    assert html._is_list_item_tag(el) is True
    parsed_el, _ = html._process_list_item(el)
    assert parsed_el is None
@ -1071,8 +987,8 @@ class Describe_parse_HTMLTable_from_table_elem:
@pytest.fixture
-def is_narrative_tag_(request: FixtureRequest):
+def _is_narrative_tag_(request: FixtureRequest):
-    return function_mock(request, "unstructured.documents.html.is_narrative_tag")
+    return function_mock(request, "unstructured.documents.html._is_narrative_tag")
@pytest.fixture
--- a/test_unstructured/partition/test_html.py
+++ b/test_unstructured/partition/test_html.py
@ -28,7 +28,7 @@ from unstructured.documents.elements import (
    TableChunk,
    Title,
 )
-from unstructured.documents.html import HTMLTable, TagsMixin
+from unstructured.documents.html_elements import HTMLTable, TagsMixin
 from unstructured.partition.html import partition_html
 # -- document-source (filename, file, text, url) -------------------------------------------------
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.14.5-dev5"  # pragma: no cover
+__version__ = "0.14.5-dev6"  # pragma: no cover
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@ -2,26 +2,21 @@
 from __future__ import annotations
-from typing import Any, Callable, Final, Iterator, Optional, Sequence, cast
+from typing import Final, Iterator, Optional, cast
 from lxml import etree
-from unstructured.cleaners.core import (
+from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes
    clean_bullets,
    replace_unicode_quotes,
 )
 from unstructured.documents.base import Page
-from unstructured.documents.elements import (
+from unstructured.documents.elements import Element, ElementMetadata, Link
-    Address,
+from unstructured.documents.html_elements import (
-    Element,
+    HTMLAddress,
-    ElementMetadata,
+    HTMLEmailAddress,
-    EmailAddress,
+    HTMLListItem,
-    Link,
+    HTMLNarrativeText,
-    ListItem,
+    HTMLTable,
-    NarrativeText,
+    HTMLText,
-    Table,
+    HTMLTitle,
    Text,
    Title,
 )
 from unstructured.documents.xml import VALID_PARSERS, XMLDocument
 from unstructured.logger import logger
@ -47,82 +42,6 @@ HEADER_OR_FOOTER_TAGS: Final[list[str]] = ["header", "footer"]
 SECTION_TAGS: Final[list[str]] = ["div", "pre"]
 # -- HTML-specific document-elements and methods -------------------------------------------------
 class TagsMixin:
    """Mixin that allows a class to retain HTML tag information.

    Intended to be listed ahead of the element class, as in `class HTMLText(TagsMixin, Text)`;
    positional arguments and any keyword arguments not named below are forwarded unchanged to
    the next `__init__()` in the MRO.

    Parameters
    ----------
    tag:
        Name of the HTML tag the element was produced from. Keyword-only and required.
    ancestortags:
        Tag names of the ancestors of that HTML tag.
    links:
        Links associated with the element. A new empty list is used when omitted.
    emphasized_texts:
        Emphasized-text records associated with the element. A new empty list is used when
        omitted.
    text_as_html:
        HTML form of the element, when one is available.

    Raises
    ------
    TypeError
        When `tag` is omitted or is None.
    """
    def __init__(
        self,
        *args: Any,
        tag: Optional[str] = None,
        ancestortags: Sequence[str] = (),
        links: Optional[Sequence[Link]] = None,
        emphasized_texts: Optional[Sequence[dict[str, str]]] = None,
        text_as_html: Optional[str] = None,
        **kwargs: Any,
    ):
        if tag is None:
            raise TypeError("tag argument must be passed and not None")
        self.tag = tag
        self.ancestortags = ancestortags
        # -- a literal `[]` default is created once and would be shared by every instance
        # -- constructed without the argument; build a separate list per instance instead --
        self.links = [] if links is None else links
        self.emphasized_texts = [] if emphasized_texts is None else emphasized_texts
        self.text_as_html = text_as_html
        super().__init__(*args, **kwargs)
 class HTMLText(TagsMixin, Text):
    """Text with tag information."""
 class HTMLAddress(TagsMixin, Address):
    """Address with tag information."""
 class HTMLEmailAddress(TagsMixin, EmailAddress):
    """EmailAddress with tag information."""
 class HTMLTitle(TagsMixin, Title):
    """Title with tag information."""
 class HTMLNarrativeText(TagsMixin, NarrativeText):
    """NarrativeText with tag information."""
 class HTMLListItem(TagsMixin, ListItem):
    """ListItem with tag information."""
 class HTMLTable(TagsMixin, Table):
    """Table with tag information."""
 def has_table_ancestor(element: TagsMixin) -> bool:
    """True when at least one ancestor tag of `element` is a table tag.

    Such an element is considered to be table content rather than a section of narrative text.
    """
    for ancestor_tag in element.ancestortags:
        if ancestor_tag in TABLE_TAGS:
            return True
    return False
 def in_header_or_footer(element: TagsMixin) -> bool:
    """True when `element` has a header or footer tag among its ancestor tags."""
    return any(tag in HEADER_OR_FOOTER_TAGS for tag in element.ancestortags)
 def is_table(element: TagsMixin) -> bool:
    """True when the tag of `element` itself is one of the table tags."""
    own_tag = element.tag
    return own_tag in TABLE_TAGS
 # -- HTML element-tree processing ----------------------------------------------------------------
 class HTMLDocument(XMLDocument):
    """Class for handling HTML documents. Uses rules based parsing to identify sections
    of interest within the document."""
@ -200,7 +119,7 @@ class HTMLDocument(XMLDocument):
                    page.elements.extend(bulleted_text)
                    descendanttag_elems = tuple(tag_elem.iterdescendants())
-                elif is_list_item_tag(tag_elem):
+                elif _is_list_item_tag(tag_elem):
                    element, next_element = _process_list_item(tag_elem)
                    if element is not None:
                        page.elements.append(element)
@ -228,67 +147,170 @@ class HTMLDocument(XMLDocument):
        return pages
    def doc_after_cleaners(
        self,
        skip_headers_and_footers: bool = False,
        skip_table: bool = False,
        inplace: bool = False,
    ) -> HTMLDocument:
        """Filters elements returning new instance based on the criteria specified.
-        Note that the number of pages can change in the case that all elements on a page are
+# -- candidate HTMLDocument methods --------------------------------------------------------------
        filtered out.
        Parameters
        ----------
        skip_table:
            If True, skips table element
        skip_headers_and_footers:
            If True, ignores any content that is within <header> or <footer> tags
        inplace:
            If True, document is modified in place and returned.
        """
        excluders: list[Callable[[TagsMixin], bool]] = []
        if skip_headers_and_footers:
            excluders.append(in_header_or_footer)
        if skip_table:
            excluders.append(is_table)
-        pages: list[Page] = []
+def _find_articles(root: etree._Element, assemble_articles: bool = True) -> list[etree._Element]:
-        page_number = 0
+    """Parse articles from `root` of an HTML document.
-        new_page = Page(number=page_number)
+
-        for page in self.pages:
+    Each `<article>` element in the HTML becomes its own "sub-document" (article). If no article
-            elements: list[Element] = []
+    elements are present, the entire document (`root`) is returned as the single document article.
-            for el in page.elements:
+    """
-                if not isinstance(el, TagsMixin):
+    if assemble_articles is False:
-                    raise ValueError(
+        return [root]
-                        f"elements of class {self.__class__} should be of type HTMLTitle "
+
-                        f"HTMLNarrativeText, or HTMLListItem but "
+    articles = root.findall(".//article")
-                        f"object has an element of type {type(el)}",
+    if len(articles) == 0:
-                    )
+        # NOTE(robinson) - ref: https://schema.org/Article
-                if not any(excluder(el) for excluder in excluders):
+        articles = root.findall(".//div[@itemprop='articleBody']")
-                    elements.append(el)
+    return [root] if len(articles) == 0 else articles
-                if skip_headers_and_footers and "footer" in tuple(el.ancestortags) + (el.tag,):
+
-                    break
+
-            if elements:
+def _find_main(root: etree._Element) -> etree._Element:
-                new_page.elements = elements
+    """The first <main> tag under `root` if it exists, otherwise `root`."""
-                pages.append(new_page)
+    main_tag_elem = root.find(".//main")
-                page_number += 1
+    return main_tag_elem if main_tag_elem is not None else root
-                new_page = Page(number=page_number)
+
-        if inplace:
+
-            self._pages = pages
+# -- tag classifiers -----------------------------------------------------------------------------
-            self._elements = None
+
-            return self
+
-        else:
+def _is_bulleted_table(table_elem: etree._Element) -> bool:
-            out = self.__class__.from_pages(pages)
+    """True when all text in `table_elem` is bulleted text.
-            if not isinstance(out, HTMLDocument):
+
-                # NOTE(robinson) - Skipping for test coverage because this condition is impossible.
+    A table-row containing no text is not considered, but at least one bulleted-text item must be
-                # Added type check because from_pages is a method on Document. Without the type
+    present. A table with no text in any row is not a bulleted table.
-                # check, mypy complains about returning Document instead of HTMLDocument
+    """
-                raise ValueError(
+    if table_elem.tag != "table":
-                    f"Unexpected class: {self.__class__.__name__}",
+        return False
-                )  # pragma: no cover
+
-            return out
+    trs = table_elem.findall(".//tr")
    tr_texts = [_construct_text(tr) for tr in trs]
    # -- a table with no text is not a bulleted table --
    if all(not text for text in tr_texts):
        return False
    # -- all non-empty rows must contain bulleted text --
    if any(text and not is_bulleted_text(text) for text in tr_texts):
        return False
    return True
 def _is_container_with_text(tag_elem: etree._Element) -> bool:
    """True when `tag_elem` is a container tag that has children and also text of its own.

    A container is a section tag or `<body>`. "Text of its own" means non-whitespace text
    either directly inside the tag (before its first child) or in its tail.

    Example
    -------
    <div>Hi there,
        <div>This is my message.</div>
        <div>Please read my message!</div>
    </div>
    """
    container_tags = SECTION_TAGS + ["body"]
    is_container = tag_elem.tag in container_tags
    if not is_container or len(tag_elem) == 0:
        return False
    leading_text = tag_elem.text.strip() if tag_elem.text else None
    trailing_text = tag_elem.tail.strip() if tag_elem.tail else None
    return bool(leading_text or trailing_text)
 def _is_list_item_tag(tag_elem: etree._Element) -> bool:
    """True when `tag_elem` is a list-item tag or a section tag whose text is bulleted."""
    if tag_elem.tag in LIST_ITEM_TAGS:
        return True
    if tag_elem.tag not in SECTION_TAGS:
        return False
    return is_bulleted_text(_construct_text(tag_elem))
 def _is_text_tag(
    tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN
 ) -> bool:
    """True when `tag_elem` potentially contains narrative text.

    An element with more children than `max_predecessor_len` is rejected outright because it
    could be the text representation of a giant div; children that are empty tags do not count
    against that limit.
    """
    empty_child_count = sum(1 for child in tag_elem if child.tag in EMPTY_TAGS)
    if len(tag_elem) > max_predecessor_len + empty_child_count:
        return False
    if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
        return True
    children = list(tag_elem)
    # -- a section tag (or <body>) without children can only hold text directly, so it is a
    # -- potential text tag --
    is_childless_section = tag_elem.tag in SECTION_TAGS + ["body"] and not children
    return is_childless_section or _has_adjacent_bulleted_spans(tag_elem, children)
 # -- tag processors ------------------------------------------------------------------------------
 def _bulleted_text_from_table(table: etree._Element) -> list[Element]:
    """One list-item element for each row of `table` whose text is bulleted.

    NOTE: rows whose text is not bulleted produce nothing, so a table mixing bullets and
    non-bullets loses its non-bullet text here.
    """
    rows = ((row.tag, _construct_text(row)) for row in table.findall(".//tr"))
    return [
        HTMLListItem(text=clean_bullets(row_text), tag=row_tag)
        for row_tag, row_text in rows
        if is_bulleted_text(row_text)
    ]
 def _construct_text(tag_elem: etree._Element, include_tail_text: bool = True) -> str:
    """Extract "clean" text from `tag_elem`.

    Joins every text fragment in and below `tag_elem`, appends the tail text of `tag_elem`
    when `include_tail_text` is True, replaces unicode quotes, and strips surrounding
    whitespace.
    """
    fragments = [str(fragment) for fragment in tag_elem.itertext() if fragment]
    if include_tail_text and tag_elem.tail:
        fragments.append(tag_elem.tail)
    return replace_unicode_quotes("".join(fragments)).strip()
 def _get_bullet_descendants(
    element: Optional[etree._Element], next_element: Optional[etree._Element]
 ) -> tuple[etree._Element, ...]:
    """Helper for list-item processing.

    Collects the descendants of `next_element` so they can be marked visited. The result is
    empty when either `element` or `next_element` is None.
    """
    if element is None or next_element is None:
        return ()
    return tuple(next_element.iterdescendants())
 def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> list[dict[str, str]]:
    """Emphasized text within and below `tag_elem`.

    Emphasis is indicated by `<strong>`, `<em>`, `<span>`, `<b>`, `<i>` tags. `tag_elem` itself
    is considered first, followed by its matching descendants; a tag with no text is skipped.
    Each record has a "text" and a "tag" key.
    """
    tags_to_track = ["strong", "em", "span", "b", "i"]
    candidates = [tag_elem] if tag_elem.tag in tags_to_track else []
    candidates.extend(tag_elem.iterdescendants(*tags_to_track))
    tagged_texts = ((elem.tag, _construct_text(elem, False)) for elem in candidates)
    return [{"text": text, "tag": tag} for tag, text in tagged_texts if text]
 def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]:
@ -313,27 +335,23 @@ def _get_links_from_tag(tag_elem: etree._Element) -> list[Link]:
    return links
-def _is_bulleted_table(table_elem: etree._Element) -> bool:
+def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: list[etree._Element]) -> bool:
-    """True when all text in `table_elem` is bulleted text.
+    """True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.
-    A table-row containing no text is not considered, but at least one bulleted-text item must be
+    A bulleted span is one beginning with a bullet. If there are two or more adjacent to each other
-    present. A table with no text in any row is not a bulleted table.
+    they are treated as a single bulleted text element.
    """
-    if table_elem.tag != "table":
+    if tag_elem.tag in SECTION_TAGS:
-        return False
+        all_spans = all(child.tag == "span" for child in children)
        _is_bulleted = children[0].text is not None and is_bulleted_text(children[0].text)
        if all_spans and _is_bulleted:
            return True
    return False
    trs = table_elem.findall(".//tr")
    tr_texts = [_construct_text(tr) for tr in trs]
-    # -- a table with no text is not a bulleted table --
+def _has_break_tags(tag_elem: etree._Element) -> bool:
-    if all(not text for text in tr_texts):
+    """True when `tag_elem` contains a `<br>` descendant."""
-        return False
+    return any(descendant.tag in TEXTBREAK_TAGS for descendant in tag_elem.iterdescendants())
    # -- all non-empty rows must contain bulleted text --
    if any(text and not is_bulleted_text(text) for text in tr_texts):
        return False
    return True
 def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Element]:
@ -378,27 +396,6 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
    )
 def _get_emphasized_texts_from_tag(tag_elem: etree._Element) -> list[dict[str, str]]:
    """Emphasized text within and below `tag_elem`.

    Emphasis is indicated by `<strong>`, `<em>`, `<span>`, `<b>`, `<i>` tags. `tag_elem` itself
    is considered first, followed by its matching descendants; a tag with no text is skipped.
    Each record has a "text" and a "tag" key.
    """
    tags_to_track = ["strong", "em", "span", "b", "i"]
    candidates = [tag_elem] if tag_elem.tag in tags_to_track else []
    candidates.extend(tag_elem.iterdescendants(*tags_to_track))
    tagged_texts = ((elem.tag, _construct_text(elem, False)) for elem in candidates)
    return [{"text": text, "tag": tag} for tag, text in tagged_texts if text]
 def _parse_tag(
    tag_elem: etree._Element,
    include_tail_text: bool = True,
@ -440,193 +437,6 @@ def _parse_tag(
    )
 def _text_to_element(
    text: str,
    tag: str,
    ancestortags: tuple[str, ...],
    depth: int,
    links: Optional[list[Link]] = None,
    emphasized_texts: Optional[list[dict[str, str]]] = None,
 ) -> Optional[Element]:
    """Produce a document-element of the appropriate sub-type for `text`.

    The checks run in this order and the first match wins: bulleted text, US city/state/zip
    address, email address, narrative text, title (heading tag or title-like text), and finally
    generic text.

    Parameters
    ----------
    text:
        Text of the element.
    tag:
        HTML tag the text came from.
    ancestortags:
        Tags of the ancestors of that HTML tag.
    depth:
        Stored as `category_depth` metadata on list-item and title elements.
    links:
        Links to attach to the element. A new empty list is used when omitted.
    emphasized_texts:
        Emphasized-text records to attach to the element. A new empty list is used when omitted.

    Returns
    -------
    The new element, or None when bulleted text is empty after its bullet is removed or when
    `text` is shorter than two characters and is not bulleted, an address, or an email address.
    """
    # -- these lists are stored on the returned element, so a literal `[]` default would be
    # -- shared by every element created without them; make a new list on each call --
    links = [] if links is None else links
    emphasized_texts = [] if emphasized_texts is None else emphasized_texts

    if is_bulleted_text(text):
        clean_text = clean_bullets(text)
        if not clean_text:
            return None
        return HTMLListItem(
            text=clean_text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
            metadata=ElementMetadata(category_depth=depth),
        )

    if is_us_city_state_zip(text):
        return HTMLAddress(
            text=text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
        )

    if is_email_address(text):
        # -- ancestortags is passed like every other branch so ancestor-based checks (e.g.
        # -- header/footer detection) also work for email-address elements --
        return HTMLEmailAddress(
            text=text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
        )

    if len(text) < 2:
        return None

    if is_narrative_tag(text, tag):
        return HTMLNarrativeText(
            text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
        )

    if is_heading_tag(tag) or is_possible_title(text):
        return HTMLTitle(
            text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
            metadata=ElementMetadata(category_depth=depth),
        )

    return HTMLText(
        text,
        tag=tag,
        ancestortags=ancestortags,
        links=links,
        emphasized_texts=emphasized_texts,
    )
 def _is_container_with_text(tag_elem: etree._Element) -> bool:
    """True when `tag_elem` is a container tag that has children and also text of its own.

    A container is a section tag or `<body>`. "Text of its own" means non-whitespace text
    either directly inside the tag (before its first child) or in its tail.

    Example
    -------
    <div>Hi there,
        <div>This is my message.</div>
        <div>Please read my message!</div>
    </div>
    """
    container_tags = SECTION_TAGS + ["body"]
    is_container = tag_elem.tag in container_tags
    if not is_container or len(tag_elem) == 0:
        return False
    leading_text = tag_elem.text.strip() if tag_elem.text else None
    trailing_text = tag_elem.tail.strip() if tag_elem.tail else None
    return bool(leading_text or trailing_text)
 def is_narrative_tag(text: str, tag: str) -> bool:
    """True when `tag` is not a heading tag and `text` is possibly narrative text."""
    if tag in HEADING_TAGS:
        return False
    return is_possible_narrative_text(text)
 def is_heading_tag(tag: str) -> bool:
    """True when `tag` is one of the heading tags."""
    is_heading = tag in HEADING_TAGS
    return is_heading
 def _construct_text(tag_elem: etree._Element, include_tail_text: bool = True) -> str:
    """Extract "clean" text from `tag_elem`.

    Joins every text fragment in and below `tag_elem`, appends the tail text of `tag_elem`
    when `include_tail_text` is True, replaces unicode quotes, and strips surrounding
    whitespace.
    """
    fragments = [str(fragment) for fragment in tag_elem.itertext() if fragment]
    if include_tail_text and tag_elem.tail:
        fragments.append(tag_elem.tail)
    return replace_unicode_quotes("".join(fragments)).strip()
 def _has_break_tags(tag_elem: etree._Element) -> bool:
    """True when `tag_elem` has a descendant whose tag is a text-break tag such as `<br>`."""
    for descendant in tag_elem.iterdescendants():
        if descendant.tag in TEXTBREAK_TAGS:
            return True
    return False
 def _unfurl_break_tags(tag_elem: etree._Element) -> list[etree._Element]:
    """Sequence of `tag_elem` and its children with `<br>` elements removed.

    NOTE that the elements created here to carry text are "loose" `etree._Element` instances
    that are NOT linked to the original HTML element-tree, so methods like `.getchildren()`,
    `.find()` etc. will happily produce empty results on them. Children that contain no break
    tag are appended as-is.
    """

    def text_only_copy(elem: etree._Element) -> etree._Element:
        """New detached element having the tag and leading text of `elem`."""
        loose_elem = etree.Element(elem.tag)
        loose_elem.text = elem.text
        return loose_elem

    unfurled: list[etree._Element] = []
    if tag_elem.text:
        unfurled.append(text_only_copy(tag_elem))

    for child in tag_elem:
        if _has_break_tags(child):
            # NOTE(review): the recursive call below also starts by emitting a text-only copy
            # for `child.text`, so that text looks to be emitted twice - confirm this is intended
            if child.text:
                unfurled.append(text_only_copy(child))
            unfurled.extend(_unfurl_break_tags(child))
        else:
            unfurled.append(child)

    return unfurled
 def _is_text_tag(
    tag_elem: etree._Element, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN
 ) -> bool:
    """True when `tag_elem` potentially contains narrative text.

    An element with more children than `max_predecessor_len` is rejected outright because it
    could be the text representation of a giant div; children that are empty tags do not count
    against that limit.
    """
    empty_child_count = sum(1 for child in tag_elem if child.tag in EMPTY_TAGS)
    if len(tag_elem) > max_predecessor_len + empty_child_count:
        return False
    if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS:
        return True
    children = list(tag_elem)
    # -- a section tag (or <body>) without children can only hold text directly, so it is a
    # -- potential text tag --
    is_childless_section = tag_elem.tag in SECTION_TAGS + ["body"] and not children
    return is_childless_section or _has_adjacent_bulleted_spans(tag_elem, children)
 def _process_text_tag(
    tag_elem: etree._Element, include_tail_text: bool = True
 ) -> tuple[list[Element], tuple[etree._Element, ...]]:
    """Produce document elements from `tag_elem`.

    When `tag_elem` contains break tags it is first unfurled and each resulting piece is parsed
    separately; otherwise `tag_elem` is parsed as a whole. Pieces that parse to None are
    dropped. The second item of the result is every descendant of `tag_elem`.
    """
    candidates = _unfurl_break_tags(tag_elem) if _has_break_tags(tag_elem) else [tag_elem]
    parsed = (_parse_tag(candidate, include_tail_text) for candidate in candidates)
    page_elements: list[Element] = [element for element in parsed if element is not None]
    return page_elements, tuple(tag_elem.iterdescendants())
 def _process_list_item(
    tag_elem: etree._Element,
    max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
@ -671,69 +481,131 @@ def _process_list_item(
    return None, None
-def _get_bullet_descendants(
+def _process_text_tag(
-    element: Optional[etree._Element], next_element: Optional[etree._Element]
+    tag_elem: etree._Element, include_tail_text: bool = True
-) -> tuple[etree._Element, ...]:
+) -> tuple[list[Element], tuple[etree._Element, ...]]:
-    """Helper for list-item processing.
+    """Produces a document element from `tag_elem`."""
-    Gathers the descendants of `next_element` so they can be marked visited.
+    page_elements: list[Element] = []
    if _has_break_tags(tag_elem):
        flattened_elems = _unfurl_break_tags(tag_elem)
        for _tag_elem in flattened_elems:
            element = _parse_tag(_tag_elem, include_tail_text)
            if element is not None:
                page_elements.append(element)
    else:
        element = _parse_tag(tag_elem, include_tail_text)
        if element is not None:
            page_elements.append(element)
    descendant_tag_elems = tuple(tag_elem.iterdescendants())
    return page_elements, descendant_tag_elems
 def _unfurl_break_tags(tag_elem: etree._Element) -> list[etree._Element]:
    """Sequence of `tag_elem` and its children with `<br>` elements removed.
    NOTE that these are "loose" `etree._Element` instances that are NOT linked to the original HTML
    element-tree, so methods like `.getchildren()`, `.find()` etc. will happily produce empty
    results.
    """
-    return () if element is None or next_element is None else tuple(next_element.iterdescendants())
+    unfurled: list[etree._Element] = []
    if tag_elem.text:
        _tag_elem = etree.Element(tag_elem.tag)
        _tag_elem.text = tag_elem.text
        unfurled.append(_tag_elem)
    for child in tag_elem:
        if not _has_break_tags(child):
            unfurled.append(child)
        else:
            if child.text:
                _tag_elem = etree.Element(child.tag)
                _tag_elem.text = child.text
                unfurled.append(_tag_elem)
            unfurled.extend(_unfurl_break_tags(child))
    return unfurled
-def is_list_item_tag(tag_elem: etree._Element) -> bool:
+# -- text-element classifier ---------------------------------------------------------------------
    """True when `tag_elem` contains bulleted text."""
    return tag_elem.tag in LIST_ITEM_TAGS or (
        tag_elem.tag in SECTION_TAGS and is_bulleted_text(_construct_text(tag_elem))
    )
-def _bulleted_text_from_table(table: etree._Element) -> list[Element]:
+def _text_to_element(
-    """Extracts bulletized narrative text from the `<table>` element in `table`.
+    text: str,
    tag: str,
    ancestortags: tuple[str, ...],
    depth: int,
    links: list[Link] = [],
    emphasized_texts: list[dict[str, str]] = [],
 ) -> Optional[Element]:
    """Produce a document-element of the appropriate sub-type for `text`."""
    if is_bulleted_text(text):
        if not clean_bullets(text):
            return None
        return HTMLListItem(
            text=clean_bullets(text),
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
            metadata=ElementMetadata(category_depth=depth),
        )
    elif is_us_city_state_zip(text):
        return HTMLAddress(
            text=text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
        )
    elif is_email_address(text):
        return HTMLEmailAddress(
            text=text,
            tag=tag,
            links=links,
            emphasized_texts=emphasized_texts,
        )
-    NOTE: if a table has mixed bullets and non-bullets, only bullets are extracted. I.e., _read()
+    if len(text) < 2:
-    will drop non-bullet narrative text in the table.
+        return None
-    """
+    elif _is_narrative_tag(text, tag):
-    bulleted_text: list[Element] = []
+        return HTMLNarrativeText(
-    rows = table.findall(".//tr")
+            text,
-    for row in rows:
+            tag=tag,
-        text = _construct_text(row)
+            ancestortags=ancestortags,
-        if is_bulleted_text(text):
+            links=links,
-            bulleted_text.append(HTMLListItem(text=clean_bullets(text), tag=row.tag))
+            emphasized_texts=emphasized_texts,
-    return bulleted_text
+        )
    elif _is_heading_tag(tag) or is_possible_title(text):
        return HTMLTitle(
            text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
            metadata=ElementMetadata(category_depth=depth),
        )
    else:
        return HTMLText(
            text,
            tag=tag,
            ancestortags=ancestortags,
            links=links,
            emphasized_texts=emphasized_texts,
        )
-def _has_adjacent_bulleted_spans(tag_elem: etree._Element, children: list[etree._Element]) -> bool:
+# -- HTML-specific text classifiers --------------------------------------------------------------
    """True when `tag_elem` is a <div> or <pre> containing two or more adjacent bulleted spans.
    A bulleted span is one beginning with a bullet. If there are two or more adjacent to each other
    they are treated as a single bulleted text element.
    """
    if tag_elem.tag in SECTION_TAGS:
        all_spans = all(child.tag == "span" for child in children)
        _is_bulleted = children[0].text is not None and is_bulleted_text(children[0].text)
        if all_spans and _is_bulleted:
            return True
    return False
-def _find_main(root: etree._Element) -> etree._Element:
+def _is_heading_tag(tag: str) -> bool:
-    """The first <main> tag under `root` if it exists, othewise `root`."""
+    """Uses tag information to infer whether text is a heading."""
-    main_tag_elem = root.find(".//main")
+    return tag in HEADING_TAGS
    return main_tag_elem if main_tag_elem is not None else root
-def _find_articles(root: etree._Element, assemble_articles: bool = True) -> list[etree._Element]:
+def _is_narrative_tag(text: str, tag: str) -> bool:
-    """Parse articles from `root` of an HTML document.
+    """Uses tag information to infer whether text is narrative."""
-
+    return tag not in HEADING_TAGS and is_possible_narrative_text(text)
    Each `<article>` element in the HTML becomes its own "sub-document" (article). If no article
    elements are present, the entire document (`root`) is returned as the single document article.
    """
    if assemble_articles is False:
        return [root]
    articles = root.findall(".//article")
    if len(articles) == 0:
        # NOTE(robinson) - ref: https://schema.org/Article
        articles = root.findall(".//div[@itemprop='articleBody']")
    return [root] if len(articles) == 0 else articles
--- a/unstructured/documents/html_elements.py
+++ b/unstructured/documents/html_elements.py
@ -0,0 +1,68 @@
 """Document elements specific to the HTML partitioner."""
 from __future__ import annotations
 from typing import Any, Dict, Optional, Sequence
 from unstructured.documents.elements import (
    Address,
    EmailAddress,
    Link,
    ListItem,
    NarrativeText,
    Table,
    Text,
    Title,
 )
 class TagsMixin:
    """Mixin that allows a class to retain tag information.

    All tag-related values are keyword-only. Positional arguments and any remaining keyword
    arguments are forwarded unchanged to the next `__init__()` in the method-resolution order.
    """
    def __init__(
        self,
        *args: Any,
        tag: Optional[str] = None,
        ancestortags: Sequence[str] = (),
        links: Optional[Sequence[Link]] = None,
        emphasized_texts: Optional[Sequence[Dict[str, str]]] = None,
        text_as_html: Optional[str] = None,
        **kwargs: Any,
    ):
        """Record tag information on the instance, then continue the `__init__()` chain.

        Args:
            tag: Tag name for this element. Required even though it is declared with a default.
            ancestortags: Tags of the enclosing elements; stored as given.
            links: Links for this element. A new empty list is used when omitted or `None`.
            emphasized_texts: Emphasized-text records for this element. A new empty list is used
                when omitted or `None`.
            text_as_html: Optional HTML rendering of the element text; stored as given.

        Raises:
            TypeError: when `tag` is omitted or `None`.
        """
        if tag is None:
            raise TypeError("tag argument must be passed and not None")
        self.tag = tag
        self.ancestortags = ancestortags
        # -- Build a fresh list per instance. A `[]` default in the signature is created once and
        # -- would be shared (and mutated) by every instance constructed without the argument.
        self.links = [] if links is None else links
        self.emphasized_texts = [] if emphasized_texts is None else emphasized_texts
        self.text_as_html = text_as_html
        super().__init__(*args, **kwargs)
 class HTMLText(TagsMixin, Text):
    """Text element that also retains the tag information recorded by `TagsMixin`."""
 class HTMLAddress(TagsMixin, Address):
    """Address element that also retains the tag information recorded by `TagsMixin`."""
 class HTMLEmailAddress(TagsMixin, EmailAddress):
    """EmailAddress element that also retains the tag information recorded by `TagsMixin`."""
 class HTMLTitle(TagsMixin, Title):
    """Title element that also retains the tag information recorded by `TagsMixin`."""
 class HTMLNarrativeText(TagsMixin, NarrativeText):
    """NarrativeText element that also retains the tag information recorded by `TagsMixin`."""
 class HTMLListItem(TagsMixin, ListItem):
    """ListItem element that also retains the tag information recorded by `TagsMixin`."""
 class HTMLTable(TagsMixin, Table):
    """Table element that also retains the tag information recorded by `TagsMixin`."""
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@ -6,7 +6,8 @@ import requests
 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
-from unstructured.documents.html import HTMLDocument, TagsMixin
+from unstructured.documents.html import HTMLDocument
 from unstructured.documents.html_elements import TagsMixin
 from unstructured.documents.xml import VALID_PARSERS
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.file_conversion import convert_file_to_html_text
`@ -1,4 +1,4 @@`
	`## 0.14.5-dev5`	`## 0.14.5-dev6`

	`### Enhancements`	`### Enhancements`
`@ -1 +1 @@`
	`__version__ = "0.14.5-dev5" # pragma: no cover`	`__version__ = "0.14.5-dev6" # pragma: no cover`