unstructured/test_unstructured/partition/html/test_parser.py

# pyright: reportPrivateUsage=false
# pyright: reportUnknownArgumentType=false

"""Test suite for `unstructured.partition.html.parser` module."""

from __future__ import annotations

from collections import deque

import pytest
from lxml import etree

from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title
from unstructured.partition.html.parser import (
    Anchor,
    Annotation,
    DefaultElement,
    Flow,
    Phrasing,
    RemovedPhrasing,
    TextSegment,
    _consolidate_annotations,
    _normalize_text,
    html_parser,
)

# -- MODULE-LEVEL FUNCTIONS ----------------------------------------------------------------------

# -- _consolidate_annotations() ------------------


def it_gathers_annotations_from_text_segments():
    text_segments = [
        TextSegment(
            " Ford Prefect ",
            {
                "link_texts": "Ford Prefect",
                "link_url": "https://wikipedia/Ford_Prefect",
                "emphasized_text_contents": "Ford Prefect",
                "emphasized_text_tags": "b",
            },
        ),
        TextSegment(
            " alien  encounter",
            {
                "emphasized_text_contents": "alien encounter",
                "emphasized_text_tags": "bi",
            },
        ),
    ]

    annotations = _consolidate_annotations(text_segments)

    assert annotations == {
        # -- each distinct key gets a list of values --
        "emphasized_text_contents": ["Ford Prefect", "alien encounter"],
        "emphasized_text_tags": ["b", "bi"],
        # -- even when there is only one value --
        "link_texts": ["Ford Prefect"],
        "link_url": ["https://wikipedia/Ford_Prefect"],
    }
    # -- and the annotations mapping is immutable --
    with pytest.raises(TypeError, match="object does not support item assignment"):
        annotations["new_key"] = "foobar"  # pyright: ignore[reportIndexIssue]
    # -- (but not its list values unfortunately) --
    annotations["emphasized_text_tags"].append("xyz")
    assert annotations["emphasized_text_tags"] == ["b", "bi", "xyz"]


# -- _normalize_text() ---------------------------


@pytest.mark.parametrize(
    ("text", "expected_value"),
    [
        # -- already normalized text is left unchanged --
        ("iterators allow", "iterators allow"),
        # -- newlines are treated as whitespace --
        ("algorithm\nto   be", "algorithm to be"),
        ("  separated\n  from  ", "separated from"),
        ("\n container\n details\n ", "container details"),
        (
            "\n  iterators  allow \n algorithm to be   \nexpressed  without container  \nnoise",
            "iterators allow algorithm to be expressed without container noise",
        ),
    ],
)
def test_normalize_text_produces_normalized_text(text: str, expected_value: str):
    assert _normalize_text(text) == expected_value


# -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------


class DescribeFlow:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Flow`.

    The `Flow` class provides most behaviors for flow (block-level) elements.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_knows_it_is_NOT_a_phrasing_element(self):
        p = etree.fromstring("<p>Hello</p>", html_parser).xpath(".//p")[0]

        assert isinstance(p, Flow)
        assert p.is_phrasing is False

    # -- .iter_elements() -------------------------------------------------

    def it_generates_the_document_elements_from_the_Flow_element(self):
        """Phrasing siblings of child block elements are processed with text or tail.

        In the general case, a Flow element can contain text, phrasing content, and child flow
        elements.

        Each of these five lines in this example is a "paragraph" and gives rise to a distinct
        document-element.
        """
        html_text = """
          <div>
            Text of div <b>with <i>hierarchical</i>\nphrasing</b> content before first block item
            <p>Click <a href="http://blurb.io">here</a> to see the blurb for this block item. </p>
            tail of block item <b>with <i>hierarchical</i> phrasing </b> content
            <p>second block item</p>
            tail of block item <b>with <i>  hierarchical  </i></b> phrasing content
          </div>
        """
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div.iter_elements()

        e = next(elements)
        assert e == Title("Text of div with hierarchical phrasing content before first block item")
        assert e.metadata.to_dict() == {
            "category_depth": 0,
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        e = next(elements)
        assert e == NarrativeText("Click here to see the blurb for this block item.")
        assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
        e = next(elements)
        assert e == Title("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "category_depth": 0,
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        e = next(elements)
        assert e == Title("second block item")
        assert e.metadata.to_dict() == {"category_depth": 0}
        e = next(elements)
        assert e == Title("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "category_depth": 0,
            "emphasized_text_contents": ["with", "hierarchical"],
            "emphasized_text_tags": ["b", "bi"],
        }
        with pytest.raises(StopIteration):
            e = next(elements)

    # -- ._category_depth() -----------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "tag", "ElementCls", "expected_value"),
        [
            ("<p>Ford... you're turning into a penguin. Stop it.<p>", "p", Text, None),
            ("<p>* thanks for all the fish.</p>", "p", ListItem, 0),
            ("<li>thanks for all the fish.</li>", "li", ListItem, 0),
            ("<ul><li>So long</li><li>and thanks for all the fish.</li></ul>", "li", ListItem, 1),
            ("<dl><dd>So long<ol><li>and thanks for the fish.</li></ol></ul>", "li", ListItem, 2),
            ("<p>Examples</p>", "p", Title, 0),
            ("<h1>Examples</h1>", "h1", Title, 0),
            ("<h2>Examples</h2>", "h2", Title, 1),
            ("<h3>Examples</h3>", "h3", Title, 2),
            ("<h4>Examples</h4>", "h4", Title, 3),
            ("<h5>Examples</h5>", "h5", Title, 4),
            ("<h6>Examples</h6>", "h6", Title, 5),
        ],
    )
    def it_computes_the_category_depth_to_help(
        self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None
    ):
        e = etree.fromstring(html_text, html_parser).xpath(f".//{tag}")[0]
        assert e._category_depth(ElementCls) == expected_value

    # -- ._element_from_text_or_tail() ------------------------------------

    def it_assembles_text_and_tail_document_elements_to_help(self):
        """Text and tails and their phrasing content are both processed the same way."""
        html_text = "<div>The \n Roman <b>poet <i>   Virgil</i> gave</b> his <q>pet</q> fly</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        e = next(elements)
        # -- element text is normalized --
        assert e == Text("The Roman poet Virgil gave his pet fly")
        # -- individual annotations are consolidated --
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["poet", "Virgil", "gave"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }

    def but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained(self):
        html_text = "<div>   <b> \n <i>  \n </i>  </b>   <q> \n </q> \n  </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        with pytest.raises(StopIteration):
            next(elements)

    def it_uses_the_specified_element_class_to_form_the_document_element(self):
        html_text = "<div>\n  The line-storm clouds fly tattered and swift\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Address)

        e = next(elements)
        assert e == Address("The line-storm clouds fly tattered and swift")
        assert e.metadata.to_dict() == {}
        with pytest.raises(StopIteration):
            next(elements)

    def and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified(self):
        html_text = "<div>\n  The line-storm clouds fly tattered and swift,\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        assert next(elements) == NarrativeText("The line-storm clouds fly tattered and swift,")

    def but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained(self):
        html_text = "<div> * </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        with pytest.raises(StopIteration):
            next(elements)

    # -- ._iter_text_segments() -------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            (  # -- text with no phrasing --
                "<p>Ford... you're turning into a penguin.<p>",
                [("Ford... you're turning into a penguin.", {})],
            ),
            (  # -- text with phrasing --
                "<p>Ford... <b>you're turning</b> into\na <i>penguin</i>.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're turning",
                        {"emphasized_text_contents": "you're turning", "emphasized_text_tags": "b"},
                    ),
                    (" into\na ", {}),
                    (
                        "penguin",
                        {"emphasized_text_contents": "penguin", "emphasized_text_tags": "i"},
                    ),
                    (".", {}),
                ],
            ),
            (  # -- text with nested phrasing --
                "<p>Ford... <b>you're <i>turning</i></b> into a penguin.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're ",
                        {"emphasized_text_contents": "you're", "emphasized_text_tags": "b"},
                    ),
                    (
                        "turning",
                        {"emphasized_text_contents": "turning", "emphasized_text_tags": "bi"},
                    ),
                    (" into a penguin.", {}),
                ],
            ),
        ],
    )
    def it_recursively_generates_text_segments_from_text_and_phrasing_to_help(
        self, html_text: str, expected_value: list[Annotation]
    ):
        p = etree.fromstring(html_text, html_parser).xpath(".//p")[0]
        text_segments = list(p._iter_text_segments(p.text, deque(p)))

        assert text_segments == expected_value


class DescribePre:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Pre`.

    The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
    """

    def it_preserves_the_whitespace_of_its_phrasing_only_contents(self):
        """A `<pre>` element can contain only phrasing content."""
        html_text = (
            "<pre>\n"
            "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
            "  Is... Forty-two, said Deep Thought, with infinite majesty and calm.\n"
            "</pre>\n"
        )
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

        elements = pre.iter_elements()

        e = next(elements)
        assert e == Text(
            "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
            "  Is... Forty-two, said Deep Thought, with infinite majesty and calm."
        )
        with pytest.raises(StopIteration):
            next(elements)

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- a newline in the 0th position of pre.text is dropped --
            ("<pre>\n  foo  </pre>", "  foo  "),
            # -- but not when preceded by any other whitespace --
            ("<pre> \n  foo  </pre>", " \n  foo  "),
            # -- and only one is dropped --
            ("<pre>\n\n  foo  </pre>", "\n  foo  "),
            # -- a newline in the -1th position is dropped --
            ("<pre>  foo  \n</pre>", "  foo  "),
            # -- but not when followed by any other whitespace --
            ("<pre>  foo  \n </pre>", "  foo  \n "),
            # -- and only one is dropped --
            ("<pre>  foo  \n\n</pre>", "  foo  \n"),
            # -- a newline in both positions are both dropped --
            ("<pre>\n  foo  \n</pre>", "  foo  "),
            # -- or not when not at the absolute edge --
            ("<pre> \n  foo  \n </pre>", " \n  foo  \n "),
        ],
    )
    def but_it_strips_a_single_leading_or_trailing_newline(
        self, html_text: str, expected_value: str
    ):
        """Content starts on next line when opening `<pre>` tag is immediately followed by `\n`"""
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]
        e = next(pre.iter_elements())

        assert e.text == expected_value

    def it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elements(self):
        html_text = '<pre>You\'re <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>'
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

        e = next(pre.iter_elements())

        assert e.text == "You're turning into a penguin."
        assert e.metadata.emphasized_text_contents == ["turning"]
        assert e.metadata.emphasized_text_tags == ["b"]
        assert e.metadata.link_texts == ["penguin"]
        assert e.metadata.link_urls == ["http://eie.io"]


class DescribeRemovedBlock:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.

    This class is used for block level items we want to skip like `<hr/>` and `<figure>`.
    """

    def it_is_skipped_during_parsing(self):
        html_text = """
          <div>
            <hr/>
            <figure>
              <img src="/media/cc0-images/elephant-660-480.jpg" alt="Elephant at sunset" />
              <figcaption>An elephant at sunset</figcaption>
            </figure>
            <p>Content we want.</p>
          </div>
          """
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        assert list(div.iter_elements()) == [NarrativeText("Content we want.")]


# -- PHRASING (INLINE) ELEMENTS ------------------------------------------------------------------


class DescribePhrasing:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Phrasing`.

    The `Phrasing` class provides most behaviors for phrasing (inline) elements.
    """

    def it_knows_it_is_a_phrasing_element(self):
        b = etree.fromstring("<b>Hello</b>", html_parser).xpath(".//b")[0]

        assert isinstance(b, Phrasing)
        assert b.is_phrasing is True

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- an empty element produces no text segments --
            ("<code></code>", []),
            # -- element text produces one segment --
            ("<data> foo </data>", [(" foo ", {})]),
            # -- element tail produces one segment --
            ("<dfn/> bar ", [(" bar ", {})]),
            # -- element descendants each produce one segment --
            ("<kbd><mark>foo <meter>bar</meter></mark></kbd>", [("foo ", {}), ("bar", {})]),
            # -- and any combination produces a segment for each text, child, and tail --
            (
                "<kbd> <mark>foo <meter>bar</meter> baz</mark> </kbd>",
                [
                    (" ", {}),
                    ("foo ", {}),
                    ("bar", {}),
                    (" baz", {}),
                    (" ", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_for_its_text_and_children_and_tail(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0]
        assert list(e.iter_text_segments()) == expected_value

    def it_forms_its_annotations_from_emphasis(self):
        cite = etree.fromstring("<cite>  rhombus </cite>", html_parser).xpath(".//cite")[0]
        assert cite._annotation(cite.text, "bi") == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "bi",
        }

    def but_not_when_text_is_empty_or_whitespace(self):
        cite = etree.fromstring("<cite>   </cite>", html_parser).xpath(".//cite")[0]
        assert cite._annotation(cite.text, "bi") == {}

    def and_not_when_there_is_no_emphasis(self):
        cite = etree.fromstring("<cite>rhombus</cite>", html_parser).xpath(".//cite")[0]
        assert cite._annotation(cite.text, "") == {}

    def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis(self):
        abbr = etree.fromstring("<abbr>LLM</abbr>", html_parser).xpath(".//abbr")[0]
        assert abbr._inside_emphasis("xyz") == "xyz"


class DescribeBold:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Bold`.

    The `Bold` class is used for `<b>` and `<strong>` tags and adds emphasis metadata.
    """

    def it_annotates_its_text_segment_with_bold_emphasis(self):
        b = etree.fromstring("<b>rhombus</b>", html_parser).xpath(".//b")[0]

        text_segments = b.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }

    def and_its_children_are_also_annotated_with_bold_emphasis(self):
        b = etree.fromstring("<b>rhombus <i>pentagon</i></b>", html_parser).xpath(".//b")[0]

        text_segments = b.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus "
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }
        text, annotation = next(text_segments)
        assert text == "pentagon"
        assert annotation == {
            "emphasized_text_contents": "pentagon",
            "emphasized_text_tags": "bi",
        }

    def but_not_its_tail(self):
        b = etree.fromstring("<b>rhombus</b> pentagon", html_parser).xpath(".//b")[0]

        text_segments = b.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }
        text, annotation = next(text_segments)
        assert text == " pentagon"
        assert annotation == {}


class DescribeItalic:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Italic`.

    The `Italic` class is used for `<i>` and `<em>` tags and adds emphasis metadata.
    """

    def it_annotates_its_text_segment_with_italic_emphasis(self):
        i = etree.fromstring("<i>rhombus</i>", html_parser).xpath(".//i")[0]

        text_segments = i.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }

    def and_its_children_are_also_annotated_with_italic_emphasis(self):
        em = etree.fromstring("<em>rhombus <b>pentagon</b></em>", html_parser).xpath(".//em")[0]

        text_segments = em.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus "
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }
        text, annotation = next(text_segments)
        assert text == "pentagon"
        assert annotation == {
            "emphasized_text_contents": "pentagon",
            "emphasized_text_tags": "bi",
        }

    def but_not_its_tail(self):
        i = etree.fromstring("<i>rhombus</i> pentagon", html_parser).xpath(".//i")[0]

        text_segments = i.iter_text_segments()

        text, annotation = next(text_segments)
        assert text == "rhombus"
        assert annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }
        text, annotation = next(text_segments)
        assert text == " pentagon"
        assert annotation == {}


class DescribeLineBreak:
    """Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`.

    Used for `<br/>` elements, it's only special behavior is to add whitespace such that phrasing
    butted up tight on both sides of the `<br/>` element is not joined, like `abc<br/>def` should
    become "abc def", not "abcdef".
    """

    def it_adds_a_newline_in_its_place(self):
        cite = etree.fromstring(
            "<cite>spaceships of the<br/>Vogon Constructor Fleet</cite>", html_parser
        ).xpath(".//cite")[0]

        text_segments = cite.iter_text_segments()

        texts = [ts.text for ts in text_segments]
        assert texts == ["spaceships of the", "\n", "Vogon Constructor Fleet"]
        assert _normalize_text("".join(texts)) == "spaceships of the Vogon Constructor Fleet"


class DescribeRemovedPhrasing:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`.

    Used for phrasing elements like `<label>` that we want to skip, including any content they
    enclose. The tail of such an element is not skipped though.
    """

    def it_behaves_like_an_empty_element(self):
        label = etree.fromstring(
            "<div>\n"
            "  <label>Space<p>is big</p>, <b>mind-bogglingly</b> big.</label>\n"
            "  Like vastly, hugely big.\n"
            "</div>",
            html_parser,
        ).xpath(".//label")[0]

        (text_segment,) = list(label.iter_text_segments())

        assert isinstance(label, RemovedPhrasing)
        assert label.is_phrasing is True
        assert text_segment.text == "\n  Like vastly, hugely big.\n"


# -- DUAL-ROLE ELEMENTS --------------------------------------------------------------------------


class DescribeAnchor:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Anchor`.

    The `Anchor` class is used for `<a>` tags and provides link metadata.
    """

    # -- .is_phrasing -----------------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- an empty <a> identifies as phrasing --
            ('<a href="http://eie.io"></a>', True),
            # -- an <a> with text but no children identifies as phrasing --
            ('<a href="http://eie.io">“O Deep Thought computer," he said,</a>', True),
            # -- an <a> with no text and only phrasing children identifies as phrasing --
            ('<a href="http://eie.io"><i>“O Deep Thought computer,"</i></a>', True),
            # -- an <a> with both text and phrasing children identifies as phrasing --
            ('<a href="http://eie.io">“O <b>Deep Thought</b> computer,"</a>', True),
            # -- but an <a> with a block-item child does not --
            ('<a href="http://eie.io"><p>“O Deep Thought computer,"</p></a>', False),
            # -- and an <a> with both text and a block-item child does not --
            ('<a href="http://eie.io">“O Deep Thought computer,"<div>he said,</div></a>', False),
            # -- and an <a> with text and both block and phrasing children does not --
            ('<a href="http://eie.io">“O <b>Deep</b> Thought <div>computer," he</div></a>', False),
        ],
    )
    def it_determines_whether_it_is_phrasing_dynamically(
        self, html_text: str, expected_value: bool
    ):
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert isinstance(a, Anchor)
        assert a.is_phrasing is expected_value

    # -- .iter_elements() -------------------------------------------------

    def it_can_also_act_as_a_block_item(self):
        html_text = """
        <div>
          <a href="http://eie.io">
            O Deep Thought computer, he said,
            <div>The task we have designed you to perform is this.</div>
            <p>We want you to tell us.... he paused,</p>
          </a>
        </div>
        """
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        elements = a.iter_elements()

        assert [e.text for e in elements] == [
            "O Deep Thought computer, he said,",
            "The task we have designed you to perform is this.",
            "We want you to tell us.... he paused,",
        ]

    # -- .iter_text_segments() --------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- produces no text-segment or annotation for anchor.text when there is none --
            ('<a href="http://abc.com"></a>', []),
            # -- but it produces a text-segment for the tail if there is one --
            ('<a href="http://abc.com"></a> long tail ', [TextSegment(" long tail ", {})]),
            # -- produces text-segment but no annotation for anchor.text when it is whitespace --
            ('<a href="http://abc.com">  </a>', [TextSegment("  ", {})]),
            # -- produces text-segment and annotation for anchor text
            # -- Note link-texts annotation is whitespace-normalized but text-segment text is not.
            (
                '<a href="http://abc.com"> click here </a>',
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    )
                ],
            ),
            # -- produces text-segment for both text and tail when present --
            (
                '<a href="http://abc.com"> click here </a> long tail',
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    ),
                    TextSegment(" long tail", {}),
                ],
            ),
            # -- nested phrasing inside <a> element is handled as expected --
            (
                '<p>I am <a href="http://eie.io">one <u>with<i> the</i></u> Force</a>.</p>',
                [
                    TextSegment(
                        "one with the Force",
                        {
                            "emphasized_text_contents": ["the"],
                            "emphasized_text_tags": ["i"],
                            "link_texts": ["one with the Force"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(".", {}),
                ],
            ),
        ],
    )
    def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        assert list(a.iter_text_segments()) == expected_value


# -- DEFAULT ELEMENT -----------------------------------------------------------------------------


class DescribeDefaultElement:
    """Isolated unit-test suite for `unstructured.partition.html.parser.DefaultElement`.

    Used for any element we haven't assigned a custom element-class too. This prominently includes
    any non-HTML elements that can be embedded in the HTML.

    It identifies as a block item but it can behave as either a block-item or phrasing. Its behavior
    is a combination of RemovedBlock and RemovedPhrasing. Namely, it iterates zero elements and only
    iterates a text-segment for its tail.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_identifies_as_a_phrasing_element(self):
        foobar = etree.fromstring("<foobar>Vogon</foobar>", html_parser).xpath(".//foobar")[0]

        assert isinstance(foobar, DefaultElement)
        assert foobar.is_phrasing is True

    # -- .iter_elements() -------------------------------------------------

    def it_generates_zero_elements_as_a_block_item(self):
        """Should never be called but belts and suspenders."""
        foobar = etree.fromstring(
            "<foobar>Space<p>is big</p>, <b>mind-bogglingly</b> big.</foobar>",
            html_parser,
        ).xpath(".//foobar")[0]

        elements = foobar.iter_elements()

        with pytest.raises(StopIteration):
            next(elements)

    # -- .iter_text_segments() --------------------------------------------

    def it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing(self):
        foobar = etree.fromstring(
            "<div>\n"
            "  O Deep Thought computer, he said,\n"
            "  <foobar>Vogon Constructor Fleet</foobar>\n"
            "  The task we have designed you to perform is this.\n"
            "  <p>We want you to tell us.... he paused,</p>\n"
            "</div>",
            html_parser,
        ).xpath(".//foobar")[0]

        texts = [ts.text for ts in foobar.iter_text_segments()]

        assert texts == ["\n  The task we have designed you to perform is this.\n  "]

    def and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element(self):
        div = etree.fromstring(
            "<div>\n"
            "  O Deep Thought computer, he said,\n"
            "  <foobar>Vogon Constructor Fleet</foobar>\n"
            "  The task we have designed you to perform is this.\n"
            "  <p>We want you to tell us.... he paused,</p>\n"
            "</div>",
            html_parser,
        ).xpath(".//div")[0]

        texts = [e.text for e in div.iter_elements()]

        assert texts == [
            "O Deep Thought computer, he said, The task we have designed you to perform is this.",
            "We want you to tell us.... he paused,",
        ]