# unstructured/test_unstructured/partition/html/test_parser.py

# pyright: reportPrivateUsage=false
# pyright: reportUnknownArgumentType=false

"""Test suite for `unstructured.partition.html.parser` module."""

from __future__ import annotations

from collections import deque

import pytest
from lxml import etree

from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title
from unstructured.partition.html.parser import (
    Annotation,
    DefaultElement,
    Flow,
    Phrasing,
    RemovedPhrasing,
    TextSegment,
    _consolidate_annotations,
    _ElementAccumulator,
    _normalize_text,
    _PhraseAccumulator,
    _PreElementAccumulator,
    html_parser,
)

# -- MODULE-LEVEL FUNCTIONS ----------------------------------------------------------------------

# -- _consolidate_annotations() ------------------


def it_consolidates_annotations_from_multiple_text_segments():
    """Per-segment annotations are merged into one mapping of key to list-of-values."""
    segment_annotations = [
        {
            "link_texts": "Ford Prefect",
            "link_url": "https://wikipedia/Ford_Prefect",
            "emphasized_text_contents": "Ford Prefect",
            "emphasized_text_tags": "b",
        },
        {
            "emphasized_text_contents": "alien encounter",
            "emphasized_text_tags": "bi",
        },
    ]

    consolidated = _consolidate_annotations(segment_annotations)

    # -- every distinct key maps to a list of values, including keys that occur only once --
    expected = {
        "emphasized_text_contents": ["Ford Prefect", "alien encounter"],
        "emphasized_text_tags": ["b", "bi"],
        "link_texts": ["Ford Prefect"],
        "link_url": ["https://wikipedia/Ford_Prefect"],
    }
    assert consolidated == expected
    # -- the consolidated mapping rejects item assignment --
    with pytest.raises(TypeError, match="object does not support item assignment"):
        consolidated["new_key"] = "foobar"  # pyright: ignore[reportIndexIssue]
    # -- its list values can still be mutated in place though --
    tags = consolidated["emphasized_text_tags"]
    tags.append("xyz")
    assert consolidated["emphasized_text_tags"] == ["b", "bi", "xyz"]


# -- _normalize_text() ---------------------------


@pytest.mark.parametrize(
    ("text", "expected_value"),
    [
        # -- already normalized text is left unchanged --
        ("iterators allow", "iterators allow"),
        # -- newlines are treated as whitespace --
        ("algorithm\nto   be", "algorithm to be"),
        ("  separated\n  from  ", "separated from"),
        ("\n container\n details\n ", "container details"),
        (
            "\n  iterators  allow \n algorithm to be   \nexpressed  without container  \nnoise",
            "iterators allow algorithm to be expressed without container noise",
        ),
    ],
)
def test_normalize_text_produces_normalized_text(text: str, expected_value: str):
    """Runs of spaces and newlines collapse to a single space and the ends are trimmed."""
    normalized = _normalize_text(text)

    assert normalized == expected_value


# -- PHRASING ACCUMULATORS -----------------------------------------------------------------------


class Describe_PhraseAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._PhraseAccumulator`."""

    def it_is_empty_on_construction(self):
        """Flushing a freshly constructed accumulator generates nothing."""
        phrases = _PhraseAccumulator().flush()

        with pytest.raises(StopIteration):
            next(phrases)

    # -- .add() -----------------------------------------------------------

    def it_accumulates_text_segments(self):
        """Added text-segments come back from `.flush()` as one phrase, in the order added."""
        accumulator = _PhraseAccumulator()
        for text in ("Ford... you're turning ", "into a penguin."):
            accumulator.add(TextSegment(text, {}))

        phrases = accumulator.flush()

        assert next(phrases) == (
            TextSegment("Ford... you're turning ", {}),
            TextSegment("into a penguin.", {}),
        )
        # -- and that phrase is the only one generated --
        with pytest.raises(StopIteration):
            next(phrases)

    # -- .flush() ---------------------------------------------------------

    def it_generates_zero_phrases_on_flush_when_empty(self):
        """`.flush()` on an accumulator that received no text-segments is exhausted at once."""
        accumulator = _PhraseAccumulator()

        phrases = accumulator.flush()

        with pytest.raises(StopIteration):
            next(phrases)


class Describe_ElementAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._ElementAccumulator`."""

    def it_is_empty_on_construction(self, html_element: etree.ElementBase):
        """Flushing a freshly constructed accumulator generates no elements."""
        elements = _ElementAccumulator(html_element).flush(None)

        with pytest.raises(StopIteration):
            next(elements)

    # -- .add() -----------------------------------------------------------

    def it_accumulates_text_segments(self, html_element: etree.ElementBase):
        """Text-segments added one at a time are combined into a single element on flush."""
        accumulator = _ElementAccumulator(html_element)
        for text in ("Ford... you're turning ", "into a penguin."):
            accumulator.add(TextSegment(text, {}))

        elements = accumulator.flush(None)

        assert next(elements) == NarrativeText("Ford... you're turning into a penguin.")
        # -- and that element is the only one generated --
        with pytest.raises(StopIteration):
            next(elements)

    # -- .flush() ---------------------------------------------------------

    def it_generates_zero_elements_when_empty(self, html_element: etree.ElementBase):
        """`.flush()` with no accumulated text-segments is exhausted immediately."""
        accumulator = _ElementAccumulator(html_element)

        elements = accumulator.flush(None)

        with pytest.raises(StopIteration):
            next(elements)

    def and_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_only(
        self, html_element: etree.ElementBase
    ):
        """Accumulated text consisting solely of whitespace produces no element."""
        accumulator = _ElementAccumulator(html_element)
        for whitespace in (" \n   \t \n", "   \n"):
            accumulator.add(TextSegment(whitespace, {}))

        with pytest.raises(StopIteration):
            next(accumulator.flush(None))

    def and_it_generates_zero_elements_when_there_is_only_one_non_whitespace_character(
        self, html_element: etree.ElementBase
    ):
        """A lone non-whitespace character among whitespace produces no element either."""
        accumulator = _ElementAccumulator(html_element)
        for text in (" \n   \t \n", " X \n"):
            accumulator.add(TextSegment(text, {}))

        with pytest.raises(StopIteration):
            next(accumulator.flush(None))

    def it_normalizes_the_text_of_its_text_segments_on_flush(self, html_element: etree.ElementBase):
        """The generated element carries whitespace-normalized text."""
        accumulator = _ElementAccumulator(html_element)
        for text in (" \n  Ford...   you're \t turning\n", "into a   penguin.\n"):
            accumulator.add(TextSegment(text, {}))

        [element] = accumulator.flush(None)

        assert element.text == "Ford... you're turning into a penguin."

    def it_creates_a_document_element_of_the_specified_type(self, html_element: etree.ElementBase):
        """The element class passed to `.flush()` determines the type of the element."""
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment("Ford... you're turning into a penguin.", {}))

        [element] = accumulator.flush(ListItem)

        assert element == ListItem("Ford... you're turning into a penguin.")

    def but_it_derives_the_element_type_from_the_text_when_none_is_specified(
        self, html_element: etree.ElementBase
    ):
        """With `None` as the element class, the same text yields a `NarrativeText` element."""
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment("Ford... you're turning into a penguin.", {}))

        [element] = accumulator.flush(None)

        assert element == NarrativeText("Ford... you're turning into a penguin.")

    def it_removes_an_explicit_leading_bullet_character_from_a_list_item(
        self, html_element: etree.ElementBase
    ):
        """Text starting with "* " becomes a `ListItem` whose text omits the bullet."""
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment("* turning into a penguin", {}))

        [element] = accumulator.flush(None)

        assert element == ListItem("turning into a penguin")

    def it_applies_category_depth_metadata(self):
        """A `Title` flushed from an `<h3>` element gets `category_depth` 2 in its metadata."""
        root = etree.fromstring("<h3>About fish</h3>", html_parser)
        h3 = root.xpath(".//h3")[0]
        accumulator = _ElementAccumulator(h3)
        accumulator.add(TextSegment("Thanks for all those!", {}))

        [element] = accumulator.flush(Title)

        # -- `element_id` is dropped before comparing --
        element_dict = element.to_dict()
        del element_dict["element_id"]
        assert element_dict == {
            "metadata": {"category_depth": 2},
            "text": "Thanks for all those!",
            "type": "Title",
        }

    def and_it_consolidates_annotations_into_metadata(self, html_element: etree.ElementBase):
        """Annotations of the individual text-segments are gathered into element metadata."""
        accumulator = _ElementAccumulator(html_element)
        bold_annotation = {
            "emphasized_text_contents": "Ford",
            "emphasized_text_tags": "b",
        }
        italic_annotation = {
            "emphasized_text_contents": "penguin",
            "emphasized_text_tags": "i",
        }
        accumulator.add(TextSegment("\n    Ford...", bold_annotation))
        accumulator.add(TextSegment(" you're turning into a ", {}))
        accumulator.add(TextSegment("penguin", italic_annotation))
        accumulator.add(TextSegment(".\n", {}))

        [element] = accumulator.flush(NarrativeText)

        # -- `element_id` is dropped before comparing --
        element_dict = element.to_dict()
        del element_dict["element_id"]
        assert element_dict == {
            "metadata": {
                "emphasized_text_contents": ["Ford", "penguin"],
                "emphasized_text_tags": ["b", "i"],
            },
            "text": "Ford... you're turning into a penguin.",
            "type": "NarrativeText",
        }

    # -- ._category_depth() -----------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "tag", "ElementCls", "expected_value"),
        [
            ("<p>Ford... you're turning into a penguin. Stop it.<p>", "p", Text, None),
            ("<p>* thanks for all the fish.</p>", "p", ListItem, 0),
            ("<li>thanks for all the fish.</li>", "li", ListItem, 0),
            ("<ul><li>So long</li><li>and thanks for all the fish.</li></ul>", "li", ListItem, 1),
            ("<dl><dd>So long<ol><li>and thanks for the fish.</li></ol></ul>", "li", ListItem, 2),
            ("<p>Examples</p>", "p", Title, 0),
            ("<h1>Examples</h1>", "h1", Title, 0),
            ("<h2>Examples</h2>", "h2", Title, 1),
            ("<h3>Examples</h3>", "h3", Title, 2),
            ("<h4>Examples</h4>", "h4", Title, 3),
            ("<h5>Examples</h5>", "h5", Title, 4),
            ("<h6>Examples</h6>", "h6", Title, 5),
        ],
    )
    def it_computes_the_category_depth_to_help(
        self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None
    ):
        """Category depth is computed for the first `tag` element and the given element class."""
        root = etree.fromstring(html_text, html_parser)
        target = root.xpath(f".//{tag}")[0]

        category_depth = _ElementAccumulator(target)._category_depth(ElementCls)

        assert category_depth == expected_value

    # -- ._normalized_text ------------------------------------------------

    def it_computes_the_normalized_text_of_its_text_segments_to_help(
        self, html_element: etree.ElementBase
    ):
        """`._normalized_text` is the whitespace-normalized text of the accumulated segments."""
        accumulator = _ElementAccumulator(html_element)
        for text in (" \n  Ford...   you're \t turning\n", "into a   penguin.\n"):
            accumulator.add(TextSegment(text, {}))

        assert accumulator._normalized_text == "Ford... you're turning into a penguin."

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def html_element(self) -> etree.ElementBase:
        """An empty `<p>` element parsed using `html_parser`."""
        root = etree.fromstring("<p/>", html_parser)
        return root.xpath(".//p")[0]


class Describe_PreElementAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._PreElementAccumulator`."""

    def it_computes_the_normalized_text_of_its_text_segments_to_help(self):
        """Interior whitespace is kept; only one newline is dropped from each end."""
        root = etree.fromstring("<p/>", html_parser)
        accumulator = _PreElementAccumulator(root.xpath(".//p")[0])
        segment_texts = (
            "\n\n",
            "    The panel lit up\n",
            "    with the words 'Please do not press\n",
            "    this button again'\n\n",
        )
        for text in segment_texts:
            accumulator.add(TextSegment(text, {}))

        normalized_text = accumulator._normalized_text

        # -- of the doubled newlines at each end, one survives --
        assert normalized_text == (
            "\n"
            "    The panel lit up\n"
            "    with the words 'Please do not press\n"
            "    this button again'\n"
        )


# -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------


class DescribeFlow:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Flow`.

    The `Flow` class provides most behaviors for flow (block-level) elements.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_knows_it_is_NOT_a_phrasing_element(self):
        """A `<p>` element is parsed as a `Flow` instance and reports `is_phrasing` False."""
        p = etree.fromstring("<p>Hello</p>", html_parser).xpath(".//p")[0]

        assert isinstance(p, Flow)
        assert p.is_phrasing is False

    # -- .iter_elements() -------------------------------------------------

    def it_generates_the_document_elements_from_the_Flow_element(self):
        """Phrasing siblings of child block elements are processed with text or tail.

        In the general case, a Flow element can contain text, phrasing content, and child flow
        elements.

        Each of these five lines in this example is a "paragraph" and gives rise to a distinct
        document-element.
        """
        html_text = """
          <div>
            Text of div <b>with <i>hierarchical</i>\nphrasing</b> content before first block item
            <p>Click <a href="http://blurb.io">here</a> to see the blurb for this block item. </p>
            tail of block item <b>with <i>hierarchical</i> phrasing </b> content
            <p>second block item</p>
            tail of block item <b>with <i>  hierarchical  </i></b> phrasing content
          </div>
        """
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div.iter_elements()

        # -- text of the div, up to the first block item --
        e = next(elements)
        assert e == Text("Text of div with hierarchical phrasing content before first block item")
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        # -- first block item --
        e = next(elements)
        assert e == NarrativeText("Click here to see the blurb for this block item.")
        assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
        # -- tail of first block item --
        e = next(elements)
        assert e == Text("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        # -- second block item --
        e = next(elements)
        assert e == Text("second block item")
        assert e.metadata.to_dict() == {}
        # -- tail of second block item --
        e = next(elements)
        assert e == Text("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["with", "hierarchical"],
            "emphasized_text_tags": ["b", "bi"],
        }
        # -- and no further elements are generated; the `next()` result is not needed here --
        with pytest.raises(StopIteration):
            next(elements)

    # -- ._element_from_text_or_tail() ------------------------------------

    def it_assembles_text_and_tail_document_elements_to_help(self):
        """Text and tails and their phrasing content are both processed the same way."""
        html_text = "<div>The \n Roman <b>poet <i>   Virgil</i> gave</b> his <q>pet</q> fly</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        e = next(elements)
        # -- element text is normalized --
        assert e == Text("The Roman poet Virgil gave his pet fly")
        # -- individual annotations are consolidated --
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["poet", "Virgil", "gave"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }

    def but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained(self):
        """Whitespace-only text, phrasing, and tails together produce no element."""
        html_text = "<div>   <b> \n <i>  \n </i>  </b>   <q> \n </q> \n  </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        with pytest.raises(StopIteration):
            next(elements)

    def it_uses_the_specified_element_class_to_form_the_document_element(self):
        """The element class passed as the third argument is the type of the generated element."""
        html_text = "<div>\n  The line-storm clouds fly tattered and swift\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Address)

        e = next(elements)
        assert e == Address("The line-storm clouds fly tattered and swift")
        assert e.metadata.to_dict() == {}
        with pytest.raises(StopIteration):
            next(elements)

    def and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified(self):
        """With no element class given, this text yields a `NarrativeText` element."""
        html_text = "<div>\n  The line-storm clouds fly tattered and swift,\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        assert next(elements) == NarrativeText("The line-storm clouds fly tattered and swift,")

    def but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained(self):
        """A div containing nothing but a "*" bullet produces no element."""
        html_text = "<div> * </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        with pytest.raises(StopIteration):
            next(elements)

    # -- ._iter_text_segments() -------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            (  # -- text with no phrasing --
                "<p>Ford... you're turning into a penguin.<p>",
                [("Ford... you're turning into a penguin.", {})],
            ),
            (  # -- text with phrasing --
                "<p>Ford... <b>you're turning</b> into\na <i>penguin</i>.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're turning",
                        {"emphasized_text_contents": "you're turning", "emphasized_text_tags": "b"},
                    ),
                    (" into\na ", {}),
                    (
                        "penguin",
                        {"emphasized_text_contents": "penguin", "emphasized_text_tags": "i"},
                    ),
                    (".", {}),
                ],
            ),
            (  # -- text with nested phrasing --
                "<p>Ford... <b>you're <i>turning</i></b> into a penguin.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're ",
                        {"emphasized_text_contents": "you're", "emphasized_text_tags": "b"},
                    ),
                    (
                        "turning",
                        {"emphasized_text_contents": "turning", "emphasized_text_tags": "bi"},
                    ),
                    (" into a penguin.", {}),
                ],
            ),
        ],
    )
    def it_recursively_generates_text_segments_from_text_and_phrasing_to_help(
        self, html_text: str, expected_value: list[tuple[str, Annotation]]
    ):
        """Each run of text, including text nested in phrasing, becomes its own text-segment.

        `expected_value` is spelled as plain `(text, annotation)` pairs which are compared to
        the generated text-segments.
        """
        p = etree.fromstring(html_text, html_parser).xpath(".//p")[0]
        text_segments = list(p._iter_text_segments(p.text, deque(p)))

        assert text_segments == expected_value


class DescribePre:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Pre`.

    The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
    """

    def it_preserves_the_whitespace_of_its_phrasing_only_contents(self):
        """A `<pre>` element can contain only phrasing content."""
        html_text = (
            "<pre>\n"
            "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
            "  Is... Forty-two, said Deep Thought, with infinite majesty and calm.\n"
            "</pre>\n"
        )
        root = etree.fromstring(html_text, html_parser)
        pre = root.xpath(".//pre")[0]

        elements = pre.iter_elements()

        # -- indentation, interior spacing and the interior newline all survive --
        element = next(elements)
        assert element == Text(
            "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
            "  Is... Forty-two, said Deep Thought, with infinite majesty and calm."
        )
        # -- and only that one element is generated --
        with pytest.raises(StopIteration):
            next(elements)

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- a newline in the 0th position of pre.text is dropped --
            ("<pre>\n  foo  </pre>", "  foo  "),
            # -- but not when preceded by any other whitespace --
            ("<pre> \n  foo  </pre>", " \n  foo  "),
            # -- and only one is dropped --
            ("<pre>\n\n  foo  </pre>", "\n  foo  "),
            # -- a newline in the -1th position is dropped --
            ("<pre>  foo  \n</pre>", "  foo  "),
            # -- but not when followed by any other whitespace --
            ("<pre>  foo  \n </pre>", "  foo  \n "),
            # -- and only one is dropped --
            ("<pre>  foo  \n\n</pre>", "  foo  \n"),
            # -- a newline in both positions are both dropped --
            ("<pre>\n  foo  \n</pre>", "  foo  "),
            # -- or not when not at the absolute edge --
            ("<pre> \n  foo  \n </pre>", " \n  foo  \n "),
        ],
    )
    def but_it_strips_a_single_leading_or_trailing_newline(
        self, html_text: str, expected_value: str
    ):
        """Content starts on the next line when a newline directly follows the `<pre>` tag."""
        root = etree.fromstring(html_text, html_parser)
        pre = root.xpath(".//pre")[0]

        first_element = next(pre.iter_elements())

        assert first_element.text == expected_value

    def it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elements(self):
        """`<b>` and `<a>` inside `<pre>` contribute emphasis and link metadata."""
        html_text = '<pre>You\'re <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>'
        root = etree.fromstring(html_text, html_parser)
        pre = root.xpath(".//pre")[0]

        element = next(pre.iter_elements())

        assert element.text == "You're turning into a penguin."
        metadata = element.metadata
        assert metadata.emphasized_text_contents == ["turning"]
        assert metadata.emphasized_text_tags == ["b"]
        assert metadata.link_texts == ["penguin"]
        assert metadata.link_urls == ["http://eie.io"]


class DescribeRemovedBlock:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.

    This class is used for block level items we want to skip like `<hr/>` and `<figure>`.
    """

    def it_is_skipped_during_parsing(self):
        """Only the `<p>` sibling of the `<hr/>` and `<figure>` elements yields an element."""
        html_text = """
          <div>
            <hr/>
            <figure>
              <img src="/media/cc0-images/elephant-660-480.jpg" alt="Elephant at sunset" />
              <figcaption>An elephant at sunset</figcaption>
            </figure>
            <p>Content we want.</p>
          </div>
          """
        root = etree.fromstring(html_text, html_parser)
        div = root.xpath(".//div")[0]

        elements = list(div.iter_elements())

        # -- neither the `<hr/>` nor anything inside the `<figure>` contributes an element --
        assert elements == [NarrativeText("Content we want.")]


# -- PHRASING (INLINE) ELEMENTS ------------------------------------------------------------------


class DescribePhrasing:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Phrasing`.

    The `Phrasing` class provides most behaviors for phrasing (inline) elements.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_knows_it_is_a_phrasing_element(self):
        """A `<b>` element is parsed as a `Phrasing` instance and reports `is_phrasing` True."""
        root = etree.fromstring("<b>Hello</b>", html_parser)
        bold = root.xpath(".//b")[0]

        assert isinstance(bold, Phrasing)
        assert bold.is_phrasing is True

    # -- .iter_text_segments() --------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- an empty element produces no text segments --
            ("<code></code>", []),
            # -- element text produces one segment --
            ("<data> foo </data>", [(" foo ", {})]),
            # -- element tail produces one segment --
            ("<dfn/> bar ", [(" bar ", {})]),
            # -- element descendants each produce one segment --
            ("<kbd><mark>foo <meter>bar</meter></mark></kbd>", [("foo ", {}), ("bar", {})]),
            # -- and any combination produces a segment for each text, child, and tail --
            (
                "<kbd> <mark>foo <meter>bar</meter> baz</mark> </kbd>",
                [
                    (" ", {}),
                    ("foo ", {}),
                    ("bar", {}),
                    (" baz", {}),
                    (" ", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_for_its_text_and_children_and_tail(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        """The first child of `<body>` is the phrasing element under test."""
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        text_segments = list(phrasing.iter_text_segments())

        assert text_segments == expected_value

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- Phrasing with nested block but no text or tail produces only element for block --
            ("<strong><p>aaa</p></strong>", [Text("aaa")]),
            # -- Phrasing with text produces annotated text-segment for the text --
            (
                "<strong>aaa<p>bbb</p></strong>",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    Text("bbb"),
                ],
            ),
            # -- Phrasing with tail produces annotated text-segment for the tail --
            (
                "<strong><p>aaa</p>bbb</strong>",
                [
                    Text("aaa"),
                    TextSegment(
                        "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
            # -- Phrasing with text, nested block, and tail produces all three --
            (
                "<strong>aaa<p>bbb</p>ccc</strong>",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    Text("bbb"),
                    TextSegment(
                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
        ],
    )
    def but_it_can_also_generate_an_element_when_it_has_a_nested_block_element(
        self, html_text: str, expected_value: list[TextSegment | Element]
    ):
        """Document elements for nested blocks appear interleaved with the text-segments."""
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        generated = list(phrasing.iter_text_segments())

        assert generated == expected_value

    # -- ._annotation() ---------------------------------------------------

    def it_forms_its_annotations_from_emphasis(self):
        """The annotation holds the stripped text and the emphasis characters."""
        cite = etree.fromstring("<cite/>", html_parser).xpath(".//cite")[0]

        annotation = cite._annotation("\n  foobar\n  ", "bi")

        assert annotation == {
            "emphasized_text_contents": "foobar",
            "emphasized_text_tags": "bi",
        }

    @pytest.mark.parametrize("text", ["", "\n  \t  "])
    def but_not_when_text_is_empty_or_whitespace(self, text: str):
        """Empty or whitespace-only text gives an empty annotation even with emphasis."""
        cite = etree.fromstring("<cite/>", html_parser).xpath(".//cite")[0]

        annotation = cite._annotation(text, "bi")

        assert annotation == {}

    def and_not_when_there_is_no_emphasis(self):
        """Non-empty text with an empty emphasis string gives an empty annotation."""
        cite = etree.fromstring("<cite/>", html_parser).xpath(".//cite")[0]

        annotation = cite._annotation("foobar", "")

        assert annotation == {}

    # -- ._inside_emphasis() ----------------------------------------------

    @pytest.mark.parametrize("enclosing_emphasis", ["", "b", "bi"])
    def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis(
        self, enclosing_emphasis: str
    ):
        """Inside emphasis is applied to text inside the phrasing element (but not its tail).

        The `._inside_emphasis()` method is overridden by Bold and Italic classes which add their
        specific emphasis characters.
        """
        abbr = etree.fromstring("<abbr/>", html_parser).xpath(".//abbr")[0]

        inside_emphasis = abbr._inside_emphasis(enclosing_emphasis)

        assert inside_emphasis == enclosing_emphasis

    # -- ._iter_child_text_segments() -------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- a phrasing element with no children produces no text segments
            # -- (element text is handled elsewhere)
            ("<abbr>aaa</abbr>", []),
            # -- child phrasing element produces text-segment for its text --
            ("<bdi>x<bdo>bbb</bdo></bdi>", [TextSegment("bbb", {})]),
            # -- and also for its tail when it has one --
            ("<bdi>x<bdo>bbb</bdo>ccc</bdi>", [TextSegment("bbb", {}), TextSegment("ccc", {})]),
            # -- nested phrasing recursively each produce a segment for text and tail, in order --
            (
                "<big>xxx<cite>aaa<code>bbb<data>ccc</data>ddd</code>eee</cite>fff</big>",
                [
                    TextSegment("aaa", {}),
                    TextSegment("bbb", {}),
                    TextSegment("ccc", {}),
                    TextSegment("ddd", {}),
                    TextSegment("eee", {}),
                    TextSegment("fff", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_for_its_children_and_their_tails(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        """Child text-segments are generated with no (empty-string) inside emphasis."""
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        child_segments = list(phrasing._iter_child_text_segments(""))

        assert child_segments == expected_value

    @pytest.mark.parametrize(
        ("html_text", "inside_emphasis", "expected_value"),
        [
            # -- a phrasing element with no block children produces no elements --
            ("<dfn></dfn>", "", []),
            # -- a child block element produces an element --
            ("<kbd><p>aaa</p></kbd>", "", [Text("aaa")]),
            # -- a child block element with a tail also produces a text-segment for the tail --
            ("<kbd><p>aaa</p>bbb</kbd>", "", [Text("aaa"), TextSegment("bbb", {})]),
            # -- and also text-segments for phrasing following the tail --
            (
                "<kbd><p>aaa</p>bbb<mark>ccc</mark>ddd</kbd>",
                "",
                [
                    Text("aaa"),
                    TextSegment("bbb", {}),
                    TextSegment("ccc", {}),
                    TextSegment("ddd", {}),
                ],
            ),
            # -- and emphasis is applied before and after block-item --
            (
                "<strong><q>aaa</q><p>bbb</p>ccc<s>ddd</s>eee</strong>",
                "b",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    Text("bbb"),
                    TextSegment(
                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
        ],
    )
    def and_it_generates_elements_for_its_block_children(
        self, html_text: str, inside_emphasis: str, expected_value: list[TextSegment | Element]
    ):
        """Block children yield document elements among the text-segments of their siblings."""
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        generated = list(phrasing._iter_child_text_segments(inside_emphasis))

        assert generated == expected_value

    # -- ._iter_text_segments_from_block_tail_and_phrasing() --------------

    @pytest.mark.parametrize(
        ("html_text", "emphasis", "expected_value"),
        [
            # -- no tail and no contiguous phrasing produces no text-segments --
            ("<cite><p/></cite>", "", []),
            # -- tail produces a text-segment --
            ("<cite><p/>aaa</cite>", "", [TextSegment("aaa", {})]),
            # -- contiguous phrasing produces a text-segment --
            ("<cite><p/><s>aaa</s></cite>", "", [TextSegment("aaa", {})]),
            # -- tail of contiguous phrasing also produces a text-segment --
            ("<bdi><p/><s>aaa</s>bbb</bdi>", "", [TextSegment("aaa", {}), TextSegment("bbb", {})]),
            # -- nested phrasing produces a text-segment --
            (
                "<sub><p/>aaa<s>bbb<q>ccc</q>ddd</s>eee</sub>",
                "",
                [
                    TextSegment("aaa", {}),
                    TextSegment("bbb", {}),
                    TextSegment("ccc", {}),
                    TextSegment("ddd", {}),
                    TextSegment("eee", {}),
                ],
            ),
            # -- and emphasis is added to each text-segment when specified --
            (
                "<strong><p/>aaa<s>bbb<i>ccc</i>ddd</s>eee</strong>",
                "b",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "bi"}
                    ),
                    TextSegment(
                        "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
            # -- a block item nested in contiguous phrasing produces an Element --
            (
                "<cite><p/>aaa<abbr>bbb<p>ccc</p>ddd</abbr>eee</cite>",
                "",
                [
                    TextSegment("aaa", {}),
                    TextSegment("bbb", {}),
                    Text("ccc"),
                    TextSegment("ddd", {}),
                    TextSegment("eee", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_from_the_tail_and_contiguous_phrasing(
        self, html_text: str, emphasis: str, expected_value: list[TextSegment | Element]
    ):
        """The method gets the tail of the leading `<p>` and a deque of its later siblings."""
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]
        block = phrasing.xpath("./p")[0]
        # -- a missing tail is passed as the empty string --
        block_tail = block.tail or ""
        # -- every child after the first, which is the `<p>` in each case above --
        following_siblings = deque(phrasing[1:])

        generated = list(
            phrasing._iter_text_segments_from_block_tail_and_phrasing(
                block_tail, following_siblings, emphasis
            )
        )

        assert generated == expected_value


class DescribeAnchor:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Anchor`.

    `Anchor` is the class used for `<a>` tags; the tests below exercise the link metadata
    (`link_texts` and `link_urls`) it attaches to the text-segments and elements it generates.
    """

    # -- .iter_text_segments() --------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "emphasis", "expected_value"),
        [
            # -- an anchor with neither text nor tail generates nothing at all --
            ('<a href="http://abc.com"></a>', "", []),
            # -- a tail on its own still gets an (un-annotated) text-segment --
            ('<a href="http://abc.com"></a> long tail ', "", [TextSegment(" long tail ", {})]),
            # -- whitespace-only anchor text gets a text-segment but no link annotation --
            ('<a href="http://abc.com">  </a>', "", [TextSegment("  ", {})]),
            # -- anchor text gets a link-annotated text-segment. The `link_texts` value is
            # -- whitespace-normalized while the text of the text-segment itself is not.
            (
                '<a href="http://abc.com"> click here </a>',
                "",
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    )
                ],
            ),
            # -- when both text and tail are present, each gets its own text-segment --
            (
                '<a href="http://abc.com"> click here </a> long tail',
                "",
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    ),
                    TextSegment(" long tail", {}),
                ],
            ),
            # -- phrasing nested inside the <a> is folded into the single link text-segment --
            (
                '<p>I am <a href="http://eie.io">one <u>with<i> the</i></u> Force</a>.</p>',
                "",
                [
                    TextSegment(
                        "one with the Force",
                        {
                            "emphasized_text_contents": ["the"],
                            "emphasized_text_tags": ["i"],
                            "link_texts": ["one with the Force"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(".", {}),
                ],
            ),
            # -- the enclosing emphasis reaches every segment, tail included --
            (
                '<p>I am <strong><a href="http://eie.io">one with</a> the Force.</strong></p>',
                "b",
                [
                    TextSegment(
                        "one with",
                        {
                            "emphasized_text_contents": ["one with"],
                            "emphasized_text_tags": ["b"],
                            "link_texts": ["one with"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(
                        " the Force.",
                        {
                            "emphasized_text_contents": "the Force.",
                            "emphasized_text_tags": "b",
                        },
                    ),
                ],
            ),
        ],
    )
    def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment(
        self, html_text: str, emphasis: str, expected_value: list[TextSegment]
    ):
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        text_segments = list(anchor.iter_text_segments(emphasis))

        assert text_segments == expected_value

    def it_generates_enclosed_block_items_as_separate_elements(self):
        root = etree.fromstring(
            """<a href="http://eie.io">I am <p>one with</p> the Force.</a>""", html_parser
        )
        anchor = root.xpath(".//a")[0]

        generated = list(anchor.iter_text_segments("b"))

        assert generated == [
            # -- the phrase ahead of the block item is link-annotated --
            TextSegment(
                "I am ",
                {
                    "emphasized_text_contents": ["I am"],
                    "emphasized_text_tags": ["b"],
                    "link_texts": ["I am"],
                    "link_urls": ["http://eie.io"],
                },
            ),
            # -- the nested <p> comes out as an element of its own --
            Text("one with"),
            # -- the phrase after the block item carries emphasis only --
            TextSegment(
                " the Force.",
                {
                    "emphasized_text_contents": "the Force.",
                    "emphasized_text_tags": "b",
                },
            ),
        ]

    def and_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_first(
        self,
    ):
        root = etree.fromstring(
            """<a href="http://eie.io"> \n <p>I am one with</p> the Force.</a>""", html_parser
        )
        anchor = root.xpath(".//a")[0]

        generated = list(anchor.iter_text_segments("i"))

        assert generated == [
            TextSegment(" \n ", {}),
            NarrativeText("I am one with"),
            TextSegment(
                " the Force.",
                {
                    "emphasized_text_contents": "the Force.",
                    "emphasized_text_tags": "i",
                },
            ),
        ]
        # -- the link metadata lands on the block element since only whitespace preceded it --
        block_element = generated[1]
        assert block_element.metadata.link_texts == ["I am one with"]
        assert block_element.metadata.link_urls == ["http://eie.io"]

    # -- ._iter_phrases_and_elements() ------------------------------------

    def it_divides_the_anchor_contents_but_not_tail_into_phrases_and_elements(self):
        html_text = """
          <a href="http://eie.io">But always <p>see first.</p> Otherwise you </a> will only see
        """
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        phrases_and_elements = list(anchor._iter_phrases_and_elements(emphasis=""))

        # -- the " will only see" tail does not appear in the results --
        assert phrases_and_elements == [
            (TextSegment("But always ", {}),),
            NarrativeText("see first."),
            (TextSegment(" Otherwise you ", {}),),
        ]

    # -- ._iter_phrasing() ------------------------------------------------

    def it_generates_zero_items_when_both_text_and_q_are_empty(self):
        root = etree.fromstring('<a href="http://eie.io"></a>', html_parser)
        anchor = root.xpath(".//a")[0]

        # -- an iterator with nothing to generate is exhausted on the first `next()` --
        with pytest.raises(StopIteration):
            next(anchor._iter_phrasing(text="", q=deque(), emphasis=""))

    def it_generates_a_phrase_when_only_text_is_present(self):
        root = etree.fromstring(
            """<a href="http://eie.io">\n  But always see first.\n</a>""", html_parser
        )
        anchor = root.xpath(".//a")[0]

        phrasing = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert phrasing == [(TextSegment("\n  But always see first.\n", {}),)]

    def and_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_element(self):
        root = etree.fromstring(
            """<a href="http://eie.io">But always <b>see <i>first</i></b>. Otherwise</a>""",
            html_parser,
        )
        anchor = root.xpath(".//a")[0]

        phrasing = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        # -- a single phrase containing the text, the nested phrasing, and the nested tail --
        assert phrasing == [
            (
                TextSegment("But always ", {}),
                TextSegment(
                    "see ",
                    {
                        "emphasized_text_contents": "see",
                        "emphasized_text_tags": "b",
                    },
                ),
                TextSegment(
                    "first",
                    {
                        "emphasized_text_contents": "first",
                        "emphasized_text_tags": "bi",
                    },
                ),
                TextSegment(". Otherwise", {}),
            )
        ]

    def it_ends_the_phrase_at_the_end_of_the_element(self):
        root = etree.fromstring(
            """<a href="http://eie.io">But always see first.</a> Otherwise you will """,
            html_parser,
        )
        anchor = root.xpath(".//a")[0]

        phrasing = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert phrasing == [(TextSegment("But always see first.", {}),)]

    def but_it_ends_at_a_block_element_if_one_occurs_first(self):
        root = etree.fromstring(
            """<a href="http://eie.io">But always see first. <p>Otherwise you </p> </a>""",
            html_parser,
        )
        anchor = root.xpath(".//a")[0]

        phrasing = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert phrasing == [(TextSegment("But always see first. ", {}),)]

    def it_generates_an_element_for_a_block_item_nested_inside_phrasing(self):
        html_text = """
          <a href="http://eie.io">But <strong>always <p>see first.</p>Otherwise</strong> you </a>
        """
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        phrasing = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert phrasing == [
            # -- the phrase up to the nested block item --
            (
                TextSegment("But ", {}),
                TextSegment(
                    "always ",
                    {
                        "emphasized_text_contents": "always",
                        "emphasized_text_tags": "b",
                    },
                ),
            ),
            # -- the nested block item itself --
            NarrativeText("see first."),
            # -- and a new phrase for what follows it --
            (
                TextSegment(
                    "Otherwise",
                    {
                        "emphasized_text_contents": "Otherwise",
                        "emphasized_text_tags": "b",
                    },
                ),
                TextSegment(" you ", {}),
            ),
        ]

    # -- ._link_annotate_element() ----------------------------------------

    def it_adds_link_metadata_to_an_element_to_help(self):
        root = etree.fromstring('<a href="http://eie.io"></a>', html_parser)
        anchor = root.xpath(".//a")[0]
        text_element = Text("aaa")

        annotated = anchor._link_annotate_element(text_element)

        # -- the very same element object comes back, now carrying the link metadata --
        assert annotated is text_element
        assert annotated.metadata.link_texts == ["aaa"]
        assert annotated.metadata.link_urls == ["http://eie.io"]

    def and_it_preserves_any_existing_link_metadata_on_the_element(self):
        # -- nested anchors shouldn't be possible but easier to test than prove it can't happen --
        root = etree.fromstring('<a href="http://eie.io"></a>', html_parser)
        anchor = root.xpath(".//a")[0]
        text_element = Text("bbb")
        text_element.metadata.link_texts = ["abc"]
        text_element.metadata.link_urls = ["http://abc.com"]

        annotated = anchor._link_annotate_element(text_element)

        # -- the new link values follow the ones that were already there --
        assert annotated is text_element
        assert annotated.metadata.link_texts == ["abc", "bbb"]
        assert annotated.metadata.link_urls == ["http://abc.com", "http://eie.io"]

    def but_not_when_the_text_is_empty(self):
        root = etree.fromstring('<a href="http://eie.io"/>', html_parser)
        anchor = root.xpath(".//a")[0]
        empty_element = Text("")

        annotated = anchor._link_annotate_element(empty_element)

        assert annotated is empty_element
        assert annotated.metadata.link_texts is None
        assert annotated.metadata.link_urls is None

    def and_not_when_there_is_no_url(self):
        root = etree.fromstring("<a/>", html_parser)
        anchor = root.xpath(".//a")[0]
        text_element = Text("zzz")

        annotated = anchor._link_annotate_element(text_element)

        assert annotated is text_element
        assert annotated.metadata.link_texts is None
        assert annotated.metadata.link_urls is None

    # -- ._link_text_segment() --------------------------------------------

    def it_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_help(self):
        root = etree.fromstring('<a href="http://eie.io"></a>', html_parser)
        anchor = root.xpath(".//a")[0]
        # -- one segment with list-valued annotations and one with scalar-valued ones --
        leading_segment = TextSegment(
            "Otherwise you will only ",
            {
                "emphasized_text_contents": ["Otherwise"],
                "emphasized_text_tags": ["i"],
            },
        )
        trailing_segment = TextSegment(
            "see what you were expecting.\n",
            {
                "emphasized_text_contents": "expecting",
                "emphasized_text_tags": "b",
            },
        )
        phrase = (leading_segment, trailing_segment)

        consolidated = anchor._link_text_segment(phrase)

        assert consolidated == TextSegment(
            "Otherwise you will only see what you were expecting.\n",
            {
                "emphasized_text_contents": ["Otherwise", "expecting"],
                "emphasized_text_tags": ["i", "b"],
                "link_texts": ["Otherwise you will only see what you were expecting."],
                "link_urls": ["http://eie.io"],
            },
        )

    @pytest.mark.parametrize("text", ["", " \n \t "])
    def but_not_when_the_text_is_empty_or_whitespace_only(self, text: str):
        root = etree.fromstring('<a href="http://eie.io"></a>', html_parser)
        anchor = root.xpath(".//a")[0]
        # -- three segments, each with the same blank text and its own empty annotation --
        phrase = tuple(TextSegment(text, {}) for _ in range(3))

        assert anchor._link_text_segment(phrase) is None

    def and_not_when_the_anchor_has_no_href_url(self):
        root = etree.fromstring("<a>foobar</a>", html_parser)
        anchor = root.xpath(".//a")[0]
        phrase = (TextSegment("Otherwise", {}), TextSegment(" you will", {}))

        assert anchor._link_text_segment(phrase) is None


class DescribeBold:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Bold`.

    `Bold` backs the `<b>` and `<strong>` tags; these tests check the emphasis metadata it adds.
    """

    def it_annotates_its_text_segment_with_bold_emphasis(self):
        root = etree.fromstring("<b>rhombus</b>", html_parser)
        bold = root.xpath(".//b")[0]

        segments = bold.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "rhombus",
            {"emphasized_text_contents": "rhombus", "emphasized_text_tags": "b"},
        )

    def and_its_children_are_also_annotated_with_bold_emphasis(self):
        root = etree.fromstring("<b>rhombus <i>pentagon</i></b>", html_parser)
        bold = root.xpath(".//b")[0]

        segments = bold.iter_text_segments()

        # -- first the element's own text, emphasized "b" --
        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "rhombus ",
            {"emphasized_text_contents": "rhombus", "emphasized_text_tags": "b"},
        )
        # -- then the nested <i>, which is emphasized both bold and italic --
        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "pentagon",
            {"emphasized_text_contents": "pentagon", "emphasized_text_tags": "bi"},
        )

    def but_not_its_tail(self):
        root = etree.fromstring("<b>rhombus</b> pentagon", html_parser)
        bold = root.xpath(".//b")[0]

        segments = bold.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "rhombus",
            {"emphasized_text_contents": "rhombus", "emphasized_text_tags": "b"},
        )
        # -- the tail lies outside the <b> element so it carries no annotation --
        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (" pentagon", {})


class DescribeItalic:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Italic`.

    `Italic` backs the `<i>` and `<em>` tags; these tests check the emphasis metadata it adds.
    """

    def it_annotates_its_text_segment_with_italic_emphasis(self):
        root = etree.fromstring("<i>rhombus</i>", html_parser)
        italic = root.xpath(".//i")[0]

        segments = italic.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "rhombus",
            {"emphasized_text_contents": "rhombus", "emphasized_text_tags": "i"},
        )

    def and_its_children_are_also_annotated_with_italic_emphasis(self):
        root = etree.fromstring("<em>rhombus <b>pentagon</b></em>", html_parser)
        italic = root.xpath(".//em")[0]

        segments = italic.iter_text_segments()

        # -- first the element's own text, emphasized "i" --
        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "rhombus ",
            {"emphasized_text_contents": "rhombus", "emphasized_text_tags": "i"},
        )
        # -- then the nested <b>; note the combined tags read "bi" even with <em> outermost --
        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "pentagon",
            {"emphasized_text_contents": "pentagon", "emphasized_text_tags": "bi"},
        )

    def but_not_its_tail(self):
        root = etree.fromstring("<i>rhombus</i> pentagon", html_parser)
        italic = root.xpath(".//i")[0]

        segments = italic.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (
            "rhombus",
            {"emphasized_text_contents": "rhombus", "emphasized_text_tags": "i"},
        )
        # -- the tail lies outside the <i> element so it carries no annotation --
        segment_text, segment_annotation = next(segments)
        assert (segment_text, segment_annotation) == (" pentagon", {})


class DescribeLineBreak:
    """Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`.

    Used for `<br/>` elements. Its only special behavior is to add whitespace so that phrasing
    butted up tight against both sides of the `<br/>` element is not joined together; for example
    `abc<br/>def` should become "abc def", not "abcdef".
    """

    def it_adds_a_newline_in_its_place(self):
        html_text = "<cite>spaceships of the<br/>Vogon Constructor Fleet</cite>"
        cite = etree.fromstring(html_text, html_parser).xpath(".//cite")[0]

        segment_texts = [segment.text for segment in cite.iter_text_segments()]

        # -- the <br/> shows up as a newline text-segment between its neighbors --
        assert segment_texts == ["spaceships of the", "\n", "Vogon Constructor Fleet"]
        # -- which normalizes to a single space once the texts are joined --
        joined_text = "".join(segment_texts)
        assert _normalize_text(joined_text) == "spaceships of the Vogon Constructor Fleet"


class DescribeRemovedPhrasing:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`.

    Used for phrasing elements like `<label>` that are to be skipped along with any content they
    enclose. The tail of such an element is still generated though.
    """

    def it_behaves_like_an_empty_element(self):
        html_text = (
            "<div>\n"
            "  <label>Space<p>is big</p>, <b>mind-bogglingly</b> big.</label>\n"
            "  Like vastly, hugely big.\n"
            "</div>"
        )
        label = etree.fromstring(html_text, html_parser).xpath(".//label")[0]

        # -- unpacking fails unless exactly one text-segment is generated --
        generated = list(label.iter_text_segments())
        (tail_segment,) = generated

        assert isinstance(label, RemovedPhrasing)
        assert label.is_phrasing is True
        # -- only the tail survives; nothing enclosed by the <label> appears --
        assert tail_segment.text == "\n  Like vastly, hugely big.\n"


# -- DEFAULT ELEMENT -----------------------------------------------------------------------------


class DescribeDefaultElement:
    """Isolated unit-test suite for `unstructured.partition.html.parser.DefaultElement`.

    Used for any element we haven't assigned a custom element-class to. This prominently includes
    any non-HTML elements that can be embedded in the HTML.

    It identifies as a phrasing element (`.is_phrasing` is True) but it can be called on as either
    a block-item or phrasing. Its behavior is a combination of RemovedBlock and RemovedPhrasing.
    Namely, it iterates zero elements and only iterates a text-segment for its tail.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_identifies_as_a_phrasing_element(self):
        root = etree.fromstring("<foobar>Vogon</foobar>", html_parser)
        foobar = root.xpath(".//foobar")[0]

        # -- an unrecognized tag like <foobar> gets the default element-class --
        assert isinstance(foobar, DefaultElement)
        assert foobar.is_phrasing is True

    # -- .iter_elements() -------------------------------------------------

    def it_generates_zero_elements_as_a_block_item(self):
        """Should never be called but belts and suspenders."""
        html_text = "<foobar>Space<p>is big</p>, <b>mind-bogglingly</b> big.</foobar>"
        foobar = etree.fromstring(html_text, html_parser).xpath(".//foobar")[0]

        elements = foobar.iter_elements()

        # -- neither the text nor the nested <p> and <b> produce an element --
        with pytest.raises(StopIteration):
            next(elements)

    # -- .iter_text_segments() --------------------------------------------

    def it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing(self):
        html_text = (
            "<div>\n"
            "  O Deep Thought computer, he said,\n"
            "  <foobar>Vogon Constructor Fleet</foobar>\n"
            "  The task we have designed you to perform is this.\n"
            "  <p>We want you to tell us.... he paused,</p>\n"
            "</div>"
        )
        foobar = etree.fromstring(html_text, html_parser).xpath(".//foobar")[0]

        segment_texts = [segment.text for segment in foobar.iter_text_segments()]

        # -- "Vogon Constructor Fleet" is dropped; only the tail text is generated --
        assert segment_texts == ["\n  The task we have designed you to perform is this.\n  "]

    def and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element(self):
        html_text = (
            "<div>\n"
            "  O Deep Thought computer, he said,\n"
            "  <foobar>Vogon Constructor Fleet</foobar>\n"
            "  The task we have designed you to perform is this.\n"
            "  <p>We want you to tell us.... he paused,</p>\n"
            "</div>"
        )
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        element_texts = [element.text for element in div.iter_elements()]

        # -- the text on either side of <foobar> is joined into one element as though the
        # -- <foobar> element were empty phrasing; the <p> still gets an element of its own.
        assert element_texts == [
            "O Deep Thought computer, he said, The task we have designed you to perform is this.",
            "We want you to tell us.... he paused,",
        ]