# pyright: reportPrivateUsage=false
# pyright: reportUnknownArgumentType=false

"""Test suite for `unstructured.partition.html.parser` module."""

from __future__ import annotations

from collections import deque

import pytest
from lxml import etree

from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title
from unstructured.partition.html.parser import (
    Annotation,
    DefaultElement,
    Flow,
    Phrasing,
    RemovedPhrasing,
    TextSegment,
    _consolidate_annotations,
    _ElementAccumulator,
    _normalize_text,
    _PhraseAccumulator,
    _PreElementAccumulator,
    html_parser,
)

# -- MODULE-LEVEL FUNCTIONS ----------------------------------------------------------------------


# -- _consolidate_annotations() ------------------


def it_consolidates_annotations_from_multiple_text_segments():
    annotations = [
        {
            "link_texts": "Ford Prefect",
            "link_url": "https://wikipedia/Ford_Prefect",
            "emphasized_text_contents": "Ford Prefect",
            "emphasized_text_tags": "b",
        },
        {
            "emphasized_text_contents": "alien encounter",
            "emphasized_text_tags": "bi",
        },
    ]

    annotations = _consolidate_annotations(annotations)

    assert annotations == {
        # -- each distinct key gets a list of values --
        "emphasized_text_contents": ["Ford Prefect", "alien encounter"],
        "emphasized_text_tags": ["b", "bi"],
        # -- even when there is only one value --
        "link_texts": ["Ford Prefect"],
        "link_url": ["https://wikipedia/Ford_Prefect"],
    }
    # -- and the annotations mapping is immutable --
    with pytest.raises(TypeError, match="object does not support item assignment"):
        annotations["new_key"] = "foobar"  # pyright: ignore[reportIndexIssue]
    # -- (but not its list values unfortunately) --
    annotations["emphasized_text_tags"].append("xyz")
    assert annotations["emphasized_text_tags"] == ["b", "bi", "xyz"]


# -- _normalize_text() ---------------------------


@pytest.mark.parametrize(
    ("text", "expected_value"),
    [
        # -- already normalized text is left unchanged --
        ("iterators allow", "iterators allow"),
        # -- newlines are treated as whitespace --
        ("algorithm\nto be", "algorithm to be"),
        (" separated\n from ", "separated from"),
        ("\n container\n details\n ", "container details"),
        (
            "\n iterators allow \n algorithm to be \nexpressed without container \nnoise",
            "iterators allow algorithm to be expressed without container noise",
        ),
    ],
)
def test_normalize_text_produces_normalized_text(text: str, expected_value: str):
    assert _normalize_text(text) == expected_value


# -- PHRASING ACCUMULATORS -----------------------------------------------------------------------


class Describe_PhraseAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._PhraseAccumulator`."""

    def it_is_empty_on_construction(self):
        accum = _PhraseAccumulator()

        phrase_iter = accum.flush()

        with pytest.raises(StopIteration):
            next(phrase_iter)

    # -- .add() -----------------------------------------------------------

    def it_accumulates_text_segments(self):
        accum = _PhraseAccumulator()
        accum.add(TextSegment("Ford... you're turning ", {}))
        accum.add(TextSegment("into a penguin.", {}))

        phrase_iter = accum.flush()

        phrase = next(phrase_iter)
        assert phrase == (
            TextSegment("Ford... you're turning ", {}),
            TextSegment("into a penguin.", {}),
        )
        with pytest.raises(StopIteration):
            next(phrase_iter)

    # -- .flush() ---------------------------------------------------------

    def it_generates_zero_phrases_on_flush_when_empty(self):
        accum = _PhraseAccumulator()

        phrase_iter = accum.flush()

        with pytest.raises(StopIteration):
            next(phrase_iter)


class Describe_ElementAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._ElementAccumulator`."""

    def it_is_empty_on_construction(self, html_element: etree.ElementBase):
        accum = _ElementAccumulator(html_element)

        element_iter = accum.flush(None)

        with pytest.raises(StopIteration):
            next(element_iter)

    # -- .add() -----------------------------------------------------------

    def it_accumulates_text_segments(self, html_element: etree.ElementBase):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment("Ford... you're turning ", {}))
        accum.add(TextSegment("into a penguin.", {}))

        element_iter = accum.flush(None)

        element = next(element_iter)
        assert element == NarrativeText("Ford... you're turning into a penguin.")
        with pytest.raises(StopIteration):
            next(element_iter)

    # -- .flush() ---------------------------------------------------------

    def it_generates_zero_elements_when_empty(self, html_element: etree.ElementBase):
        accum = _ElementAccumulator(html_element)

        element_iter = accum.flush(None)

        with pytest.raises(StopIteration):
            next(element_iter)

    def and_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_only(
        self, html_element: etree.ElementBase
    ):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment(" \n \t \n", {}))
        accum.add(TextSegment(" \n", {}))

        with pytest.raises(StopIteration):
            next(accum.flush(None))

    def and_it_generates_zero_elements_when_there_is_only_one_non_whitespace_character(
        self, html_element: etree.ElementBase
    ):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment(" \n \t \n", {}))
        accum.add(TextSegment(" X \n", {}))

        with pytest.raises(StopIteration):
            next(accum.flush(None))

    def it_normalizes_the_text_of_its_text_segments_on_flush(
        self, html_element: etree.ElementBase
    ):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment(" \n Ford... you're \t turning\n", {}))
        accum.add(TextSegment("into a penguin.\n", {}))

        (element,) = accum.flush(None)

        assert element.text == "Ford... you're turning into a penguin."

    def it_creates_a_document_element_of_the_specified_type(
        self, html_element: etree.ElementBase
    ):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment("Ford... you're turning into a penguin.", {}))

        (element,) = accum.flush(ListItem)

        assert element == ListItem("Ford... you're turning into a penguin.")

    def but_it_derives_the_element_type_from_the_text_when_none_is_specified(
        self, html_element: etree.ElementBase
    ):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment("Ford... you're turning into a penguin.", {}))

        (element,) = accum.flush(None)

        assert element == NarrativeText("Ford... you're turning into a penguin.")

    def it_removes_an_explicit_leading_bullet_character_from_a_list_item(
        self, html_element: etree.ElementBase
    ):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment("* turning into a penguin", {}))

        (element,) = accum.flush(None)

        assert element == ListItem("turning into a penguin")

    def it_applies_category_depth_metadata(self):
        html_element = etree.fromstring(
            "<div><h3>About fish</h3></div>", html_parser
        ).xpath(".//h3")[0]
", html_parser).xpath(".//h3")[0] accum = _ElementAccumulator(html_element) accum.add(TextSegment("Thanks for all those!", {})) (element,) = accum.flush(Title) e = element.to_dict() e.pop("element_id") assert e == { "metadata": {"category_depth": 2}, "text": "Thanks for all those!", "type": "Title", } def and_it_consolidates_annotations_into_metadata(self, html_element: etree.ElementBase): accum = _ElementAccumulator(html_element) accum.add( TextSegment( "\n Ford...", { "emphasized_text_contents": "Ford", "emphasized_text_tags": "b", }, ) ) accum.add(TextSegment(" you're turning into a ", {})) accum.add( TextSegment( "penguin", { "emphasized_text_contents": "penguin", "emphasized_text_tags": "i", }, ) ) accum.add(TextSegment(".\n", {})) (element,) = accum.flush(NarrativeText) e = element.to_dict() e.pop("element_id") assert e == { "metadata": { "emphasized_text_contents": [ "Ford", "penguin", ], "emphasized_text_tags": [ "b", "i", ], }, "text": "Ford... you're turning into a penguin.", "type": "NarrativeText", } # -- ._category_depth() ----------------------------------------------- @pytest.mark.parametrize( ("html_text", "tag", "ElementCls", "expected_value"), [ ("

Ford... you're turning into a penguin. Stop it.

", "p", Text, None), ("

* thanks for all the fish.

", "p", ListItem, 0), ("
            ("<ul><li>thanks for all the fish.</li></ul>", "li", ListItem, 0),
            ("<ul><ul><li>thanks for all the fish.</li></ul></ul>", "li", ListItem, 1),
            (
                "<ul><ul>So long<ul><li>1. and thanks for the fish.</li></ul></ul></ul>",
                "li",
                ListItem,
                2,
            ),
            ("<div><p>Examples</p></div>", "p", Title, 0),
            ("<div><h1>Examples</h1></div>", "h1", Title, 0),
            ("<div><h2>Examples</h2></div>", "h2", Title, 1),
            ("<div><h3>Examples</h3></div>", "h3", Title, 2),
            ("<div><h4>Examples</h4></div>", "h4", Title, 3),
            ("<h5>Examples</h5>", "h5", Title, 4),
            ("<h6>Examples</h6>", "h6", Title, 5),
        ],
    )
    def it_computes_the_category_depth_to_help(
        self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None
    ):
        e = etree.fromstring(html_text, html_parser).xpath(f".//{tag}")[0]
        accum = _ElementAccumulator(e)

        assert accum._category_depth(ElementCls) == expected_value

    # -- ._normalized_text ------------------------------------------------

    def it_computes_the_normalized_text_of_its_text_segments_to_help(
        self, html_element: etree.ElementBase
    ):
        accum = _ElementAccumulator(html_element)
        accum.add(TextSegment(" \n Ford... you're \t turning\n", {}))
        accum.add(TextSegment("into a penguin.\n", {}))

        assert accum._normalized_text == "Ford... you're turning into a penguin."

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def html_element(self) -> etree.ElementBase:
        return etree.fromstring("<div><p></p></div>", html_parser).xpath(".//p")[0]
    ", html_parser).xpath(".//p")[0] class Describe_PreElementAccumulator: """Isolated unit-test suite for `unstructured.partition.html.parser._PreElementAccumulator`.""" def it_computes_the_normalized_text_of_its_text_segments_to_help(self): html_element = etree.fromstring("

    ", html_parser).xpath(".//p")[0] accum = _PreElementAccumulator(html_element) accum.add(TextSegment("\n\n", {})) accum.add(TextSegment(" The panel lit up\n", {})) accum.add(TextSegment(" with the words 'Please do not press\n", {})) accum.add(TextSegment(" this button again'\n\n", {})) # -- note single leading and trailing newline stripped -- assert accum._normalized_text == ( "\n" " The panel lit up\n" " with the words 'Please do not press\n" " this button again'\n" ) # -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------ class DescribeFlow: """Isolated unit-test suite for `unstructured.partition.html.parser.Flow`. The `Flow` class provides most behaviors for flow (block-level) elements. """ # -- .is_phrasing ----------------------------------------------------- def it_knows_it_is_NOT_a_phrasing_element(self): p = etree.fromstring("

    Hello

    ", html_parser).xpath(".//p")[0] assert isinstance(p, Flow) assert p.is_phrasing is False # -- .iter_elements() ------------------------------------------------- def it_generates_the_document_elements_from_the_Flow_element(self): """Phrasing siblings of child block elements are processed with text or tail. In the general case, a Flow element can contain text, phrasing content, and child flow elements. Each of these five lines in this example is a "paragraph" and gives rise to a distinct document-element. """ html_text = """
    Text of div with hierarchical\nphrasing content before first block item

    Click here to see the blurb for this block item.

    tail of block item with hierarchical phrasing content

    second block item

    tail of block item with hierarchical phrasing content
    """ div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div.iter_elements() e = next(elements) assert e == Title("Text of div with hierarchical phrasing content before first block item") assert e.metadata.to_dict() == { "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical", "phrasing"], "emphasized_text_tags": ["b", "bi", "b"], } e = next(elements) assert e == NarrativeText("Click here to see the blurb for this block item.") assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]} e = next(elements) assert e == Title("tail of block item with hierarchical phrasing content") assert e.metadata.to_dict() == { "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical", "phrasing"], "emphasized_text_tags": ["b", "bi", "b"], } e = next(elements) assert e == Title("second block item") assert e.metadata.to_dict() == {"category_depth": 0} e = next(elements) assert e == Title("tail of block item with hierarchical phrasing content") assert e.metadata.to_dict() == { "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical"], "emphasized_text_tags": ["b", "bi"], } with pytest.raises(StopIteration): e = next(elements) # -- ._element_from_text_or_tail() ------------------------------------ def it_assembles_text_and_tail_document_elements_to_help(self): """Text and tails and their phrasing content are both processed the same way.""" html_text = "
    The \n Roman poet Virgil gave his pet fly
    " div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div._element_from_text_or_tail(div.text, deque(div), Text) e = next(elements) # -- element text is normalized -- assert e == Text("The Roman poet Virgil gave his pet fly") # -- individual annotations are consolidated -- assert e.metadata.to_dict() == { "emphasized_text_contents": ["poet", "Virgil", "gave"], "emphasized_text_tags": ["b", "bi", "b"], } def but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained(self): html_text = "
        html_text = "<div>\n \n \n \n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        with pytest.raises(StopIteration):
            next(elements)

    def it_uses_the_specified_element_class_to_form_the_document_element(self):
        html_text = "<div>\n The line-storm clouds fly tattered and swift\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Address)

        e = next(elements)
        assert e == Address("The line-storm clouds fly tattered and swift")
        assert e.metadata.to_dict() == {}
        with pytest.raises(StopIteration):
            next(elements)

    def and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified(self):
        html_text = "<div>\n The line-storm clouds fly tattered and swift,\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        assert next(elements) == NarrativeText("The line-storm clouds fly tattered and swift,")

    def but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained(self):
        html_text = "<div> * </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        with pytest.raises(StopIteration):
            next(elements)

    # -- ._iter_text_segments() -------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            (
                # -- text with no phrasing --
                "<div><p>Ford... you're turning into a penguin.</p></div>",
                [("Ford... you're turning into a penguin.", {})],
            ),
            (
                # -- text with phrasing --
                "<div><p>Ford... <b>you're turning</b> into\na <i>penguin</i>.</p></div>",
                [
                    ("Ford... ", {}),
                    (
                        "you're turning",
                        {"emphasized_text_contents": "you're turning", "emphasized_text_tags": "b"},
                    ),
                    (" into\na ", {}),
                    (
                        "penguin",
                        {"emphasized_text_contents": "penguin", "emphasized_text_tags": "i"},
                    ),
                    (".", {}),
                ],
            ),
            (
                # -- text with nested phrasing --
                "<div><p>Ford... <b>you're <i>turning</i></b> into a penguin.</p></div>",
                [
                    ("Ford... ", {}),
                    (
                        "you're ",
                        {"emphasized_text_contents": "you're", "emphasized_text_tags": "b"},
                    ),
                    (
                        "turning",
                        {"emphasized_text_contents": "turning", "emphasized_text_tags": "bi"},
                    ),
                    (" into a penguin.", {}),
                ],
            ),
        ],
    )
    def it_recursively_generates_text_segments_from_text_and_phrasing_to_help(
        self, html_text: str, expected_value: list[Annotation]
    ):
        p = etree.fromstring(html_text, html_parser).xpath(".//p")[0]

        text_segments = list(p._iter_text_segments(p.text, deque(p)))

        assert text_segments == expected_value


class DescribePre:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Pre`.

    The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
    """
    def it_preserves_the_whitespace_of_its_phrasing_only_contents(self):
        """A `<pre>` element can contain only phrasing content."""
        html_text = (
            "<pre>\n"
            "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
            "  Is... Forty-two, said Deep Thought, with infinite majesty and calm.\n"
            "</pre>\n"
        )
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

        elements = pre.iter_elements()

        e = next(elements)
        assert e == Text(
            "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
            "  Is... Forty-two, said Deep Thought, with infinite majesty and calm."
        )
        with pytest.raises(StopIteration):
            next(elements)

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- a newline in the 0th position of pre.text is dropped --
            ("<pre>\n  foo  </pre>", "  foo  "),
            # -- but not when preceded by any other whitespace --
            ("<pre> \n  foo  </pre>", " \n  foo  "),
            # -- and only one is dropped --
            ("<pre>\n\n  foo  </pre>", "\n  foo  "),
            # -- a newline in the -1th position is dropped --
            ("<pre>  foo  \n</pre>", "  foo  "),
            # -- but not when followed by any other whitespace --
            ("<pre>  foo  \n </pre>", "  foo  \n "),
            # -- and only one is dropped --
            ("<pre>  foo  \n\n</pre>", "  foo  \n"),
            # -- newlines in both positions are both dropped --
            ("<pre>\n  foo  \n</pre>", "  foo  "),
            # -- but not when not at the absolute edge --
            ("<pre> \n  foo  \n </pre>", " \n  foo  \n "),
        ],
    )
    def but_it_strips_a_single_leading_or_trailing_newline(
        self, html_text: str, expected_value: str
    ):
        """Content starts on next line when opening `<pre>` tag is immediately followed by `\n`"""
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

        e = next(pre.iter_elements())

        assert e.text == expected_value
    def it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elements(self):
        html_text = (
            '<pre>You\'re <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>'
        )
        pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

        e = next(pre.iter_elements())

        assert e.text == "You're turning into a penguin."
        assert e.metadata.emphasized_text_contents == ["turning"]
        assert e.metadata.emphasized_text_tags == ["b"]
        assert e.metadata.link_texts == ["penguin"]
        assert e.metadata.link_urls == ["http://eie.io"]


class DescribeRemovedBlock:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.

    This class is used for block level items we want to skip like `<hr/>` and `<figure>`.
    """

    def it_is_skipped_during_parsing(self):
        html_text = """
          <div>
            <figure>
              <img alt="Elephant at sunset"/>
              <figcaption>An elephant at sunset</figcaption>
            </figure>
            <p>Content we want.</p>
          </div>
        """
    """ div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] assert list(div.iter_elements()) == [NarrativeText("Content we want.")] # -- PHRASING (INLINE) ELEMENTS ------------------------------------------------------------------ class DescribePhrasing: """Isolated unit-test suite for `unstructured.partition.html.parser.Phrasing`. The `Phrasing` class provides most behaviors for phrasing (inline) elements. """ # -- .is_phrasing ----------------------------------------------------- def it_knows_it_is_a_phrasing_element(self): b = etree.fromstring("Hello", html_parser).xpath(".//b")[0] assert isinstance(b, Phrasing) assert b.is_phrasing is True # -- .iter_text_segments() -------------------------------------------- @pytest.mark.parametrize( ("html_text", "expected_value"), [ # -- an empty element produces no text segments -- ("", []), # -- element text produces one segment -- (" foo ", [(" foo ", {})]), # -- element tail produces one segment -- (" bar ", [(" bar ", {})]), # -- element descendants each produce one segment -- ("foo bar", [("foo ", {}), ("bar", {})]), # -- and any combination produces a segment for each text, child, and tail -- ( " foo bar baz ", [ (" ", {}), ("foo ", {}), ("bar", {}), (" baz", {}), (" ", {}), ], ), ], ) def it_generates_text_segments_for_its_text_and_children_and_tail( self, html_text: str, expected_value: list[TextSegment] ): e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] assert list(e.iter_text_segments()) == expected_value @pytest.mark.parametrize( ("html_text", "expected_value"), [ # -- Phrasing with nested block but no text or tail produces only element for block -- ("

    aaa

    ", [Title("aaa")]), # -- Phrasing with text produces annotated text-segment for the text -- ( "aaa

    bbb

    ", [ TextSegment( "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} ), Title("bbb"), ], ), # -- Phrasing with tail produces annotated text-segment for the tail -- ( "

    aaa

    bbb
    ", [ Title("aaa"), TextSegment( "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"} ), ], ), # -- Phrasing with text, nested block, and tail produces all three -- ( "aaa

    bbb

    ccc
    ", [ TextSegment( "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} ), Title("bbb"), TextSegment( "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"} ), ], ), ], ) def but_it_can_also_generate_an_element_when_it_has_a_nested_block_element( self, html_text: str, expected_value: list[TextSegment | Element] ): e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] assert list(e.iter_text_segments()) == expected_value # -- ._annotation() --------------------------------------------------- def it_forms_its_annotations_from_emphasis(self): cite = etree.fromstring("", html_parser).xpath(".//cite")[0] assert cite._annotation("\n foobar\n ", "bi") == { "emphasized_text_contents": "foobar", "emphasized_text_tags": "bi", } @pytest.mark.parametrize("text", ["", "\n \t "]) def but_not_when_text_is_empty_or_whitespace(self, text: str): cite = etree.fromstring("", html_parser).xpath(".//cite")[0] assert cite._annotation(text, "bi") == {} def and_not_when_there_is_no_emphasis(self): cite = etree.fromstring("", html_parser).xpath(".//cite")[0] assert cite._annotation("foobar", "") == {} # -- ._inside_emphasis() ---------------------------------------------- @pytest.mark.parametrize("enclosing_emphasis", ["", "b", "bi"]) def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis( self, enclosing_emphasis: str ): """Inside emphasis is applied to text inside the phrasing element (but not its tail). The `._inside_emphasis()` method is overridden by Bold and Italic classes which add their specific emphasis characters. """ abbr = etree.fromstring("", html_parser).xpath(".//abbr")[0] assert abbr._inside_emphasis(enclosing_emphasis) == enclosing_emphasis # -- ._iter_child_text_segments() ------------------------------------- @pytest.mark.parametrize( ("html_text", "expected_value"), [ # -- a phrasing element with no children produces no text segments # -- (element text is handled elsewhere) ("aaa", []), # -- child phrasing element produces text-segment for its text -- ("xbbb", [TextSegment("bbb", {})]), # -- and also for its tail when it has one -- ("xbbbccc", [TextSegment("bbb", {}), TextSegment("ccc", {})]), # -- nested phrasing recursively each produce a segment for text and tail, in order -- ( "xxxaaabbbcccdddeeefff", [ TextSegment("aaa", {}), TextSegment("bbb", {}), TextSegment("ccc", {}), TextSegment("ddd", {}), TextSegment("eee", {}), TextSegment("fff", {}), ], ), ], ) def it_generates_text_segments_for_its_children_and_their_tails( self, html_text: str, expected_value: list[TextSegment] ): e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] assert list(e._iter_child_text_segments("")) == expected_value @pytest.mark.parametrize( ("html_text", "inside_emphasis", "expected_value"), [ # -- a phrasing element with no block children produces no elements -- ("", "", []), # -- a child block element produces an element -- ("

            ("<cite><p>aaa</p></cite>", "", [Title("aaa")]),
            # -- a child block element with a tail also produces a text-segment for the tail --
            ("<cite><p>aaa</p>bbb</cite>", "", [Title("aaa"), TextSegment("bbb", {})]),
            # -- and also text-segments for phrasing following the tail --
            (
                "<cite><p>aaa</p>bbb<cite>ccc</cite>ddd</cite>",
                "",
                [
                    Title("aaa"),
                    TextSegment("bbb", {}),
                    TextSegment("ccc", {}),
                    TextSegment("ddd", {}),
                ],
            ),
            # -- and emphasis is applied before and after block-item --
            (
                "<cite><cite>aaa</cite><p>bbb</p>ccc<cite>ddd</cite>eee</cite>",
                "b",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    Title("bbb"),
                    TextSegment(
                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
        ],
    )
    def and_it_generates_elements_for_its_block_children(
        self, html_text: str, inside_emphasis: str, expected_value: list[TextSegment | Element]
    ):
        e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0]

        assert list(e._iter_child_text_segments(inside_emphasis)) == expected_value

    # -- ._iter_text_segments_from_block_tail_and_phrasing() --------------

    @pytest.mark.parametrize(
        ("html_text", "emphasis", "expected_value"),
        [
            # -- no tail and no contiguous phrasing produces no text-segments --
    ", "", []), # -- tail produces a text-segment -- ("

    aaa", "", [TextSegment("aaa", {})]), # -- contiguous phrasing produces a text-segment -- ("

    aaa", "", [TextSegment("aaa", {})]), # -- tail of contiguous phrasing also produces a text-segment -- ("

    aaabbb", "", [TextSegment("aaa", {}), TextSegment("bbb", {})]), # -- nested phrasing produces a text-segment -- ( "

    aaabbbcccdddeee", "", [ TextSegment("aaa", {}), TextSegment("bbb", {}), TextSegment("ccc", {}), TextSegment("ddd", {}), TextSegment("eee", {}), ], ), # -- and emphasis is added to each text-segment when specified -- ( "

    aaabbbcccdddeee", "b", [ TextSegment( "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} ), TextSegment( "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"} ), TextSegment( "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "bi"} ), TextSegment( "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"} ), TextSegment( "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"} ), ], ), # -- a block item nested in contiguous phrasing produces an Element -- ( "

    aaabbb

    ccc

    ddd
    eee
    ", "", [ TextSegment("aaa", {}), TextSegment("bbb", {}), Title("ccc"), TextSegment("ddd", {}), TextSegment("eee", {}), ], ), ], ) def it_generates_text_segments_from_the_tail_and_contiguous_phrasing( self, html_text: str, emphasis: str, expected_value: list[TextSegment | Element] ): e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] p = e.xpath("./p")[0] tail = p.tail or "" q = deque(e[1:]) assert ( list(e._iter_text_segments_from_block_tail_and_phrasing(tail, q, emphasis)) == expected_value ) class DescribeAnchor: """Isolated unit-test suite for `unstructured.partition.html.parser.Anchor`. The `Anchor` class is used for `` tags and provides link metadata. """ # -- .iter_text_segments() -------------------------------------------- @pytest.mark.parametrize( ("html_text", "emphasis", "expected_value"), [ # -- produces no text-segment or annotation for anchor.text when there is none -- ('', "", []), # -- but it produces a text-segment for the tail if there is one -- (' long tail ', "", [TextSegment(" long tail ", {})]), # -- produces text-segment but no annotation for anchor.text when it is whitespace -- (' ', "", [TextSegment(" ", {})]), # -- produces text-segment and annotation for anchor text. Note `link_texts:` # -- annotation value is whitespace-normalized but text-segment text is not. ( ' click here ', "", [ TextSegment( " click here ", {"link_texts": ["click here"], "link_urls": ["http://abc.com"]}, ) ], ), # -- produces text-segment for both text and tail when present -- ( ' click here long tail', "", [ TextSegment( " click here ", {"link_texts": ["click here"], "link_urls": ["http://abc.com"]}, ), TextSegment(" long tail", {}), ], ), # -- nested phrasing inside element is handled as expected -- ( '

                '<p>I am <a href="http://eie.io">one with <i>the</i> Force</a>.</p>',
                "",
                [
                    TextSegment(
                        "one with the Force",
                        {
                            "emphasized_text_contents": ["the"],
                            "emphasized_text_tags": ["i"],
                            "link_texts": ["one with the Force"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(".", {}),
                ],
            ),
            # -- enclosing_emphasis is applied to all segments --
            (
                '<p>I am <a href="http://eie.io">one with</a> the Force.</p>',
                "b",
                [
                    TextSegment(
                        "one with",
                        {
                            "emphasized_text_contents": ["one with"],
                            "emphasized_text_tags": ["b"],
                            "link_texts": ["one with"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(
                        " the Force.",
                        {
                            "emphasized_text_contents": "the Force.",
                            "emphasized_text_tags": "b",
                        },
                    ),
                ],
            ),
        ],
    )
    def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment(
        self, html_text: str, emphasis: str, expected_value: list[TextSegment]
    ):
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a.iter_text_segments(emphasis)) == expected_value

    def it_generates_enclosed_block_items_as_separate_elements(self):
        html_text = '<a href="http://eie.io">I am <p>one with</p> the Force.</a>'
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a.iter_text_segments("b")) == [
            TextSegment(
                "I am ",
                {
                    "emphasized_text_contents": ["I am"],
                    "emphasized_text_tags": ["b"],
                    "link_texts": ["I am"],
                    "link_urls": ["http://eie.io"],
                },
            ),
            Title("one with"),
            TextSegment(
                " the Force.",
                {
                    "emphasized_text_contents": "the Force.",
                    "emphasized_text_tags": "b",
                },
            ),
        ]

    def and_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_first(
        self,
    ):
        html_text = '<a href="http://eie.io"> \n <p>I am one with</p> the Force.</a>'
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        actual = list(a.iter_text_segments("i"))

        assert actual == [
            TextSegment(" \n ", {}),
            NarrativeText("I am one with"),
            TextSegment(
                " the Force.",
                {
                    "emphasized_text_contents": "the Force.",
                    "emphasized_text_tags": "i",
                },
            ),
        ]
        element = actual[1]
        assert element.metadata.link_texts == ["I am one with"]
        assert element.metadata.link_urls == ["http://eie.io"]

    # -- ._iter_phrases_and_elements() ------------------------------------

    def it_divides_the_anchor_contents_but_not_tail_into_phrases_and_elements(self):
        html_text = (
            '<a href="http://eie.io">But always <p>see first.</p> Otherwise you </a>'
            "will only see "
        )
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a._iter_phrases_and_elements(emphasis="")) == [
            (TextSegment("But always ", {}),),
            NarrativeText("see first."),
            (TextSegment(" Otherwise you ", {}),),
        ]

    # -- ._iter_phrasing() ------------------------------------------------

    def it_generates_zero_items_when_both_text_and_q_are_empty(self):
        html_text = """<a href="http://eie.io"></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        with pytest.raises(StopIteration):
            next(a._iter_phrasing(text="", q=deque([]), emphasis=""))

    def it_generates_a_phrase_when_only_text_is_present(self):
        html_text = """<a href="http://eie.io">\n But always see first.\n</a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [
            (TextSegment("\n But always see first.\n", {}),)
        ]

    def and_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_element(self):
        html_text = """<a href="http://eie.io">But always <b>see <i>first</i></b>. Otherwise</a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [
            (
                TextSegment("But always ", {}),
                TextSegment(
                    "see ",
                    {
                        "emphasized_text_contents": "see",
                        "emphasized_text_tags": "b",
                    },
                ),
                TextSegment(
                    "first",
                    {
                        "emphasized_text_contents": "first",
                        "emphasized_text_tags": "bi",
                    },
                ),
                TextSegment(". Otherwise", {}),
            )
        ]

    def it_ends_the_phrase_at_the_end_of_the_element(self):
        html_text = """<a href="http://eie.io">But always see first.</a> Otherwise you will """
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [
            (TextSegment("But always see first.", {}),)
        ]

    def but_it_ends_at_a_block_element_if_one_occurs_first(self):
        html_text = """<a href="http://eie.io">But always see first. <p>Otherwise you</p></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [
            (TextSegment("But always see first. ", {}),)
        ]

    def it_generates_an_element_for_a_block_item_nested_inside_phrasing(self):
        html_text = (
            '<a href="http://eie.io">But <b>always <p>see first.</p>Otherwise</b> you </a>'
        )
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]

        assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [
            (
                TextSegment("But ", {}),
                TextSegment(
                    "always ",
                    {
                        "emphasized_text_contents": "always",
                        "emphasized_text_tags": "b",
                    },
                ),
            ),
            NarrativeText("see first."),
            (
                TextSegment(
                    "Otherwise",
                    {
                        "emphasized_text_contents": "Otherwise",
                        "emphasized_text_tags": "b",
                    },
                ),
                TextSegment(" you ", {}),
            ),
        ]

    # -- ._link_annotate_element() ----------------------------------------

    def it_adds_link_metadata_to_an_element_to_help(self):
        html_text = """<a href="http://eie.io"></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        element = Text("aaa")

        e = a._link_annotate_element(element)

        assert e is element
        assert e.metadata.link_texts == ["aaa"]
        assert e.metadata.link_urls == ["http://eie.io"]

    def and_it_preserves_any_existing_link_metadata_on_the_element(self):
        # -- nested anchors shouldn't be possible but easier to test than prove it can't happen --
        html_text = """<a href="http://eie.io"></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        element = Text("bbb")
        element.metadata.link_texts = ["abc"]
        element.metadata.link_urls = ["http://abc.com"]

        e = a._link_annotate_element(element)

        assert e is element
        assert e.metadata.link_texts == ["abc", "bbb"]
        assert e.metadata.link_urls == ["http://abc.com", "http://eie.io"]

    def but_not_when_the_text_is_empty(self):
        html_text = """<a href="http://eie.io"></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        element = Text("")

        e = a._link_annotate_element(element)

        assert e is element
        assert e.metadata.link_texts is None
        assert e.metadata.link_urls is None

    def and_not_when_there_is_no_url(self):
        html_text = """<a></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        element = Text("zzz")

        e = a._link_annotate_element(element)

        assert e is element
        assert e.metadata.link_texts is None
        assert e.metadata.link_urls is None

    # -- ._link_text_segment() --------------------------------------------

    def it_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_help(self):
        html_text = """<a href="http://eie.io"></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        phrase = (
            TextSegment(
                "Otherwise you will only ",
                {
                    "emphasized_text_contents": ["Otherwise"],
                    "emphasized_text_tags": ["i"],
                },
            ),
            TextSegment(
                "see what you were expecting.\n",
                {
                    "emphasized_text_contents": "expecting",
                    "emphasized_text_tags": "b",
                },
            ),
        )

        link_text_segment = a._link_text_segment(phrase)

        assert link_text_segment == TextSegment(
            "Otherwise you will only see what you were expecting.\n",
            {
                "emphasized_text_contents": ["Otherwise", "expecting"],
                "emphasized_text_tags": ["i", "b"],
                "link_texts": ["Otherwise you will only see what you were expecting."],
                "link_urls": ["http://eie.io"],
            },
        )

    @pytest.mark.parametrize("text", ["", " \n \t "])
    def but_not_when_the_text_is_empty_or_whitespace_only(self, text: str):
        html_text = """<a href="http://eie.io"></a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        phrase = (TextSegment(text, {}), TextSegment(text, {}), TextSegment(text, {}))

        assert a._link_text_segment(phrase) is None

    def and_not_when_the_anchor_has_no_href_url(self):
        html_text = """<a>foobar</a>"""
        a = etree.fromstring(html_text, html_parser).xpath(".//a")[0]
        phrase = (TextSegment("Otherwise", {}), TextSegment(" you will", {}))

        assert a._link_text_segment(phrase) is None


class DescribeBold:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Bold`.

    The `Bold` class is used for `<b>` and `<strong>` tags and adds emphasis metadata.
""" def it_annotates_its_text_segment_with_bold_emphasis(self): b = etree.fromstring("rhombus", html_parser).xpath(".//b")[0] text_segments = b.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "b", } def and_its_children_are_also_annotated_with_bold_emphasis(self): b = etree.fromstring("rhombus pentagon", html_parser).xpath(".//b")[0] text_segments = b.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus " assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "b", } text, annotation = next(text_segments) assert text == "pentagon" assert annotation == { "emphasized_text_contents": "pentagon", "emphasized_text_tags": "bi", } def but_not_its_tail(self): b = etree.fromstring("rhombus pentagon", html_parser).xpath(".//b")[0] text_segments = b.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "b", } text, annotation = next(text_segments) assert text == " pentagon" assert annotation == {} class DescribeItalic: """Isolated unit-test suite for `unstructured.partition.html.parser.Italic`. The `Italic` class is used for `` and `` tags and adds emphasis metadata. """ def it_annotates_its_text_segment_with_italic_emphasis(self): i = etree.fromstring("rhombus", html_parser).xpath(".//i")[0] text_segments = i.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "i", } def and_its_children_are_also_annotated_with_italic_emphasis(self): em = etree.fromstring("rhombus pentagon", html_parser).xpath(".//em")[0] text_segments = em.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus " assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "i", } text, annotation = next(text_segments) assert text == "pentagon" assert annotation == { "emphasized_text_contents": "pentagon", "emphasized_text_tags": "bi", } def but_not_its_tail(self): i = etree.fromstring("rhombus pentagon", html_parser).xpath(".//i")[0] text_segments = i.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "i", } text, annotation = next(text_segments) assert text == " pentagon" assert annotation == {} class DescribeLineBreak: """Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`. Used for `
    ` elements, it's only special behavior is to add whitespace such that phrasing butted up tight on both sides of the `
    ` element is not joined, like `abc
    def` should become "abc def", not "abcdef". """ def it_adds_a_newline_in_its_place(self): cite = etree.fromstring( "spaceships of the
    Vogon Constructor Fleet
    ", html_parser ).xpath(".//cite")[0] text_segments = cite.iter_text_segments() texts = [ts.text for ts in text_segments] assert texts == ["spaceships of the", "\n", "Vogon Constructor Fleet"] assert _normalize_text("".join(texts)) == "spaceships of the Vogon Constructor Fleet" class DescribeRemovedPhrasing: """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`. Used for phrasing elements like `