# pyright: reportPrivateUsage=false # pyright: reportUnknownArgumentType=false """Test suite for `unstructured.partition.html.parser` module.""" from __future__ import annotations from collections import deque import pytest from lxml import etree from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title from unstructured.partition.html.parser import ( Anchor, Annotation, DefaultElement, Flow, Phrasing, RemovedPhrasing, TextSegment, _consolidate_annotations, _normalize_text, html_parser, ) # -- MODULE-LEVEL FUNCTIONS ---------------------------------------------------------------------- # -- _consolidate_annotations() ------------------ def it_gathers_annotations_from_text_segments(): text_segments = [ TextSegment( " Ford Prefect ", { "link_texts": "Ford Prefect", "link_url": "https://wikipedia/Ford_Prefect", "emphasized_text_contents": "Ford Prefect", "emphasized_text_tags": "b", }, ), TextSegment( " alien encounter", { "emphasized_text_contents": "alien encounter", "emphasized_text_tags": "bi", }, ), ] annotations = _consolidate_annotations(text_segments) assert annotations == { # -- each distinct key gets a list of values -- "emphasized_text_contents": ["Ford Prefect", "alien encounter"], "emphasized_text_tags": ["b", "bi"], # -- even when there is only one value -- "link_texts": ["Ford Prefect"], "link_url": ["https://wikipedia/Ford_Prefect"], } # -- and the annotations mapping is immutable -- with pytest.raises(TypeError, match="object does not support item assignment"): annotations["new_key"] = "foobar" # pyright: ignore[reportIndexIssue] # -- (but not its list values unfortunately) -- annotations["emphasized_text_tags"].append("xyz") assert annotations["emphasized_text_tags"] == ["b", "bi", "xyz"] # -- _normalize_text() --------------------------- @pytest.mark.parametrize( ("text", "expected_value"), [ # -- already normalized text is left unchanged -- ("iterators allow", "iterators allow"), # -- newlines are treated as 
whitespace -- ("algorithm\nto be", "algorithm to be"), (" separated\n from ", "separated from"), ("\n container\n details\n ", "container details"), ( "\n iterators allow \n algorithm to be \nexpressed without container \nnoise", "iterators allow algorithm to be expressed without container noise", ), ], ) def test_normalize_text_produces_normalized_text(text: str, expected_value: str): assert _normalize_text(text) == expected_value # -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------ class DescribeFlow: """Isolated unit-test suite for `unstructured.partition.html.parser.Flow`. The `Flow` class provides most behaviors for flow (block-level) elements. """ # -- .is_phrasing ----------------------------------------------------- def it_knows_it_is_NOT_a_phrasing_element(self): p = etree.fromstring("

Hello

", html_parser).xpath(".//p")[0] assert isinstance(p, Flow) assert p.is_phrasing is False # -- .iter_elements() ------------------------------------------------- def it_generates_the_document_elements_from_the_Flow_element(self): """Phrasing siblings of child block elements are processed with text or tail. In the general case, a Flow element can contain text, phrasing content, and child flow elements. Each of these five lines in this example is a "paragraph" and gives rise to a distinct document-element. """ html_text = """
Text of div with hierarchical\nphrasing content before first block item

Click here to see the blurb for this block item.

tail of block item with hierarchical phrasing content

second block item

tail of block item with hierarchical phrasing content
""" div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div.iter_elements() e = next(elements) assert e == Title("Text of div with hierarchical phrasing content before first block item") assert e.metadata.to_dict() == { "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical", "phrasing"], "emphasized_text_tags": ["b", "bi", "b"], } e = next(elements) assert e == NarrativeText("Click here to see the blurb for this block item.") assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]} e = next(elements) assert e == Title("tail of block item with hierarchical phrasing content") assert e.metadata.to_dict() == { "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical", "phrasing"], "emphasized_text_tags": ["b", "bi", "b"], } e = next(elements) assert e == Title("second block item") assert e.metadata.to_dict() == {"category_depth": 0} e = next(elements) assert e == Title("tail of block item with hierarchical phrasing content") assert e.metadata.to_dict() == { "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical"], "emphasized_text_tags": ["b", "bi"], } with pytest.raises(StopIteration): e = next(elements) # -- ._category_depth() ----------------------------------------------- @pytest.mark.parametrize( ("html_text", "tag", "ElementCls", "expected_value"), [ ("

Ford... you're turning into a penguin. Stop it.

", "p", Text, None), ("

* thanks for all the fish.

", "p", ListItem, 0), ("
  • thanks for all the fish.
  • ", "li", ListItem, 0), ("", "li", ListItem, 1), ("
    So long
    1. and thanks for the fish.
    ", "li", ListItem, 2), ("

    Examples

    ", "p", Title, 0), ("

    Examples

    ", "h1", Title, 0), ("

    Examples

    ", "h2", Title, 1), ("

    Examples

    ", "h3", Title, 2), ("

    Examples

    ", "h4", Title, 3), ("
    Examples
    ", "h5", Title, 4), ("
    Examples
    ", "h6", Title, 5), ], ) def it_computes_the_category_depth_to_help( self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None ): e = etree.fromstring(html_text, html_parser).xpath(f".//{tag}")[0] assert e._category_depth(ElementCls) == expected_value # -- ._element_from_text_or_tail() ------------------------------------ def it_assembles_text_and_tail_document_elements_to_help(self): """Text and tails and their phrasing content are both processed the same way.""" html_text = "
    The \n Roman poet Virgil gave his pet fly
    " div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div._element_from_text_or_tail(div.text, deque(div), Text) e = next(elements) # -- element text is normalized -- assert e == Text("The Roman poet Virgil gave his pet fly") # -- individual annotations are consolidated -- assert e.metadata.to_dict() == { "emphasized_text_contents": ["poet", "Virgil", "gave"], "emphasized_text_tags": ["b", "bi", "b"], } def but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained(self): html_text = "
    \n \n \n \n
    " div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div._element_from_text_or_tail(div.text, deque(div), Text) with pytest.raises(StopIteration): next(elements) def it_uses_the_specified_element_class_to_form_the_document_element(self): html_text = "
    \n The line-storm clouds fly tattered and swift\n
    " div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div._element_from_text_or_tail(div.text, deque(div), Address) e = next(elements) assert e == Address("The line-storm clouds fly tattered and swift") assert e.metadata.to_dict() == {} with pytest.raises(StopIteration): next(elements) def and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified(self): html_text = "
    \n The line-storm clouds fly tattered and swift,\n
    " div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div._element_from_text_or_tail(div.text, deque(div)) assert next(elements) == NarrativeText("The line-storm clouds fly tattered and swift,") def but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained(self): html_text = "
    *
    " div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] elements = div._element_from_text_or_tail(div.text, deque(div)) with pytest.raises(StopIteration): next(elements) # -- ._iter_text_segments() ------------------------------------------- @pytest.mark.parametrize( ("html_text", "expected_value"), [ ( # -- text with no phrasing -- "

    Ford... you're turning into a penguin.

    ", [("Ford... you're turning into a penguin.", {})], ), ( # -- text with phrasing -- "

    Ford... you're turning into\na penguin.

    ", [ ("Ford... ", {}), ( "you're turning", {"emphasized_text_contents": "you're turning", "emphasized_text_tags": "b"}, ), (" into\na ", {}), ( "penguin", {"emphasized_text_contents": "penguin", "emphasized_text_tags": "i"}, ), (".", {}), ], ), ( # -- text with nested phrasing -- "

    Ford... you're turning into a penguin.

    ", [ ("Ford... ", {}), ( "you're ", {"emphasized_text_contents": "you're", "emphasized_text_tags": "b"}, ), ( "turning", {"emphasized_text_contents": "turning", "emphasized_text_tags": "bi"}, ), (" into a penguin.", {}), ], ), ], ) def it_recursively_generates_text_segments_from_text_and_phrasing_to_help( self, html_text: str, expected_value: list[Annotation] ): p = etree.fromstring(html_text, html_parser).xpath(".//p")[0] text_segments = list(p._iter_text_segments(p.text, deque(p))) assert text_segments == expected_value class DescribePre: """Isolated unit-test suite for `unstructured.partition.html.parser.Pre`. The `Pre` class specializes behaviors for the `

    ` (pre-formatted text) element.
        """
    
        def it_preserves_the_whitespace_of_its_phrasing_only_contents(self):
            """A `<pre>` element can contain only phrasing content.

            Whitespace inside the element is carried into the document-element text verbatim,
            apart from the single newline adjacent to the opening and to the closing tag.
            """
            html_text = (
                "<pre>\n"
                "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
                "  Is... Forty-two, said Deep Thought, with infinite majesty and calm.\n"
                "</pre>\n"
            )
            pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]

            elements = pre.iter_elements()

            # -- interior runs of spaces and the interior newline survive intact --
            e = next(elements)
            assert e == Text(
                "  The Answer to the Great Question...   Of Life, the Universe and Everything...\n"
                "  Is... Forty-two, said Deep Thought, with infinite majesty and calm."
            )
            # -- and the `<pre>` gives rise to exactly one document-element --
            with pytest.raises(StopIteration):
                next(elements)

        @pytest.mark.parametrize(
            ("html_text", "expected_value"),
            [
                # -- a newline in the 0th position of pre.text is dropped --
                ("<pre>\n  foo  </pre>", "  foo  "),
                # -- but not when preceded by any other whitespace --
                ("<pre> \n  foo  </pre>", " \n  foo  "),
                # -- and only one is dropped --
                ("<pre>\n\n  foo  </pre>", "\n  foo  "),
                # -- a newline in the -1th position is dropped --
                ("<pre>  foo  \n</pre>", "  foo  "),
                # -- but not when followed by any other whitespace --
                ("<pre>  foo  \n </pre>", "  foo  \n "),
                # -- and only one is dropped --
                ("<pre>  foo  \n\n</pre>", "  foo  \n"),
                # -- newlines in both positions are both dropped --
                ("<pre>\n  foo  \n</pre>", "  foo  "),
                # -- or not when not at the absolute edge --
                ("<pre> \n  foo  \n </pre>", " \n  foo  \n "),
            ],
        )
        def but_it_strips_a_single_leading_or_trailing_newline(
            self, html_text: str, expected_value: str
        ):
            r"""Content starts on next line when opening `<pre>` tag is immediately followed by `\n`.

            Each case parses `html_text`, takes the first document-element generated by the
            `<pre>` element, and compares its text to `expected_value`.
            """
            pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0]
            e = next(pre.iter_elements())

            assert e.text == expected_value
    
        def it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elements(self):
            html_text = '
    You\'re turning into a penguin.
    ' pre = etree.fromstring(html_text, html_parser).xpath(".//pre")[0] e = next(pre.iter_elements()) assert e.text == "You're turning into a penguin." assert e.metadata.emphasized_text_contents == ["turning"] assert e.metadata.emphasized_text_tags == ["b"] assert e.metadata.link_texts == ["penguin"] assert e.metadata.link_urls == ["http://eie.io"] class DescribeRemovedBlock: """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`. This class is used for block level items we want to skip like `
    ` and `
    `. """ def it_is_skipped_during_parsing(self): html_text = """

    Elephant at sunset
    An elephant at sunset

    Content we want.

    """ div = etree.fromstring(html_text, html_parser).xpath(".//div")[0] assert list(div.iter_elements()) == [NarrativeText("Content we want.")] # -- PHRASING (INLINE) ELEMENTS ------------------------------------------------------------------ class DescribePhrasing: """Isolated unit-test suite for `unstructured.partition.html.parser.Phrasing`. The `Phrasing` class provides most behaviors for phrasing (inline) elements. """ def it_knows_it_is_a_phrasing_element(self): b = etree.fromstring("Hello", html_parser).xpath(".//b")[0] assert isinstance(b, Phrasing) assert b.is_phrasing is True @pytest.mark.parametrize( ("html_text", "expected_value"), [ # -- an empty element produces no text segments -- ("", []), # -- element text produces one segment -- (" foo ", [(" foo ", {})]), # -- element tail produces one segment -- (" bar ", [(" bar ", {})]), # -- element descendants each produce one segment -- ("foo bar", [("foo ", {}), ("bar", {})]), # -- and any combination produces a segment for each text, child, and tail -- ( " foo bar baz ", [ (" ", {}), ("foo ", {}), ("bar", {}), (" baz", {}), (" ", {}), ], ), ], ) def it_generates_text_segments_for_its_text_and_children_and_tail( self, html_text: str, expected_value: list[TextSegment] ): e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] assert list(e.iter_text_segments()) == expected_value def it_forms_its_annotations_from_emphasis(self): cite = etree.fromstring(" rhombus ", html_parser).xpath(".//cite")[0] assert cite._annotation(cite.text, "bi") == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "bi", } def but_not_when_text_is_empty_or_whitespace(self): cite = etree.fromstring(" ", html_parser).xpath(".//cite")[0] assert cite._annotation(cite.text, "bi") == {} def and_not_when_there_is_no_emphasis(self): cite = etree.fromstring("rhombus", html_parser).xpath(".//cite")[0] assert cite._annotation(cite.text, "") == {} def 
it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis(self): abbr = etree.fromstring("LLM", html_parser).xpath(".//abbr")[0] assert abbr._inside_emphasis("xyz") == "xyz" class DescribeBold: """Isolated unit-test suite for `unstructured.partition.html.parser.Bold`. The `Bold` class is used for `` and `` tags and adds emphasis metadata. """ def it_annotates_its_text_segment_with_bold_emphasis(self): b = etree.fromstring("rhombus", html_parser).xpath(".//b")[0] text_segments = b.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "b", } def and_its_children_are_also_annotated_with_bold_emphasis(self): b = etree.fromstring("rhombus pentagon", html_parser).xpath(".//b")[0] text_segments = b.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus " assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "b", } text, annotation = next(text_segments) assert text == "pentagon" assert annotation == { "emphasized_text_contents": "pentagon", "emphasized_text_tags": "bi", } def but_not_its_tail(self): b = etree.fromstring("rhombus pentagon", html_parser).xpath(".//b")[0] text_segments = b.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "b", } text, annotation = next(text_segments) assert text == " pentagon" assert annotation == {} class DescribeItalic: """Isolated unit-test suite for `unstructured.partition.html.parser.Italic`. The `Italic` class is used for `` and `` tags and adds emphasis metadata. 
""" def it_annotates_its_text_segment_with_italic_emphasis(self): i = etree.fromstring("rhombus", html_parser).xpath(".//i")[0] text_segments = i.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "i", } def and_its_children_are_also_annotated_with_italic_emphasis(self): em = etree.fromstring("rhombus pentagon", html_parser).xpath(".//em")[0] text_segments = em.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus " assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "i", } text, annotation = next(text_segments) assert text == "pentagon" assert annotation == { "emphasized_text_contents": "pentagon", "emphasized_text_tags": "bi", } def but_not_its_tail(self): i = etree.fromstring("rhombus pentagon", html_parser).xpath(".//i")[0] text_segments = i.iter_text_segments() text, annotation = next(text_segments) assert text == "rhombus" assert annotation == { "emphasized_text_contents": "rhombus", "emphasized_text_tags": "i", } text, annotation = next(text_segments) assert text == " pentagon" assert annotation == {} class DescribeLineBreak: """Isolated unit-test suite for `unstructured.partition.html.parser.LineBreak`. Used for `
    ` elements, its only special behavior is to add whitespace such that phrasing butted up tight on both sides of the `
    ` element is not joined, like `abc
    def` should become "abc def", not "abcdef". """ def it_adds_a_newline_in_its_place(self): cite = etree.fromstring( "spaceships of the
    Vogon Constructor Fleet
    ", html_parser ).xpath(".//cite")[0] text_segments = cite.iter_text_segments() texts = [ts.text for ts in text_segments] assert texts == ["spaceships of the", "\n", "Vogon Constructor Fleet"] assert _normalize_text("".join(texts)) == "spaceships of the Vogon Constructor Fleet" class DescribeRemovedPhrasing: """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedPhrasing`. Used for phrasing elements like `