From 00e1d5c05b4d0b55b99a062551fef15332a46e2f Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 8 Jul 2024 18:10:03 -0700 Subject: [PATCH] rfctr(html): refine HTML parser (#3351) **Note** This refines the new HTML parser but _does not install it_. This is why no changes to ingest test expectations or other unit-tests are required here. Installing the new parser will happen in the next PR #3218. **Summary** The initial version of the parser (purposely) raised on a block element nested inside a phrasing element. While such nesting is not valid according to the HTML Standard, it is accepted by the browser and does happen in the wild. The refinements here handle this situation similarly to how the browser does, breaking phrasing at the block element boundaries and starting it up again after the block element. Unfortunately this adds complexity to the parser, but it makes the parser robust against pretty much any HTML we're likely to encounter and partitions it consistent with how it would be rendered in the browser. --- CHANGELOG.md | 17 +- .../partition/html/test_parser.py | 1012 ++++++++++++++--- typings/lxml/etree/__init__.pyi | 1 + typings/lxml/etree/_cleanup.pyi | 21 + unstructured/__version__.py | 2 +- unstructured/partition/html/parser.py | 512 ++++++--- 6 files changed, 1240 insertions(+), 325 deletions(-) create mode 100644 typings/lxml/etree/_cleanup.pyi diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c7801f96..212e9a76b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,21 @@ +## 0.14.11-dev0 + +### Enhancements + +* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `
<div>`, `<p>
`) nested inside a phrasing element (e.g. `` or ``). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation. + +### Features + +### Fixes + ## 0.14.10 ### Enhancements -* **Update unstructured-client dependency** Change unstructured-client dependency pin back to - greater than min version and updated tests that were failing given the update. +* **Update unstructured-client dependency** Change unstructured-client dependency pin back to greater than min version and updated tests that were failing given the update. * **`.doc` files are now supported in the `arm64` image.**. `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow on work planned to investigate adding `.ppt` support for `arm64` as well. -* Add table detection metrics: recall, precision and f1 -* Remove unused _with_spans metrics +* **Add table detection metrics: recall, precision and f1.** +* **Remove unused _with_spans metrics.** ### Features diff --git a/test_unstructured/partition/html/test_parser.py b/test_unstructured/partition/html/test_parser.py index 7ee9899ae..32dc975ec 100644 --- a/test_unstructured/partition/html/test_parser.py +++ b/test_unstructured/partition/html/test_parser.py @@ -12,7 +12,6 @@ from lxml import etree from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title from unstructured.partition.html.parser import ( - Anchor, Annotation, DefaultElement, Flow, @@ -20,7 +19,10 @@ from unstructured.partition.html.parser import ( RemovedPhrasing, TextSegment, _consolidate_annotations, + _ElementAccumulator, _normalize_text, + _PhraseAccumulator, + _PreElementAccumulator, html_parser, ) @@ -29,27 +31,21 @@ from unstructured.partition.html.parser import ( # -- _consolidate_annotations() ------------------ -def it_gathers_annotations_from_text_segments(): - text_segments = [ - TextSegment( - " Ford Prefect ", - { - "link_texts": "Ford Prefect", - "link_url": "https://wikipedia/Ford_Prefect", - "emphasized_text_contents": "Ford Prefect", - "emphasized_text_tags": "b", - }, - ), - TextSegment( - " alien encounter", - { - "emphasized_text_contents": "alien encounter", - "emphasized_text_tags": "bi", - }, - ), +def it_consolidates_annotations_from_multiple_text_segments(): + annotations = [ + { + "link_texts": "Ford Prefect", + "link_url": "https://wikipedia/Ford_Prefect", + "emphasized_text_contents": "Ford Prefect", + "emphasized_text_tags": "b", + }, + { + "emphasized_text_contents": "alien encounter", + "emphasized_text_tags": "bi", + }, ] - annotations = _consolidate_annotations(text_segments) + annotations = _consolidate_annotations(annotations) assert annotations == { # -- each distinct key gets a list of values -- @@ -89,6 +85,263 @@ def test_normalize_text_produces_normalized_text(text: str, expected_value: str) assert _normalize_text(text) == expected_value +# -- PHRASING ACCUMULATORS ----------------------------------------------------------------------- + + +class Describe_PhraseAccumulator: + """Isolated unit-test suite for `unstructured.partition.html.parser._PhraseAccumulator`.""" + + def it_is_empty_on_construction(self): + accum = _PhraseAccumulator() + + phrase_iter = accum.flush() + + with pytest.raises(StopIteration): + next(phrase_iter) + + # -- .add() ----------------------------------------------------------- + + def 
it_accumulates_text_segments(self): + accum = _PhraseAccumulator() + + accum.add(TextSegment("Ford... you're turning ", {})) + accum.add(TextSegment("into a penguin.", {})) + phrase_iter = accum.flush() + + phrase = next(phrase_iter) + assert phrase == ( + TextSegment("Ford... you're turning ", {}), + TextSegment("into a penguin.", {}), + ) + + with pytest.raises(StopIteration): + next(phrase_iter) + + # -- .flush() --------------------------------------------------------- + + def it_generates_zero_phrases_on_flush_when_empty(self): + accum = _PhraseAccumulator() + + phrase_iter = accum.flush() + + with pytest.raises(StopIteration): + next(phrase_iter) + + +class Describe_ElementAccumulator: + """Isolated unit-test suite for `unstructured.partition.html.parser._ElementAccumulator`.""" + + def it_is_empty_on_construction(self, html_element: etree.ElementBase): + accum = _ElementAccumulator(html_element) + + element_iter = accum.flush(None) + + with pytest.raises(StopIteration): + next(element_iter) + + # -- .add() ----------------------------------------------------------- + + def it_accumulates_text_segments(self, html_element: etree.ElementBase): + accum = _ElementAccumulator(html_element) + + accum.add(TextSegment("Ford... you're turning ", {})) + accum.add(TextSegment("into a penguin.", {})) + element_iter = accum.flush(None) + + element = next(element_iter) + assert element == NarrativeText("Ford... you're turning into a penguin.") + + with pytest.raises(StopIteration): + next(element_iter) + + # -- .flush() --------------------------------------------------------- + + def it_generates_zero_elements_when_empty(self, html_element: etree.ElementBase): + accum = _ElementAccumulator(html_element) + + element_iter = accum.flush(None) + + with pytest.raises(StopIteration): + next(element_iter) + + def and_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_only( + self, html_element: etree.ElementBase + ): + accum = _ElementAccumulator(html_element) + accum.add(TextSegment(" \n \t \n", {})) + accum.add(TextSegment(" \n", {})) + + with pytest.raises(StopIteration): + next(accum.flush(None)) + + def and_it_generates_zero_elements_when_there_is_only_one_non_whitespace_character( + self, html_element: etree.ElementBase + ): + accum = _ElementAccumulator(html_element) + accum.add(TextSegment(" \n \t \n", {})) + accum.add(TextSegment(" X \n", {})) + + with pytest.raises(StopIteration): + next(accum.flush(None)) + + def it_normalizes_the_text_of_its_text_segments_on_flush(self, html_element: etree.ElementBase): + accum = _ElementAccumulator(html_element) + accum.add(TextSegment(" \n Ford... you're \t turning\n", {})) + accum.add(TextSegment("into a penguin.\n", {})) + + (element,) = accum.flush(None) + + assert element.text == "Ford... you're turning into a penguin." + + def it_creates_a_document_element_of_the_specified_type(self, html_element: etree.ElementBase): + accum = _ElementAccumulator(html_element) + accum.add(TextSegment("Ford... you're turning into a penguin.", {})) + + (element,) = accum.flush(ListItem) + + assert element == ListItem("Ford... you're turning into a penguin.") + + def but_it_derives_the_element_type_from_the_text_when_none_is_specified( + self, html_element: etree.ElementBase + ): + accum = _ElementAccumulator(html_element) + accum.add(TextSegment("Ford... you're turning into a penguin.", {})) + + (element,) = accum.flush(None) + + assert element == NarrativeText("Ford... 
you're turning into a penguin.") + + def it_removes_an_explicit_leading_bullet_character_from_a_list_item( + self, html_element: etree.ElementBase + ): + accum = _ElementAccumulator(html_element) + accum.add(TextSegment("* turning into a penguin", {})) + + (element,) = accum.flush(None) + + assert element == ListItem("turning into a penguin") + + def it_applies_category_depth_metadata(self): + html_element = etree.fromstring("
<div><h3>About fish</h3></div>
", html_parser).xpath(".//h3")[0] + accum = _ElementAccumulator(html_element) + accum.add(TextSegment("Thanks for all those!", {})) + + (element,) = accum.flush(Title) + + e = element.to_dict() + e.pop("element_id") + assert e == { + "metadata": {"category_depth": 2}, + "text": "Thanks for all those!", + "type": "Title", + } + + def and_it_consolidates_annotations_into_metadata(self, html_element: etree.ElementBase): + accum = _ElementAccumulator(html_element) + accum.add( + TextSegment( + "\n Ford...", + { + "emphasized_text_contents": "Ford", + "emphasized_text_tags": "b", + }, + ) + ) + accum.add(TextSegment(" you're turning into a ", {})) + accum.add( + TextSegment( + "penguin", + { + "emphasized_text_contents": "penguin", + "emphasized_text_tags": "i", + }, + ) + ) + accum.add(TextSegment(".\n", {})) + + (element,) = accum.flush(NarrativeText) + + e = element.to_dict() + e.pop("element_id") + assert e == { + "metadata": { + "emphasized_text_contents": [ + "Ford", + "penguin", + ], + "emphasized_text_tags": [ + "b", + "i", + ], + }, + "text": "Ford... you're turning into a penguin.", + "type": "NarrativeText", + } + + # -- ._category_depth() ----------------------------------------------- + + @pytest.mark.parametrize( + ("html_text", "tag", "ElementCls", "expected_value"), + [ + ("
<div><p>Ford... you're turning into a penguin. Stop it.</p></div>", "p", Text, None),
+            ("<div><p>* thanks for all the fish.</p></div>", "p", ListItem, 0),
+            ("<li>thanks for all the fish.</li>", "li", ListItem, 0),
+            ("<ul><li>So long</li><li>and thanks for all the fish.</li></ul>", "li", ListItem, 1),
+            ("<ul>So long<ol><li>and thanks for the fish.</li></ol></ul>", "li", ListItem, 2),
+            ("<div><p>Examples</p></div>", "p", Title, 0),
+            ("<div><h1>Examples</h1></div>", "h1", Title, 0),
+            ("<div><h2>Examples</h2></div>", "h2", Title, 1),
+            ("<div><h3>Examples</h3></div>", "h3", Title, 2),
+            ("<div><h4>Examples</h4></div>", "h4", Title, 3),
+            ("<h5>Examples</h5>", "h5", Title, 4),
+            ("<h6>Examples</h6>
    ", "h6", Title, 5), + ], + ) + def it_computes_the_category_depth_to_help( + self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None + ): + e = etree.fromstring(html_text, html_parser).xpath(f".//{tag}")[0] + accum = _ElementAccumulator(e) + assert accum._category_depth(ElementCls) == expected_value + + # -- ._normalized_text ------------------------------------------------ + + def it_computes_the_normalized_text_of_its_text_segments_to_help( + self, html_element: etree.ElementBase + ): + accum = _ElementAccumulator(html_element) + accum.add(TextSegment(" \n Ford... you're \t turning\n", {})) + accum.add(TextSegment("into a penguin.\n", {})) + + assert accum._normalized_text == "Ford... you're turning into a penguin." + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture() + def html_element(self) -> etree.ElementBase: + return etree.fromstring("
<p></p>
    ", html_parser).xpath(".//p")[0] + + +class Describe_PreElementAccumulator: + """Isolated unit-test suite for `unstructured.partition.html.parser._PreElementAccumulator`.""" + + def it_computes_the_normalized_text_of_its_text_segments_to_help(self): + html_element = etree.fromstring("
<p></p>
    ", html_parser).xpath(".//p")[0] + accum = _PreElementAccumulator(html_element) + accum.add(TextSegment("\n\n", {})) + accum.add(TextSegment(" The panel lit up\n", {})) + accum.add(TextSegment(" with the words 'Please do not press\n", {})) + accum.add(TextSegment(" this button again'\n\n", {})) + + # -- note single leading and trailing newline stripped -- + assert accum._normalized_text == ( + "\n" + " The panel lit up\n" + " with the words 'Please do not press\n" + " this button again'\n" + ) + + # -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------ @@ -160,31 +413,6 @@ class DescribeFlow: with pytest.raises(StopIteration): e = next(elements) - # -- ._category_depth() ----------------------------------------------- - - @pytest.mark.parametrize( - ("html_text", "tag", "ElementCls", "expected_value"), - [ - ("
<div><p>Ford... you're turning into a penguin. Stop it.</p></div>", "p", Text, None),
-            ("<div><p>* thanks for all the fish.</p></div>", "p", ListItem, 0),
-            ("<li>thanks for all the fish.</li>", "li", ListItem, 0),
-            ("<ul><li>So long</li><li>and thanks for all the fish.</li></ul>", "li", ListItem, 1),
-            ("<ul>So long<ol><li>and thanks for the fish.</li></ol></ul>", "li", ListItem, 2),
-            ("<div><p>Examples</p></div>", "p", Title, 0),
-            ("<div><h1>Examples</h1></div>", "h1", Title, 0),
-            ("<div><h2>Examples</h2></div>", "h2", Title, 1),
-            ("<div><h3>Examples</h3></div>", "h3", Title, 2),
-            ("<div><h4>Examples</h4></div>", "h4", Title, 3),
-            ("<h5>Examples</h5>", "h5", Title, 4),
-            ("<h6>Examples</h6>
    ", "h6", Title, 5), - ], - ) - def it_computes_the_category_depth_to_help( - self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None - ): - e = etree.fromstring(html_text, html_parser).xpath(f".//{tag}")[0] - assert e._category_depth(ElementCls) == expected_value - # -- ._element_from_text_or_tail() ------------------------------------ def it_assembles_text_and_tail_document_elements_to_help(self): @@ -392,12 +620,16 @@ class DescribePhrasing: The `Phrasing` class provides most behaviors for phrasing (inline) elements. """ + # -- .is_phrasing ----------------------------------------------------- + def it_knows_it_is_a_phrasing_element(self): b = etree.fromstring("Hello", html_parser).xpath(".//b")[0] assert isinstance(b, Phrasing) assert b.is_phrasing is True + # -- .iter_text_segments() -------------------------------------------- + @pytest.mark.parametrize( ("html_text", "expected_value"), [ @@ -428,24 +660,574 @@ class DescribePhrasing: e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] assert list(e.iter_text_segments()) == expected_value + @pytest.mark.parametrize( + ("html_text", "expected_value"), + [ + # -- Phrasing with nested block but no text or tail produces only element for block -- + ("
<b><p>aaa</p></b>
    ", [Title("aaa")]), + # -- Phrasing with text produces annotated text-segment for the text -- + ( + "aaa
<p>bbb</p></b>
    ", + [ + TextSegment( + "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} + ), + Title("bbb"), + ], + ), + # -- Phrasing with tail produces annotated text-segment for the tail -- + ( + "
<b><p>aaa</p>bbb</b>
    ", + [ + Title("aaa"), + TextSegment( + "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"} + ), + ], + ), + # -- Phrasing with text, nested block, and tail produces all three -- + ( + "aaa
<p>bbb</p>ccc</b>
    ", + [ + TextSegment( + "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} + ), + Title("bbb"), + TextSegment( + "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"} + ), + ], + ), + ], + ) + def but_it_can_also_generate_an_element_when_it_has_a_nested_block_element( + self, html_text: str, expected_value: list[TextSegment | Element] + ): + e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] + assert list(e.iter_text_segments()) == expected_value + + # -- ._annotation() --------------------------------------------------- + def it_forms_its_annotations_from_emphasis(self): - cite = etree.fromstring(" rhombus ", html_parser).xpath(".//cite")[0] - assert cite._annotation(cite.text, "bi") == { - "emphasized_text_contents": "rhombus", + cite = etree.fromstring("", html_parser).xpath(".//cite")[0] + assert cite._annotation("\n foobar\n ", "bi") == { + "emphasized_text_contents": "foobar", "emphasized_text_tags": "bi", } - def but_not_when_text_is_empty_or_whitespace(self): - cite = etree.fromstring(" ", html_parser).xpath(".//cite")[0] - assert cite._annotation(cite.text, "bi") == {} + @pytest.mark.parametrize("text", ["", "\n \t "]) + def but_not_when_text_is_empty_or_whitespace(self, text: str): + cite = etree.fromstring("", html_parser).xpath(".//cite")[0] + assert cite._annotation(text, "bi") == {} def and_not_when_there_is_no_emphasis(self): - cite = etree.fromstring("rhombus", html_parser).xpath(".//cite")[0] - assert cite._annotation(cite.text, "") == {} + cite = etree.fromstring("", html_parser).xpath(".//cite")[0] + assert cite._annotation("foobar", "") == {} - def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis(self): - abbr = etree.fromstring("LLM", html_parser).xpath(".//abbr")[0] - assert abbr._inside_emphasis("xyz") == "xyz" + # -- ._inside_emphasis() ---------------------------------------------- + + @pytest.mark.parametrize("enclosing_emphasis", ["", "b", "bi"]) + def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis( + self, enclosing_emphasis: str + ): + """Inside emphasis is applied to text inside the phrasing element (but not its tail). + + The `._inside_emphasis()` method is overridden by Bold and Italic classes which add their + specific emphasis characters. 
+ """ + abbr = etree.fromstring("", html_parser).xpath(".//abbr")[0] + assert abbr._inside_emphasis(enclosing_emphasis) == enclosing_emphasis + + # -- ._iter_child_text_segments() ------------------------------------- + + @pytest.mark.parametrize( + ("html_text", "expected_value"), + [ + # -- a phrasing element with no children produces no text segments + # -- (element text is handled elsewhere) + ("aaa", []), + # -- child phrasing element produces text-segment for its text -- + ("xbbb", [TextSegment("bbb", {})]), + # -- and also for its tail when it has one -- + ("xbbbccc", [TextSegment("bbb", {}), TextSegment("ccc", {})]), + # -- nested phrasing recursively each produce a segment for text and tail, in order -- + ( + "xxxaaabbbcccdddeeefff", + [ + TextSegment("aaa", {}), + TextSegment("bbb", {}), + TextSegment("ccc", {}), + TextSegment("ddd", {}), + TextSegment("eee", {}), + TextSegment("fff", {}), + ], + ), + ], + ) + def it_generates_text_segments_for_its_children_and_their_tails( + self, html_text: str, expected_value: list[TextSegment] + ): + e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] + assert list(e._iter_child_text_segments("")) == expected_value + + @pytest.mark.parametrize( + ("html_text", "inside_emphasis", "expected_value"), + [ + # -- a phrasing element with no block children produces no elements -- + ("", "", []), + # -- a child block element produces an element -- + ("
<b><p>aaa</p></b>
    ", "", [Title("aaa")]), + # -- a child block element with a tail also produces a text-segment for the tail -- + ("
<b><p>aaa</p>bbb</b>
    ", "", [Title("aaa"), TextSegment("bbb", {})]), + # -- and also text-segments for phrasing following the tail -- + ( + "
<b><p>aaa</p>bbb<span>ccc</span>ddd</b>
    ", + "", + [ + Title("aaa"), + TextSegment("bbb", {}), + TextSegment("ccc", {}), + TextSegment("ddd", {}), + ], + ), + # -- and emphasis is applied before and after block-item -- + ( + "aaa
</span><p>bbb</p>ccc<span>ddd</span>eee</b>
    ", + "b", + [ + TextSegment( + "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} + ), + Title("bbb"), + TextSegment( + "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"} + ), + TextSegment( + "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"} + ), + TextSegment( + "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"} + ), + ], + ), + ], + ) + def and_it_generates_elements_for_its_block_children( + self, html_text: str, inside_emphasis: str, expected_value: list[TextSegment | Element] + ): + e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] + assert list(e._iter_child_text_segments(inside_emphasis)) == expected_value + + # -- ._iter_text_segments_from_block_tail_and_phrasing() -------------- + + @pytest.mark.parametrize( + ("html_text", "emphasis", "expected_value"), + [ + # -- no tail and no contiguous phrasing produces no text-segments -- + ("
<b><p></p></b>
    ", "", []), + # -- tail produces a text-segment -- + ("
<b><p></p>aaa</b>", "", [TextSegment("aaa", {})]),
+            # -- contiguous phrasing produces a text-segment --
+            ("
<b><p></p><span>aaa</span></b>", "", [TextSegment("aaa", {})]),
+            # -- tail of contiguous phrasing also produces a text-segment --
+            ("
<b><p></p><span>aaa</span>bbb</b>", "", [TextSegment("aaa", {}), TextSegment("bbb", {})]),
+            # -- nested phrasing produces a text-segment --
+            (
+                "
<b><p></p>aaa<span>bbb<span>ccc</span>ddd</span>eee</b>",
+                "",
+                [
+                    TextSegment("aaa", {}),
+                    TextSegment("bbb", {}),
+                    TextSegment("ccc", {}),
+                    TextSegment("ddd", {}),
+                    TextSegment("eee", {}),
+                ],
+            ),
+            # -- and emphasis is added to each text-segment when specified --
+            (
+                "
<b><p></p>aaa<span>bbb<i>ccc</i>ddd</span>eee</b>",
+                "b",
+                [
+                    TextSegment(
+                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
+                    ),
+                    TextSegment(
+                        "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
+                    ),
+                    TextSegment(
+                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "bi"}
+                    ),
+                    TextSegment(
+                        "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"}
+                    ),
+                    TextSegment(
+                        "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"}
+                    ),
+                ],
+            ),
+            # -- a block item nested in contiguous phrasing produces an Element --
+            (
+                "
<b><p></p>aaa<span>bbb<p>ccc</p>ddd</span>eee</b>
    ", + "", + [ + TextSegment("aaa", {}), + TextSegment("bbb", {}), + Title("ccc"), + TextSegment("ddd", {}), + TextSegment("eee", {}), + ], + ), + ], + ) + def it_generates_text_segments_from_the_tail_and_contiguous_phrasing( + self, html_text: str, emphasis: str, expected_value: list[TextSegment | Element] + ): + e = etree.fromstring(html_text, html_parser).xpath(".//body")[0][0] + p = e.xpath("./p")[0] + tail = p.tail or "" + q = deque(e[1:]) + + assert ( + list(e._iter_text_segments_from_block_tail_and_phrasing(tail, q, emphasis)) + == expected_value + ) + + +class DescribeAnchor: + """Isolated unit-test suite for `unstructured.partition.html.parser.Anchor`. + + The `Anchor` class is used for `` tags and provides link metadata. + """ + + # -- .iter_text_segments() -------------------------------------------- + + @pytest.mark.parametrize( + ("html_text", "emphasis", "expected_value"), + [ + # -- produces no text-segment or annotation for anchor.text when there is none -- + ('', "", []), + # -- but it produces a text-segment for the tail if there is one -- + (' long tail ', "", [TextSegment(" long tail ", {})]), + # -- produces text-segment but no annotation for anchor.text when it is whitespace -- + (' ', "", [TextSegment(" ", {})]), + # -- produces text-segment and annotation for anchor text. Note `link_texts:` + # -- annotation value is whitespace-normalized but text-segment text is not. + ( + ' click here ', + "", + [ + TextSegment( + " click here ", + {"link_texts": ["click here"], "link_urls": ["http://abc.com"]}, + ) + ], + ), + # -- produces text-segment for both text and tail when present -- + ( + ' click here long tail', + "", + [ + TextSegment( + " click here ", + {"link_texts": ["click here"], "link_urls": ["http://abc.com"]}, + ), + TextSegment(" long tail", {}), + ], + ), + # -- nested phrasing inside element is handled as expected -- + ( + '
I am <a href="http://eie.io">one with <i>the</i> Force</a>.
    ', + "", + [ + TextSegment( + "one with the Force", + { + "emphasized_text_contents": ["the"], + "emphasized_text_tags": ["i"], + "link_texts": ["one with the Force"], + "link_urls": ["http://eie.io"], + }, + ), + TextSegment(".", {}), + ], + ), + # -- enclosing_emphasis is applied to all segments -- + ( + '
I am <a href="http://eie.io">one with</a> the Force.
    ', + "b", + [ + TextSegment( + "one with", + { + "emphasized_text_contents": ["one with"], + "emphasized_text_tags": ["b"], + "link_texts": ["one with"], + "link_urls": ["http://eie.io"], + }, + ), + TextSegment( + " the Force.", + { + "emphasized_text_contents": "the Force.", + "emphasized_text_tags": "b", + }, + ), + ], + ), + ], + ) + def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment( + self, html_text: str, emphasis: str, expected_value: list[TextSegment] + ): + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + assert list(a.iter_text_segments(emphasis)) == expected_value + + def it_generates_enclosed_block_items_as_separate_elements(self): + html_text = """I am
<p>one with</p> the Force.</a>
    """ + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + assert list(a.iter_text_segments("b")) == [ + TextSegment( + "I am ", + { + "emphasized_text_contents": ["I am"], + "emphasized_text_tags": ["b"], + "link_texts": ["I am"], + "link_urls": ["http://eie.io"], + }, + ), + Title("one with"), + TextSegment( + " the Force.", + { + "emphasized_text_contents": "the Force.", + "emphasized_text_tags": "b", + }, + ), + ] + + def and_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_first( + self, + ): + html_text = """ \n
<p>I am one with</p> the Force.</a>
    """ + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + actual = list(a.iter_text_segments("i")) + + assert actual == [ + TextSegment(" \n ", {}), + NarrativeText("I am one with"), + TextSegment( + " the Force.", + { + "emphasized_text_contents": "the Force.", + "emphasized_text_tags": "i", + }, + ), + ] + element = actual[1] + assert element.metadata.link_texts == ["I am one with"] + assert element.metadata.link_urls == ["http://eie.io"] + + # -- ._iter_phrases_and_elements() ------------------------------------ + + def it_divides_the_anchor_contents_but_not_tail_into_phrases_and_elements(self): + html_text = """ + But always
<p>see first.</p> Otherwise you </a>
    will only see + """ + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + assert list(a._iter_phrases_and_elements(emphasis="")) == [ + (TextSegment("But always ", {}),), + NarrativeText("see first."), + (TextSegment(" Otherwise you ", {}),), + ] + + # -- ._iter_phrasing() ------------------------------------------------ + + def it_generates_zero_items_when_both_text_and_q_are_empty(self): + html_text = """""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + with pytest.raises(StopIteration): + next(a._iter_phrasing(text="", q=deque([]), emphasis="")) + + def it_generates_a_phrase_when_only_text_is_present(self): + html_text = """\n But always see first.\n""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [ + (TextSegment("\n But always see first.\n", {}),) + ] + + def and_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_element(self): + html_text = """But always see first. Otherwise""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [ + ( + TextSegment("But always ", {}), + TextSegment( + "see ", + { + "emphasized_text_contents": "see", + "emphasized_text_tags": "b", + }, + ), + TextSegment( + "first", + { + "emphasized_text_contents": "first", + "emphasized_text_tags": "bi", + }, + ), + TextSegment(". Otherwise", {}), + ) + ] + + def it_ends_the_phrase_at_the_end_of_the_element(self): + html_text = """But always see first. Otherwise you will """ + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [ + (TextSegment("But always see first.", {}),) + ] + + def but_it_ends_at_a_block_element_if_one_occurs_first(self): + html_text = """But always see first.
<p>Otherwise you</p></a>
    """ + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [ + (TextSegment("But always see first. ", {}),) + ] + + def it_generates_an_element_for_a_block_item_nested_inside_phrasing(self): + html_text = """ + But always
<p>see first.</p>Otherwise</b> you </a>
    + """ + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + + assert list(a._iter_phrasing(text=a.text, q=deque(a), emphasis="")) == [ + ( + TextSegment("But ", {}), + TextSegment( + "always ", + { + "emphasized_text_contents": "always", + "emphasized_text_tags": "b", + }, + ), + ), + NarrativeText("see first."), + ( + TextSegment( + "Otherwise", + { + "emphasized_text_contents": "Otherwise", + "emphasized_text_tags": "b", + }, + ), + TextSegment(" you ", {}), + ), + ] + + # -- ._link_annotate_element() ---------------------------------------- + + def it_adds_link_metadata_to_an_element_to_help(self): + html_text = """""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + element = Text("aaa") + + e = a._link_annotate_element(element) + + assert e is element + assert e.metadata.link_texts == ["aaa"] + assert e.metadata.link_urls == ["http://eie.io"] + + def and_it_preserves_any_existing_link_metadata_on_the_element(self): + # -- nested anchors shouldn't be possible but easier to test than prove it can't happen -- + html_text = """""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + element = Text("bbb") + element.metadata.link_texts = ["abc"] + element.metadata.link_urls = ["http://abc.com"] + + e = a._link_annotate_element(element) + + assert e is element + assert e.metadata.link_texts == ["abc", "bbb"] + assert e.metadata.link_urls == ["http://abc.com", "http://eie.io"] + + def but_not_when_the_text_is_empty(self): + html_text = """""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + element = Text("") + + e = a._link_annotate_element(element) + + assert e is element + assert e.metadata.link_texts is None + assert e.metadata.link_urls is None + + def and_not_when_there_is_no_url(self): + html_text = """""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + element = Text("zzz") + + e = a._link_annotate_element(element) + + assert e is element + assert e.metadata.link_texts is None + assert e.metadata.link_urls is None + + # -- ._link_text_segment() -------------------------------------------- + + def it_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_help(self): + html_text = """""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + phrase = ( + TextSegment( + "Otherwise you will only ", + { + "emphasized_text_contents": ["Otherwise"], + "emphasized_text_tags": ["i"], + }, + ), + TextSegment( + "see what you were expecting.\n", + { + "emphasized_text_contents": "expecting", + "emphasized_text_tags": "b", + }, + ), + ) + + link_text_segment = a._link_text_segment(phrase) + + assert link_text_segment == TextSegment( + "Otherwise you will only see what you were expecting.\n", + { + "emphasized_text_contents": ["Otherwise", "expecting"], + "emphasized_text_tags": ["i", "b"], + "link_texts": ["Otherwise you will only see what you were expecting."], + "link_urls": ["http://eie.io"], + }, + ) + + @pytest.mark.parametrize("text", ["", " \n \t "]) + def but_not_when_the_text_is_empty_or_whitespace_only(self, text: str): + html_text = """""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + phrase = (TextSegment(text, {}), TextSegment(text, {}), TextSegment(text, {})) + + assert a._link_text_segment(phrase) is None + + def and_not_when_the_anchor_has_no_href_url(self): + html_text = """foobar""" + a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] + phrase = (TextSegment("Otherwise", {}), TextSegment(" you will", {})) + + assert a._link_text_segment(phrase) is 
None class DescribeBold: @@ -595,124 +1377,6 @@ class DescribeRemovedPhrasing: assert text_segment.text == "\n Like vastly, hugely big.\n" -# -- DUAL-ROLE ELEMENTS -------------------------------------------------------------------------- - - -class DescribeAnchor: - """Isolated unit-test suite for `unstructured.partition.html.parser.Anchor`. - - The `Anchor` class is used for `` tags and provides link metadata. - """ - - # -- .is_phrasing ----------------------------------------------------- - - @pytest.mark.parametrize( - ("html_text", "expected_value"), - [ - # -- an empty identifies as phrasing -- - ('', True), - # -- an with text but no children identifies as phrasing -- - ('“O Deep Thought computer," he said,', True), - # -- an with no text and only phrasing children identifies as phrasing -- - ('“O Deep Thought computer,"', True), - # -- an with both text and phrasing children identifies as phrasing -- - ('“O Deep Thought computer,"', True), - # -- but an with a block-item child does not -- - ('
<a><p>“O Deep Thought computer,"</p></a>
    ', False), - # -- and an with both text and a block-item child does not -- - ('“O Deep Thought computer,"
<p>he said,</p></a>
    ', False), - # -- and an with text and both block and phrasing children does not -- - ('“O Deep Thought
    computer," he
    ', False), - ], - ) - def it_determines_whether_it_is_phrasing_dynamically( - self, html_text: str, expected_value: bool - ): - a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] - - assert isinstance(a, Anchor) - assert a.is_phrasing is expected_value - - # -- .iter_elements() ------------------------------------------------- - - def it_can_also_act_as_a_block_item(self): - html_text = """ - - """ - a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] - - elements = a.iter_elements() - - assert [e.text for e in elements] == [ - "O Deep Thought computer, he said,", - "The task we have designed you to perform is this.", - "We want you to tell us.... he paused,", - ] - - # -- .iter_text_segments() -------------------------------------------- - - @pytest.mark.parametrize( - ("html_text", "expected_value"), - [ - # -- produces no text-segment or annotation for anchor.text when there is none -- - ('', []), - # -- but it produces a text-segment for the tail if there is one -- - (' long tail ', [TextSegment(" long tail ", {})]), - # -- produces text-segment but no annotation for anchor.text when it is whitespace -- - (' ', [TextSegment(" ", {})]), - # -- produces text-segment and annotation for anchor text - # -- Note link-texts annotation is whitespace-normalized but text-segment text is not. - ( - ' click here ', - [ - TextSegment( - " click here ", - {"link_texts": ["click here"], "link_urls": ["http://abc.com"]}, - ) - ], - ), - # -- produces text-segment for both text and tail when present -- - ( - ' click here long tail', - [ - TextSegment( - " click here ", - {"link_texts": ["click here"], "link_urls": ["http://abc.com"]}, - ), - TextSegment(" long tail", {}), - ], - ), - # -- nested phrasing inside element is handled as expected -- - ( - '
I am <a href="http://eie.io">one with <i>the</i> Force</a>.
    ', - [ - TextSegment( - "one with the Force", - { - "emphasized_text_contents": ["the"], - "emphasized_text_tags": ["i"], - "link_texts": ["one with the Force"], - "link_urls": ["http://eie.io"], - }, - ), - TextSegment(".", {}), - ], - ), - ], - ) - def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment( - self, html_text: str, expected_value: list[TextSegment] - ): - a = etree.fromstring(html_text, html_parser).xpath(".//a")[0] - assert list(a.iter_text_segments()) == expected_value - - # -- DEFAULT ELEMENT ----------------------------------------------------------------------------- diff --git a/typings/lxml/etree/__init__.pyi b/typings/lxml/etree/__init__.pyi index 9758f62a5..1509ad7c1 100644 --- a/typings/lxml/etree/__init__.pyi +++ b/typings/lxml/etree/__init__.pyi @@ -4,6 +4,7 @@ from __future__ import annotations from ._classlookup import ElementBase as ElementBase from ._classlookup import ElementDefaultClassLookup as ElementDefaultClassLookup +from ._cleanup import strip_elements as strip_elements from ._element import _Element as _Element from ._element import _ElementTree as _ElementTree from ._module_func import fromstring as fromstring diff --git a/typings/lxml/etree/_cleanup.pyi b/typings/lxml/etree/_cleanup.pyi new file mode 100644 index 000000000..29e6bd861 --- /dev/null +++ b/typings/lxml/etree/_cleanup.pyi @@ -0,0 +1,21 @@ +# pyright: reportPrivateUsage=false + +from __future__ import annotations + +from typing import Collection, overload + +from .._types import _ElementOrTree, _TagSelector + +@overload +def strip_elements( + __tree_or_elem: _ElementOrTree, + *tag_names: _TagSelector, + with_tail: bool = True, +) -> None: ... +@overload +def strip_elements( + __tree_or_elem: _ElementOrTree, + __tag: Collection[_TagSelector], + /, + with_tail: bool = True, +) -> None: ... diff --git a/unstructured/__version__.py b/unstructured/__version__.py index cee899622..780e4a22a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.10" # pragma: no cover +__version__ = "0.14.11-dev0" # pragma: no cover diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index 557b6bd3f..a79803969 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -75,10 +75,9 @@ Other background from __future__ import annotations -import itertools from collections import defaultdict, deque from types import MappingProxyType -from typing import Any, Iterable, Iterator, Mapping, NamedTuple, cast +from typing import Any, Iterable, Iterator, Mapping, NamedTuple, Sequence, cast from lxml import etree from typing_extensions import TypeAlias @@ -102,7 +101,7 @@ from unstructured.partition.text_type import ( is_possible_title, is_us_city_state_zip, ) -from unstructured.utils import htmlify_matrix_of_cell_texts +from unstructured.utils import htmlify_matrix_of_cell_texts, lazyproperty # ------------------------------------------------------------------------------------------------ # DOMAIN MODEL @@ -117,14 +116,14 @@ differ between the individual (text-segment) and consolidated (Element) forms. """ -def _consolidate_annotations(text_segments: Iterable[TextSegment]) -> Annotation: +def _consolidate_annotations(annotations: Iterable[Annotation]) -> Annotation: """Combine individual text-segment annotations into an element-level annotation. Sequence is significant. 
""" combined_annotations = cast(defaultdict[str, list[str]], defaultdict(list)) - for ts in text_segments: - for k, v in ts.annotation.items(): + for a in annotations: + for k, v in a.items(): if isinstance(v, list): combined_annotations[k].extend(cast(list[Any], v)) else: @@ -161,6 +160,171 @@ class TextSegment(NamedTuple): annotation: Annotation +Phrase: TypeAlias = Sequence[TextSegment] +"""Contiguous text-segments formed from text and contiguous phrasing. + +These occur within a block element as the element text and contiguous phrasing or the tail and +contiguous phrasing. For example, there are two phrases in this div, one before and one after the +

<p> child element:
+
+    <div>
+      Seagulls gonna come and
+      <p>Poke me in the coconut</p>
+      And they did, they did
+    </div>
+
+The first is `div.text` and the phrasing (text and tail of phrasing elements) that follow it. A
+phrase terminates at a block element (`<p>` in this case) or at the end of the enclosing block (the
+`</div>
    ` in this example). +""" + + +# ------------------------------------------------------------------------------------------------ +# PHRASING ACCUMULATORS +# ------------------------------------------------------------------------------------------------ + + +class _PhraseAccumulator: + """Accumulates sequential `TextSegment`s making them available as iterable on flush(). + + - The accumulator starts empty. + - `.flush()` is a Phrase iterator and generates zero or one Phrase. + - `.flush()` generates zero items when no text-segments have been accumulated + - `flush()` resets the accumulator to its initial empty state. + + So far, phrases are used only by the Anchor class. + """ + + def __init__(self): + self._text_segments: list[TextSegment] = [] + + def add(self, text_segment: TextSegment) -> None: + """Add `text_segment` to this collection.""" + self._text_segments.append(text_segment) + + def flush(self) -> Iterator[Phrase]: + """Generate each of the stored `TextSegment` objects and clears the accumulator.""" + # -- harvest accumulated text-segments and empty the accumulator -- + text_segments = self._text_segments[:] + self._text_segments.clear() + + if not text_segments: + return + + yield tuple(text_segments) + + +class _ElementAccumulator: + """Accumulates sequential `TextSegment`s and forms them into an element on flush(). + + The text segments come from element text or tails and any contiguous phrasing elements that + follow that text or tail. + + - The accumulator starts empty. + - `.flush()` is an element iterator and generates zero or one Element. + - `.flush()` generates zero elements when no text-segments have been accumulated or the ones + that have been accumulated contain only whitespace. + - `flush()` resets the accumulator to its initial empty state. + """ + + def __init__(self, element: etree.ElementBase): + self._element = element + self._text_segments: list[TextSegment] = [] + + def add(self, text_segment: TextSegment) -> None: + """Add `text_segment` to this Element-under-construction.""" + self._text_segments.append(text_segment) + + def flush(self, ElementCls: type[Element] | None) -> Iterator[Element]: + """Generate zero-or-one document-`Element` object and clear the accumulator.""" + # -- normalized-text must be computed before resetting the accumulator -- + normalized_text = self._normalized_text + + # -- harvest accumulated text-segments and empty the accumulator -- + text_segments = self._text_segments[:] + self._text_segments.clear() + + if not text_segments or not normalized_text: + return + + # -- if we don't have a more specific element-class, choose one based on the text -- + if ElementCls is None: + ElementCls = derive_element_type_from_text(normalized_text) + # -- normalized text that contains only a single character is skipped unless it + # -- identifies as a list-item + if ElementCls is None: + return + # -- derived ListItem means text starts with a bullet character that needs removing -- + if ElementCls is ListItem: + normalized_text = clean_bullets(normalized_text) + if not normalized_text: + return + + category_depth = self._category_depth(ElementCls) + + yield ElementCls( + normalized_text, + metadata=ElementMetadata( + **_consolidate_annotations(ts.annotation for ts in text_segments), + category_depth=category_depth, + ), + ) + + def _category_depth(self, ElementCls: type[Element]) -> int | None: + """Not clear on concept. 
Something to do with hierarchy ...""" + if ElementCls is ListItem: + return ( + len([e for e in self._element.iterancestors() if e.tag in ("dl", "ol", "ul")]) + if self._element.tag in ("li", "dd") + else 0 + ) + + if ElementCls is Title: + return ( + int(self._element.tag[1]) - 1 + if self._element.tag in ("h1", "h2", "h3", "h4", "h5", "h6") + else 0 + ) + + return None + + @property + def _normalized_text(self) -> str: + """Consolidate text-segment text values into a single whitespace-normalized string. + + This normalization is suitable for text inside a block element including any segments from + phrasing elements immediately following that text. The spec is: + + - All text segments are concatenated (without adding or removing whitespace) + - Leading and trailing whitespace are removed. + - Each run of whitespace in the string is reduced to a single space. + + For example: + " \n foo bar\nbaz bada \t bing\n " + becomes: + "foo bar baz bada bing" + """ + return " ".join("".join(ts.text for ts in self._text_segments).split()) + + +class _PreElementAccumulator(_ElementAccumulator): + """Accumulator specific to `
<pre>` element, preserves (most) whitespace in normalized text."""
    +
    +    @property
    +    def _normalized_text(self) -> str:
    +        """Consolidate `texts` into a single whitespace-normalized string.
    +
    +        This normalization is specific to the `
<pre>` element. Only a leading and/or trailing
    +        newline is removed. All other whitespace is preserved.
    +        """
    +        text = "".join(ts.text for ts in self._text_segments)
    +
    +        start = 1 if text.startswith("\n") else 0
    +        end = -1 if text.endswith("\n") else len(text)
    +
    +        return text[start:end]
    +
    +
     # ------------------------------------------------------------------------------------------------
     # CUSTOM ELEMENT-CLASSES
     # ------------------------------------------------------------------------------------------------
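A minimal usage sketch of the accumulator contract introduced above (illustrative only, not part of the patch; it mirrors the `_ElementAccumulator` unit tests added here, and every name used is defined in this diff):

    from lxml import etree

    from unstructured.documents.elements import NarrativeText
    from unstructured.partition.html.parser import TextSegment, _ElementAccumulator, html_parser

    # -- an accumulator is bound to the block element whose phrasing run it gathers --
    p = etree.fromstring("<div><p></p></div>", html_parser).xpath(".//p")[0]
    accum = _ElementAccumulator(p)

    # -- text segments buffer until a nested block element interrupts the phrasing run --
    accum.add(TextSegment("Ford... you're turning ", {}))
    accum.add(TextSegment("into a penguin.", {}))

    # -- flush() generates zero-or-one document element and resets the accumulator --
    (element,) = accum.flush(None)
    assert element == NarrativeText("Ford... you're turning into a penguin.")
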
    @@ -195,19 +359,10 @@ class Flow(etree.ElementBase):
                 yield from block_item.iter_elements()
                 yield from self._element_from_text_or_tail(block_item.tail or "", q)
     
    -    def _category_depth(self, ElementCls: type[Element]) -> int | None:
    -        """Not clear on concept. Something to do with hierarchy ..."""
    -        if ElementCls is ListItem:
    -            return (
    -                len([e for e in self.iterancestors() if e.tag in ("dl", "ol", "ul")])
    -                if self.tag in ("li", "dd")
    -                else 0
    -            )
    -
    -        if ElementCls is Title:
    -            return int(self.tag[1]) - 1 if self.tag in ("h1", "h2", "h3", "h4", "h5", "h6") else 0
    -
    -        return None
    +    @lazyproperty
    +    def _element_accum(self) -> _ElementAccumulator:
    +        """Text-segment accumulator suitable for this block-element."""
    +        return _ElementAccumulator(self)
     
         def _element_from_text_or_tail(
             self, text: str, q: deque[Flow | Phrasing], ElementCls: type[Element] | None = None
    @@ -216,37 +371,34 @@ class Flow(etree.ElementBase):
     
             Note this mutates `q` by popping phrasing elements off as they are processed.
             """
    -        text_segments = tuple(self._iter_text_segments(text, q))
    -        normalized_text = " ".join("".join(ts.text for ts in text_segments).split())
    +        element_accum = self._element_accum
     
    -        if not normalized_text:
    -            return
    +        for node in self._iter_text_segments(text, q):
    +            if isinstance(node, TextSegment):
    +                element_accum.add(node)
    +            else:
+                # -- otherwise node is an Element, which terminates any accumulating Element --
    +                yield from element_accum.flush(ElementCls)
    +                yield node
     
    -        # -- if we don't have a more specific element-class, choose one based on the text --
    -        if ElementCls is None:
    -            ElementCls = derive_element_type_from_text(normalized_text)
    -            # -- normalized text that contains only a bullet character is skipped --
    -            if ElementCls is None:
    -                return
    -            # -- derived ListItem means text starts with a bullet character that needs removing --
    -            if ElementCls is ListItem:
    -                normalized_text = clean_bullets(normalized_text)
    -                if not normalized_text:
    -                    return
    +        yield from element_accum.flush(ElementCls)
     
    -        category_depth = self._category_depth(ElementCls)
    +    def _iter_text_segments(
    +        self, text: str, q: deque[Flow | Phrasing]
    +    ) -> Iterator[TextSegment | Element]:
    +        """Generate zero-or-more `TextSegment`s or `Element`s from text and leading phrasing.
     
    -        yield ElementCls(
    -            normalized_text,
    -            metadata=ElementMetadata(
    -                **_consolidate_annotations(text_segments), category_depth=category_depth
    -            ),
    -        )
    +        Note that while this method is named "._iter_text_segments()", it can also generate
    +        `Element` objects when a block item is nested within a phrasing element. This is not
    +        technically valid HTML, but folks write some wacky HTML and the browser is pretty forgiving
+        so we try to do the right thing (what the browser does) when that happens, generally
+        interpreting each nested block as its own paragraph and generating a separate `Element`
+        object for each.
     
    -    def _iter_text_segments(self, text: str, q: deque[Flow | Phrasing]) -> Iterator[TextSegment]:
    -        """Generate zero-or-more `TextSegment`s from text and leading phrasing elements.
    +        This method is used to process the text or tail of a block element, including any phrasing
    +        elements immediately following the text or tail.
     
    -        This is used to process the text or tail of a flow element. For example, this 
<div>:
+        For example, this <div>:
    For a moment, nothing happened. @@ -254,8 +406,13 @@ class Flow(etree.ElementBase): The dolphins had always believed that they were far more intelligent.
    - Should generate three distinct elements, one for each contained line. This method is - invoked to process the first beginning "For a" and the third line beginning "The dolphins". + Should generate three distinct elements: + - One for the div's text "For a " and the phrasing element after it, + - one for the
<p> element, and
+        - one for the tail of the <p>
    and the phrasing element that follows it. + + This method is invoked to process the first line beginning "For a" and the third line + beginning "The dolphins", in two separate calls. Note this method mutates `q` by popping phrasing elements off as they are processed. """ @@ -314,33 +471,10 @@ class Pre(BlockItem): Can only contain phrasing content. """ - def iter_elements(self) -> Iterator[Element]: - """Generate zero or one document element for the entire `
<pre>` element.
    -
    -        Whitespace is preserved just as it appears in the source HTML.
    -        """
    -        pre_text = self.text or ""
    -        # -- this is pretty subtle, but in a browser, if the opening `
<pre>` is immediately
    -        # -- followed by a newline, that newline is removed from the rendered text.
    -        if pre_text.startswith("\n"):
    -            pre_text = pre_text[1:]
    -
    -        text_segments = tuple(self._iter_text_segments(pre_text, deque(self)))
    -        text = "".join(ts.text for ts in text_segments)
    -
    -        # -- also subtle, but in a browser, if the closing `
    ` tag is immediately preceded - # -- by a newline (starts in column 1), that preceding newline is removed too. - if text.endswith("\n"): - text = text[:-1] - - if not text: - return - - ElementCls = derive_element_type_from_text(text) - if not ElementCls: - return - - yield ElementCls(text, metadata=ElementMetadata(**_consolidate_annotations(text_segments))) + @lazyproperty + def _element_accum(self) -> _ElementAccumulator: + """Text-segment accumulator suitable for this block-element.""" + return _PreElementAccumulator(self) class TableBlock(Flow): @@ -404,7 +538,7 @@ class Phrasing(etree.ElementBase): def is_phrasing(self) -> bool: return True - def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]: + def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment | Element]: """Generate text segments for text, children, and tail of this element.""" inside_emphasis = self._inside_emphasis(enclosing_emphasis) @@ -445,14 +579,25 @@ class Phrasing(etree.ElementBase): """ return enclosing_emphasis - def _iter_child_text_segments(self, emphasis: str) -> Iterator[TextSegment]: + def _iter_child_text_segments(self, emphasis: str) -> Iterator[TextSegment | Element]: """Generate zero-or-more text-segments for phrasing children of this element. All generated text segments will be annotated with `emphasis` when it is other than the empty string. """ - for child in self: - yield from child.iter_text_segments(emphasis) + q: deque[Flow | Phrasing] = deque(self) + # -- Recurse into any nested tags. Phrasing children contribute `TextSegment`s to the + # -- stream. Block children contribute document `Element`s. Note however that a phrasing + # -- child can also produce an `Element` from any nested block element. + while q: + child = q.popleft() + if child.is_phrasing: + yield from cast(Phrasing, child).iter_text_segments(emphasis) + else: + yield from cast(Flow, child).iter_elements() + yield from self._iter_text_segments_from_block_tail_and_phrasing( + child.tail or "", q, emphasis + ) def _iter_tail_segment(self, emphasis: str) -> Iterator[TextSegment]: """Generate zero-or-one text-segment for tail of this element. @@ -472,6 +617,150 @@ class Phrasing(etree.ElementBase): if text := self.text: yield TextSegment(text, self._annotation(text, emphasis)) + def _iter_text_segments_from_block_tail_and_phrasing( + self, tail: str, q: deque[Flow | Phrasing], emphasis: str + ) -> Iterator[TextSegment | Element]: + """Generate zero-or-more `TextSegment`s or `Element`s from tail+phrasing of block child. + + When this phrasing element contains a block child (not valid HTML but accepted by + browsers), the tail of that block child and any phrasing elements contiguous with that tail + also need to contribute their text. This method takes care of that job. + + Note this mutates `q` by popping phrasing elements off as they are processed. + """ + if tail: + yield TextSegment(tail, self._annotation(tail, emphasis)) + while q and q[0].is_phrasing: + e = cast(Phrasing, q.popleft()) + yield from e.iter_text_segments(emphasis) + + +class Anchor(Phrasing): + """Custom element-class for `` element. + + Provides link annotations. + """ + + def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment | Element]: + """Generate text segments for contents and tail of this element, when they exist. + + Phrasing is emitted as `TextSegment` objects. 
Any nested block items (not valid HTML but + are accepted by browser so can occur) are emitted as `Element` objects. + + When an anchor contains a nested block element, there can be multiple phrases and/or + elements. Link annotation is only added to the first phrase or element. Otherwise the link + annotation would span multiple document-elements. + """ + q: deque[Phrase | Element] = deque(self._iter_phrases_and_elements(enclosing_emphasis)) + + # -- the first non-whitespace phrase or element gets the link annotation -- + while q: + x = q.popleft() + if isinstance(x, Element): + yield self._link_annotate_element(x) + break + else: + # -- a whitespace-only phrase will not receive the link annotation (no link text) -- + if lts := self._link_text_segment(x): + yield lts + break + else: + yield from x + + # -- whatever phrases or elements remain are emitted without link annotation -- + + while q: + x = q.popleft() + if isinstance(x, Element): + yield x + else: + yield from x + + # -- A tail is emitted when present whether anchor itself was emitted or not -- + yield from self._iter_tail_segment(enclosing_emphasis) + + def _iter_phrases_and_elements(self, emphasis: str) -> Iterator[Phrase | Element]: + """Divide contents (text+children, but not tail) into phrases and document-elements.""" + # -- place child elements in a queue, method calls use some and leave the rest -- + q: deque[Flow | Phrasing] = deque(self) + + yield from self._iter_phrasing(self.text or "", q, emphasis) + + while q: + assert not q[0].is_phrasing + block_item = cast(Flow, q.popleft()) + yield from block_item.iter_elements() + yield from self._iter_phrasing(block_item.tail or "", q, emphasis) + + def _iter_phrasing( + self, text: str, q: deque[Flow | Phrasing], emphasis: str + ) -> Iterator[Phrase | Element]: + """Generate zero-or-more `TextSegment`s or `Element`s from text and leading phrasing. + + Note that while this method is named "._iter_phrasing()", it can also generate `Element` + objects when a block item is nested within a phrasing element. This is not technically + valid HTML, but folks write some wacky HTML and the browser is pretty forgiving so we try + to do the right thing (what the browser does) when that happens, generally interpret each + nested block as its own paragraph and generate a separate `Element` object for each. + + This method is used to process the text or tail of a block element, including any phrasing + elements immediately following the text or tail. + + Note this method mutates `q` by popping phrasing elements off as they are processed. 
+ """ + phrase_accum = _PhraseAccumulator() + + if text: + phrase_accum.add(TextSegment(text, self._annotation(text, emphasis))) + + while q and q[0].is_phrasing: + e = cast(Phrasing, q.popleft()) + for x in e.iter_text_segments(emphasis): + if isinstance(x, TextSegment): + phrase_accum.add(x) + # -- otherwise x is an `Element`, which terminates the accumulating phrase -- + else: + yield from phrase_accum.flush() + yield x + + # -- emit any phrase remaining in accumulator -- + yield from phrase_accum.flush() + + def _link_annotate_element(self, element: Element) -> Element: + """Apply this link's annotation to `element` and return it.""" + link_text = element.text + link_url = self.get("href") + + if not link_text or not link_url: + return element + + element.metadata.link_texts = (element.metadata.link_texts or []) + [link_text] + element.metadata.link_urls = (element.metadata.link_urls or []) + [link_url] + + return element + + def _link_text_segment(self, phrase: Phrase) -> TextSegment | None: + """Consolidate `phrase` into a single text-segment with link annotation. + + Returns None if the phrase contains only whitespace. + """ + consolidated_text = "".join(text_segment.text for text_segment in phrase) + link_text = _normalize_text(consolidated_text) + link_url = self.get("href") + + if not link_text or not link_url: + return None + + # -- the emphasis annotations must come from the individual text segments in the phrase -- + consolidated_annotations = _consolidate_annotations( + ( + {"link_texts": [link_text], "link_urls": [link_url]}, + *(text_segment.annotation for text_segment in phrase), + ) + ) + + return TextSegment(consolidated_text, consolidated_annotations) + class Bold(Phrasing): """Provides annotations for bold/strong text.""" @@ -526,75 +815,6 @@ class RemovedPhrasing(Phrasing): yield from self._iter_tail_segment(enclosing_emphasis) -# -- DUAL-ROLE ELEMENTS -------------------------------------------------------------------------- - - -class Anchor(Phrasing, Flow): - """Custom element-class for `` element. - - Provides link annotations. - """ - - @property - def is_phrasing(self) -> bool: - """False when the `` element contains any block items, True otherwise.""" - return all(e.is_phrasing for e in self) - - def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]: - """Generate text segments for text and tail of this element, when they exist. - - The behavior for an anchor element is slightly different because link annotations are only - added to the text, not the tail. Also an anchor can have no children. - """ - # -- the text of the link is everything inside the `` element, text and child text -- - text_segments = tuple( - itertools.chain( - self._iter_text_segment(enclosing_emphasis), - self._iter_child_text_segments(enclosing_emphasis), - ) - ) - - link_text = "".join("".join(ts.text for ts in text_segments)) - - # -- the link_text and link_url annotation refers to the entire text inside the `` -- - link_text_segment = TextSegment( - link_text, self._link_annotations(link_text, enclosing_emphasis) - ) - - # -- but the emphasis annotations must come from the individual text segments within -- - consolidated_annotations = _consolidate_annotations((link_text_segment, *text_segments)) - - # -- generate at most one text-segment for the `` element, the full enclosed text with - # -- consolidated emphasis and link annotations. 
- if link_text: - yield TextSegment(link_text, consolidated_annotations) - - # -- A tail is emitted when present whether anchor itself was or not -- - yield from self._iter_tail_segment(enclosing_emphasis) - - def _link_annotations(self, text: str, emphasis: str) -> Annotation: - """Link and emphasis annotations that apply to the text of this anchor. - - An anchor element does not add any emphasis but uses any introduced by enclosing elements. - """ - normalized_text = _normalize_text(text) - - if not normalized_text: - return {} - - def iter_annotation_pairs() -> Iterator[tuple[str, Any]]: - # -- emphasis annotation is only added when there is enclosing emphasis -- - if emphasis: - yield "emphasized_text_contents", normalized_text - yield "emphasized_text_tags", emphasis - - if href := self.get("href"): - yield "link_texts", normalized_text - yield "link_urls", href - - return MappingProxyType(dict(iter_annotation_pairs())) - - # -- DEFAULT ELEMENT -----------------------------------------------------------------------------
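As a closing illustration (hedged, not part of the patch): once the new parser is installed, a block element nested inside a phrasing element partitions the way the browser renders it. The `html_text` below is my own example; `html_parser` and `iter_elements()` come from this diff, and the concrete element types are derived from the text by `derive_element_type_from_text()`, so treat them as indicative:

    from lxml import etree

    from unstructured.partition.html.parser import html_parser

    # -- a block <p> nested inside phrasing <b>: invalid HTML, but seen in the wild --
    html_text = "<div><b>Ford... <p>you're turning</p> into a penguin.</b></div>"
    div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

    # -- the parser breaks the phrasing run at the <p> boundary instead of raising, so
    # -- this emits one element before the block, one for it, and one after it
    for element in div.iter_elements():
        print(type(element).__name__, repr(element.text))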