mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-31 21:10:43 +00:00

Fixes #3666 --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
1447 lines
55 KiB
Python
1447 lines
55 KiB
Python
# pyright: reportPrivateUsage=false
|
|
# pyright: reportUnknownArgumentType=false
|
|
|
|
"""Test suite for `unstructured.partition.html.parser` module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections import deque
|
|
|
|
import pytest
|
|
from lxml import etree
|
|
|
|
from unstructured.documents.elements import Address, Element, ListItem, NarrativeText, Text, Title
|
|
from unstructured.partition.html.parser import (
|
|
Annotation,
|
|
DefaultElement,
|
|
Flow,
|
|
Phrasing,
|
|
RemovedPhrasing,
|
|
TextSegment,
|
|
_consolidate_annotations,
|
|
_ElementAccumulator,
|
|
_normalize_text,
|
|
_PhraseAccumulator,
|
|
_PreElementAccumulator,
|
|
html_parser,
|
|
)
|
|
|
|
# -- MODULE-LEVEL FUNCTIONS ----------------------------------------------------------------------
|
|
|
|
# -- _consolidate_annotations() ------------------
|
|
|
|
|
|
def it_consolidates_annotations_from_multiple_text_segments():
    """`_consolidate_annotations()` merges per-segment annotations into one mapping of lists."""
    segment_annotations = [
        {
            "link_texts": "Ford Prefect",
            "link_url": "https://wikipedia/Ford_Prefect",
            "emphasized_text_contents": "Ford Prefect",
            "emphasized_text_tags": "b",
        },
        {
            "emphasized_text_contents": "alien encounter",
            "emphasized_text_tags": "bi",
        },
    ]

    consolidated = _consolidate_annotations(segment_annotations)

    assert consolidated == {
        # -- a key present in several annotations collects all of its values in a list --
        "emphasized_text_contents": ["Ford Prefect", "alien encounter"],
        "emphasized_text_tags": ["b", "bi"],
        # -- a key present in only one annotation still gets a (single-item) list --
        "link_texts": ["Ford Prefect"],
        "link_url": ["https://wikipedia/Ford_Prefect"],
    }
    # -- the consolidated mapping itself rejects item assignment --
    with pytest.raises(TypeError, match="object does not support item assignment"):
        consolidated["new_key"] = "foobar"  # pyright: ignore[reportIndexIssue]
    # -- its list values, however, can still be mutated in place --
    consolidated["emphasized_text_tags"].append("xyz")
    assert consolidated["emphasized_text_tags"] == ["b", "bi", "xyz"]
|
|
|
|
|
|
# -- _normalize_text() ---------------------------
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected_value"),
    [
        # -- text that is already normalized comes back unchanged --
        ("iterators allow", "iterators allow"),
        # -- a newline counts as whitespace like any other --
        ("algorithm\nto be", "algorithm to be"),
        (" separated\n from ", "separated from"),
        ("\n container\n details\n ", "container details"),
        (
            "\n iterators allow \n algorithm to be \nexpressed without container \nnoise",
            "iterators allow algorithm to be expressed without container noise",
        ),
    ],
)
def test_normalize_text_produces_normalized_text(text: str, expected_value: str):
    """`_normalize_text()` produces the expected normalized form for each sample text."""
    normalized = _normalize_text(text)

    assert normalized == expected_value
|
|
|
|
|
|
# -- PHRASING ACCUMULATORS -----------------------------------------------------------------------
|
|
|
|
|
|
class Describe_PhraseAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._PhraseAccumulator`."""

    def it_is_empty_on_construction(self):
        """Flushing a brand-new accumulator yields nothing."""
        accumulator = _PhraseAccumulator()

        phrases = accumulator.flush()

        with pytest.raises(StopIteration):
            next(phrases)

    # -- .add() -----------------------------------------------------------

    def it_accumulates_text_segments(self):
        """Added text-segments come back from `.flush()` as a single phrase, in order added."""
        accumulator = _PhraseAccumulator()

        accumulator.add(TextSegment("Ford... you're turning ", {}))
        accumulator.add(TextSegment("into a penguin.", {}))
        phrases = accumulator.flush()

        # -- the one phrase is a tuple of the text-segments --
        assert next(phrases) == (
            TextSegment("Ford... you're turning ", {}),
            TextSegment("into a penguin.", {}),
        )

        # -- and there are no further phrases --
        with pytest.raises(StopIteration):
            next(phrases)

    # -- .flush() ---------------------------------------------------------

    def it_generates_zero_phrases_on_flush_when_empty(self):
        """`.flush()` on an accumulator with no text-segments produces an exhausted iterator."""
        accumulator = _PhraseAccumulator()

        phrases = accumulator.flush()

        with pytest.raises(StopIteration):
            next(phrases)
|
|
|
|
|
|
class Describe_ElementAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._ElementAccumulator`."""

    def it_is_empty_on_construction(self, html_element: etree.ElementBase):
        """Flushing a brand-new accumulator yields no document-elements."""
        accumulator = _ElementAccumulator(html_element)

        elements = accumulator.flush(None)

        with pytest.raises(StopIteration):
            next(elements)

    # -- .add() -----------------------------------------------------------

    def it_accumulates_text_segments(self, html_element: etree.ElementBase):
        """Text of the added text-segments is joined into a single document-element."""
        accumulator = _ElementAccumulator(html_element)

        accumulator.add(TextSegment("Ford... you're turning ", {}))
        accumulator.add(TextSegment("into a penguin.", {}))
        elements = accumulator.flush(None)

        assert next(elements) == NarrativeText("Ford... you're turning into a penguin.")

        # -- exactly one element is generated --
        with pytest.raises(StopIteration):
            next(elements)

    # -- .flush() ---------------------------------------------------------

    def it_generates_zero_elements_when_empty(self, html_element: etree.ElementBase):
        accumulator = _ElementAccumulator(html_element)

        elements = accumulator.flush(None)

        with pytest.raises(StopIteration):
            next(elements)

    def and_it_generates_zero_elements_when_all_its_text_segments_are_whitespace_only(
        self, html_element: etree.ElementBase
    ):
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment(" \n \t \n", {}))
        accumulator.add(TextSegment(" \n", {}))

        with pytest.raises(StopIteration):
            next(accumulator.flush(None))

    def and_it_generates_zero_elements_when_there_is_only_one_non_whitespace_character(
        self, html_element: etree.ElementBase
    ):
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment(" \n \t \n", {}))
        accumulator.add(TextSegment(" X \n", {}))

        with pytest.raises(StopIteration):
            next(accumulator.flush(None))

    def it_normalizes_the_text_of_its_text_segments_on_flush(self, html_element: etree.ElementBase):
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment(" \n Ford... you're \t turning\n", {}))
        accumulator.add(TextSegment("into a penguin.\n", {}))

        # -- single-item unpacking also verifies exactly one element was generated --
        [element] = accumulator.flush(None)

        assert element.text == "Ford... you're turning into a penguin."

    def it_creates_a_document_element_of_the_specified_type(self, html_element: etree.ElementBase):
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment("Ford... you're turning into a penguin.", {}))

        [element] = accumulator.flush(ListItem)

        assert element == ListItem("Ford... you're turning into a penguin.")

    def but_it_derives_the_element_type_from_the_text_when_none_is_specified(
        self, html_element: etree.ElementBase
    ):
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment("Ford... you're turning into a penguin.", {}))

        [element] = accumulator.flush(None)

        assert element == NarrativeText("Ford... you're turning into a penguin.")

    def it_removes_an_explicit_leading_bullet_character_from_a_list_item(
        self, html_element: etree.ElementBase
    ):
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment("* turning into a penguin", {}))

        [element] = accumulator.flush(None)

        assert element == ListItem("turning into a penguin")

    def it_applies_category_depth_metadata(self):
        """An element flushed as `Title` from an `<h3>` gets `category_depth` 2 in its metadata."""
        root = etree.fromstring("<h3>About fish</h3>", html_parser)
        h3 = root.xpath(".//h3")[0]
        accumulator = _ElementAccumulator(h3)
        accumulator.add(TextSegment("Thanks for all those!", {}))

        [element] = accumulator.flush(Title)

        # -- `element_id` is dropped before comparing; it must be present though --
        element_dict = element.to_dict()
        del element_dict["element_id"]
        assert element_dict == {
            "metadata": {"category_depth": 2},
            "text": "Thanks for all those!",
            "type": "Title",
        }

    def and_it_consolidates_annotations_into_metadata(self, html_element: etree.ElementBase):
        """Annotations on individual text-segments show up as list-valued element metadata."""
        accumulator = _ElementAccumulator(html_element)
        bold_annotation = {
            "emphasized_text_contents": "Ford",
            "emphasized_text_tags": "b",
        }
        italic_annotation = {
            "emphasized_text_contents": "penguin",
            "emphasized_text_tags": "i",
        }
        accumulator.add(TextSegment("\n Ford...", bold_annotation))
        accumulator.add(TextSegment(" you're turning into a ", {}))
        accumulator.add(TextSegment("penguin", italic_annotation))
        accumulator.add(TextSegment(".\n", {}))

        [element] = accumulator.flush(NarrativeText)

        element_dict = element.to_dict()
        del element_dict["element_id"]
        assert element_dict == {
            "metadata": {
                "emphasized_text_contents": ["Ford", "penguin"],
                "emphasized_text_tags": ["b", "i"],
            },
            "text": "Ford... you're turning into a penguin.",
            "type": "NarrativeText",
        }

    # -- ._category_depth() -----------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "tag", "ElementCls", "expected_value"),
        [
            ("<p>Ford... you're turning into a penguin. Stop it.<p>", "p", Text, None),
            ("<p>* thanks for all the fish.</p>", "p", ListItem, 0),
            ("<li>thanks for all the fish.</li>", "li", ListItem, 0),
            ("<ul><li>So long</li><li>and thanks for all the fish.</li></ul>", "li", ListItem, 1),
            ("<dl><dd>So long<ol><li>and thanks for the fish.</li></ol></ul>", "li", ListItem, 2),
            ("<p>Examples</p>", "p", Title, 0),
            ("<h1>Examples</h1>", "h1", Title, 0),
            ("<h2>Examples</h2>", "h2", Title, 1),
            ("<h3>Examples</h3>", "h3", Title, 2),
            ("<h4>Examples</h4>", "h4", Title, 3),
            ("<h5>Examples</h5>", "h5", Title, 4),
            ("<h6>Examples</h6>", "h6", Title, 5),
        ],
    )
    def it_computes_the_category_depth_to_help(
        self, html_text: str, tag: str, ElementCls: type[Element], expected_value: int | None
    ):
        """`._category_depth()` is checked on the first `tag` element parsed from `html_text`."""
        root = etree.fromstring(html_text, html_parser)
        target = root.xpath(f".//{tag}")[0]
        accumulator = _ElementAccumulator(target)

        assert accumulator._category_depth(ElementCls) == expected_value

    # -- ._normalized_text ------------------------------------------------

    def it_computes_the_normalized_text_of_its_text_segments_to_help(
        self, html_element: etree.ElementBase
    ):
        accumulator = _ElementAccumulator(html_element)
        accumulator.add(TextSegment(" \n Ford... you're \t turning\n", {}))
        accumulator.add(TextSegment("into a penguin.\n", {}))

        assert accumulator._normalized_text == "Ford... you're turning into a penguin."

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def html_element(self) -> etree.ElementBase:
        """An empty `<p>` element parsed with the module's `html_parser`."""
        root = etree.fromstring("<p/>", html_parser)
        return root.xpath(".//p")[0]
|
|
|
|
|
|
class Describe_PreElementAccumulator:
    """Isolated unit-test suite for `unstructured.partition.html.parser._PreElementAccumulator`."""

    def it_computes_the_normalized_text_of_its_text_segments_to_help(self):
        root = etree.fromstring("<p/>", html_parser)
        accumulator = _PreElementAccumulator(root.xpath(".//p")[0])
        accumulator.add(TextSegment("\n\n", {}))
        accumulator.add(TextSegment(" The panel lit up\n", {}))
        accumulator.add(TextSegment(" with the words 'Please do not press\n", {}))
        accumulator.add(TextSegment(" this button again'\n\n", {}))

        # -- only one newline is removed from each end; interior whitespace is kept as-is --
        expected_text = (
            "\n"
            " The panel lit up\n"
            " with the words 'Please do not press\n"
            " this button again'\n"
        )
        assert accumulator._normalized_text == expected_text
|
|
|
|
|
|
# -- FLOW (BLOCK-ITEM) ELEMENTS ------------------------------------------------------------------
|
|
|
|
|
|
class DescribeFlow:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Flow`.

    The `Flow` class provides most behaviors for flow (block-level) elements.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_knows_it_is_NOT_a_phrasing_element(self):
        p = etree.fromstring("<p>Hello</p>", html_parser).xpath(".//p")[0]

        assert isinstance(p, Flow)
        assert p.is_phrasing is False

    # -- .iter_elements() -------------------------------------------------

    def it_generates_the_document_elements_from_the_Flow_element(self):
        """Phrasing siblings of child block elements are processed with text or tail.

        In the general case, a Flow element can contain text, phrasing content, and child flow
        elements.

        Each of the five lines inside the `<div>` in this example is a "paragraph" and gives rise
        to a distinct document-element.
        """
        html_text = """
          <div>
            Text of div <b>with <i>hierarchical</i>\nphrasing</b> content before first block item
            <p>Click <a href="http://blurb.io">here</a> to see the blurb for this block item. </p>
            tail of block item <b>with <i>hierarchical</i> phrasing </b> content
            <p>second block item</p>
            tail of block item <b>with <i> hierarchical </i></b> phrasing content
          </div>
        """
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div.iter_elements()

        # -- 1. text of the div together with its leading phrasing content --
        e = next(elements)
        assert e == Text("Text of div with hierarchical phrasing content before first block item")
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        # -- 2. first child block item --
        e = next(elements)
        assert e == NarrativeText("Click here to see the blurb for this block item.")
        assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
        # -- 3. tail of first block item and the phrasing content that follows it --
        e = next(elements)
        assert e == Text("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["with", "hierarchical", "phrasing"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }
        # -- 4. second child block item --
        e = next(elements)
        assert e == Text("second block item")
        assert e.metadata.to_dict() == {}
        # -- 5. tail of second block item and the phrasing content that follows it --
        e = next(elements)
        assert e == Text("tail of block item with hierarchical phrasing content")
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["with", "hierarchical"],
            "emphasized_text_tags": ["b", "bi"],
        }
        # -- and then the iterator is exhausted; nothing is bound here because `next()` is
        # -- expected to raise --
        with pytest.raises(StopIteration):
            next(elements)

    # -- ._element_from_text_or_tail() ------------------------------------

    def it_assembles_text_and_tail_document_elements_to_help(self):
        """Text and tails and their phrasing content are both processed the same way."""
        html_text = "<div>The \n Roman <b>poet <i> Virgil</i> gave</b> his <q>pet</q> fly</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        e = next(elements)
        # -- element text is normalized --
        assert e == Text("The Roman poet Virgil gave his pet fly")
        # -- individual annotations are consolidated --
        assert e.metadata.to_dict() == {
            "emphasized_text_contents": ["poet", "Virgil", "gave"],
            "emphasized_text_tags": ["b", "bi", "b"],
        }

    def but_it_does_not_generate_a_document_element_when_only_whitespace_is_contained(self):
        html_text = "<div> <b> \n <i> \n </i> </b> <q> \n </q> \n </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Text)

        with pytest.raises(StopIteration):
            next(elements)

    def it_uses_the_specified_element_class_to_form_the_document_element(self):
        html_text = "<div>\n The line-storm clouds fly tattered and swift\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div), Address)

        e = next(elements)
        assert e == Address("The line-storm clouds fly tattered and swift")
        assert e.metadata.to_dict() == {}
        with pytest.raises(StopIteration):
            next(elements)

    def and_it_selects_the_document_element_class_by_analyzing_the_text_when_not_specified(self):
        html_text = "<div>\n The line-storm clouds fly tattered and swift,\n</div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        assert next(elements) == NarrativeText("The line-storm clouds fly tattered and swift,")

    def but_it_does_not_generate_a_document_element_when_only_a_bullet_character_is_contained(self):
        html_text = "<div> * </div>"
        div = etree.fromstring(html_text, html_parser).xpath(".//div")[0]

        elements = div._element_from_text_or_tail(div.text, deque(div))

        with pytest.raises(StopIteration):
            next(elements)

    # -- ._iter_text_segments() -------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            (  # -- text with no phrasing --
                "<p>Ford... you're turning into a penguin.<p>",
                [("Ford... you're turning into a penguin.", {})],
            ),
            (  # -- text with phrasing --
                "<p>Ford... <b>you're turning</b> into\na <i>penguin</i>.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're turning",
                        {"emphasized_text_contents": "you're turning", "emphasized_text_tags": "b"},
                    ),
                    (" into\na ", {}),
                    (
                        "penguin",
                        {"emphasized_text_contents": "penguin", "emphasized_text_tags": "i"},
                    ),
                    (".", {}),
                ],
            ),
            (  # -- text with nested phrasing --
                "<p>Ford... <b>you're <i>turning</i></b> into a penguin.<p>",
                [
                    ("Ford... ", {}),
                    (
                        "you're ",
                        {"emphasized_text_contents": "you're", "emphasized_text_tags": "b"},
                    ),
                    (
                        "turning",
                        {"emphasized_text_contents": "turning", "emphasized_text_tags": "bi"},
                    ),
                    (" into a penguin.", {}),
                ],
            ),
        ],
    )
    def it_recursively_generates_text_segments_from_text_and_phrasing_to_help(
        self, html_text: str, expected_value: list[tuple[str, Annotation]]
    ):
        """Each expected item is a `(text, annotation)` pair compared against a text-segment."""
        p = etree.fromstring(html_text, html_parser).xpath(".//p")[0]

        text_segments = list(p._iter_text_segments(p.text, deque(p)))

        assert text_segments == expected_value
|
|
|
|
|
|
class DescribePre:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Pre`.

    The `Pre` class specializes behaviors for the `<pre>` (pre-formatted text) element.
    """

    def it_preserves_the_whitespace_of_its_phrasing_only_contents(self):
        """A `<pre>` element can contain only phrasing content."""
        html_text = (
            "<pre>\n"
            " The Answer to the Great Question... Of Life, the Universe and Everything...\n"
            " Is... Forty-two, said Deep Thought, with infinite majesty and calm.\n"
            "</pre>\n"
        )
        root = etree.fromstring(html_text, html_parser)
        pre = root.xpath(".//pre")[0]

        elements = pre.iter_elements()

        # -- interior newline and leading spaces survive in the element text --
        assert next(elements) == Text(
            " The Answer to the Great Question... Of Life, the Universe and Everything...\n"
            " Is... Forty-two, said Deep Thought, with infinite majesty and calm."
        )
        # -- and only the one element is generated --
        with pytest.raises(StopIteration):
            next(elements)

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- a newline in the 0th position of pre.text is dropped --
            ("<pre>\n foo </pre>", " foo "),
            # -- but not when preceded by any other whitespace --
            ("<pre> \n foo </pre>", " \n foo "),
            # -- and only one is dropped --
            ("<pre>\n\n foo </pre>", "\n foo "),
            # -- a newline in the -1th position is dropped --
            ("<pre> foo \n</pre>", " foo "),
            # -- but not when followed by any other whitespace --
            ("<pre> foo \n </pre>", " foo \n "),
            # -- and only one is dropped --
            ("<pre> foo \n\n</pre>", " foo \n"),
            # -- when a newline is in both positions, both are dropped --
            ("<pre>\n foo \n</pre>", " foo "),
            # -- but neither is dropped when not at the absolute edge --
            ("<pre> \n foo \n </pre>", " \n foo \n "),
        ],
    )
    def but_it_strips_a_single_leading_or_trailing_newline(
        self, html_text: str, expected_value: str
    ):
        """Content starts on next line when opening `<pre>` tag is followed by a newline."""
        root = etree.fromstring(html_text, html_parser)
        pre = root.xpath(".//pre")[0]

        element = next(pre.iter_elements())

        assert element.text == expected_value

    def it_assigns_emphasis_and_link_metadata_when_contents_have_those_phrasing_elements(self):
        html_text = '<pre>You\'re <b>turning</b> into a <a href="http://eie.io">penguin</a>.</pre>'
        root = etree.fromstring(html_text, html_parser)
        pre = root.xpath(".//pre")[0]

        element = next(pre.iter_elements())

        assert element.text == "You're turning into a penguin."
        metadata = element.metadata
        assert metadata.emphasized_text_contents == ["turning"]
        assert metadata.emphasized_text_tags == ["b"]
        assert metadata.link_texts == ["penguin"]
        assert metadata.link_urls == ["http://eie.io"]
|
|
|
|
|
|
class DescribeRemovedBlock:
    """Isolated unit-test suite for `unstructured.partition.html.parser.RemovedBlock`.

    This class is used for block level items we want to skip like `<hr/>` and `<figure>`.
    """

    def it_is_skipped_during_parsing(self):
        """Only the `<p>` contributes an element; the `<hr/>` and `<figure>` contribute none."""
        html_text = """
          <div>
            <hr/>
            <figure>
              <img src="/media/cc0-images/elephant-660-480.jpg" alt="Elephant at sunset" />
              <figcaption>An elephant at sunset</figcaption>
            </figure>
            <p>Content we want.</p>
          </div>
        """
        root = etree.fromstring(html_text, html_parser)
        div = root.xpath(".//div")[0]

        elements = list(div.iter_elements())

        assert elements == [NarrativeText("Content we want.")]
|
|
|
|
|
|
# -- PHRASING (INLINE) ELEMENTS ------------------------------------------------------------------
|
|
|
|
|
|
class DescribePhrasing:
    """Isolated unit-test suite for `unstructured.partition.html.parser.Phrasing`.

    The `Phrasing` class provides most behaviors for phrasing (inline) elements.
    """

    # -- .is_phrasing -----------------------------------------------------

    def it_knows_it_is_a_phrasing_element(self):
        root = etree.fromstring("<b>Hello</b>", html_parser)
        b = root.xpath(".//b")[0]

        assert isinstance(b, Phrasing)
        assert b.is_phrasing is True

    # -- .iter_text_segments() --------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- an empty element produces no text segments --
            ("<code></code>", []),
            # -- element text produces one segment --
            ("<data> foo </data>", [(" foo ", {})]),
            # -- element tail produces one segment --
            ("<dfn/> bar ", [(" bar ", {})]),
            # -- element descendants each produce one segment --
            ("<kbd><mark>foo <meter>bar</meter></mark></kbd>", [("foo ", {}), ("bar", {})]),
            # -- and any combination produces a segment for each text, child, and tail --
            (
                "<kbd> <mark>foo <meter>bar</meter> baz</mark> </kbd>",
                [
                    (" ", {}),
                    ("foo ", {}),
                    ("bar", {}),
                    (" baz", {}),
                    (" ", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_for_its_text_and_children_and_tail(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        # -- the element under test is the first child of the parsed `<body>` --
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        assert list(phrasing.iter_text_segments()) == expected_value

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- Phrasing with nested block but no text or tail produces only element for block --
            ("<strong><p>aaa</p></strong>", [Text("aaa")]),
            # -- Phrasing with text produces annotated text-segment for the text --
            (
                "<strong>aaa<p>bbb</p></strong>",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    Text("bbb"),
                ],
            ),
            # -- Phrasing with tail produces annotated text-segment for the tail --
            (
                "<strong><p>aaa</p>bbb</strong>",
                [
                    Text("aaa"),
                    TextSegment(
                        "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
            # -- Phrasing with text, nested block, and tail produces all three --
            (
                "<strong>aaa<p>bbb</p>ccc</strong>",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    Text("bbb"),
                    TextSegment(
                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
        ],
    )
    def but_it_can_also_generate_an_element_when_it_has_a_nested_block_element(
        self, html_text: str, expected_value: list[TextSegment | Element]
    ):
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        assert list(phrasing.iter_text_segments()) == expected_value

    # -- ._annotation() ---------------------------------------------------

    def it_forms_its_annotations_from_emphasis(self):
        """The annotation carries the emphasis and the text without surrounding whitespace."""
        root = etree.fromstring("<cite/>", html_parser)
        cite = root.xpath(".//cite")[0]

        annotation = cite._annotation("\n foobar\n ", "bi")

        assert annotation == {
            "emphasized_text_contents": "foobar",
            "emphasized_text_tags": "bi",
        }

    @pytest.mark.parametrize("text", ["", "\n \t "])
    def but_not_when_text_is_empty_or_whitespace(self, text: str):
        root = etree.fromstring("<cite/>", html_parser)
        cite = root.xpath(".//cite")[0]

        assert cite._annotation(text, "bi") == {}

    def and_not_when_there_is_no_emphasis(self):
        root = etree.fromstring("<cite/>", html_parser)
        cite = root.xpath(".//cite")[0]

        assert cite._annotation("foobar", "") == {}

    # -- ._inside_emphasis() ----------------------------------------------

    @pytest.mark.parametrize("enclosing_emphasis", ["", "b", "bi"])
    def it_uses_the_enclosing_emphasis_as_the_default_inside_emphasis(
        self, enclosing_emphasis: str
    ):
        """Inside emphasis is applied to text inside the phrasing element (but not its tail).

        The `._inside_emphasis()` method is overridden by Bold and Italic classes which add their
        specific emphasis characters.
        """
        root = etree.fromstring("<abbr/>", html_parser)
        abbr = root.xpath(".//abbr")[0]

        assert abbr._inside_emphasis(enclosing_emphasis) == enclosing_emphasis

    # -- ._iter_child_text_segments() -------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "expected_value"),
        [
            # -- a phrasing element with no children produces no text segments
            # -- (element text is handled elsewhere)
            ("<abbr>aaa</abbr>", []),
            # -- child phrasing element produces text-segment for its text --
            ("<bdi>x<bdo>bbb</bdo></bdi>", [TextSegment("bbb", {})]),
            # -- and also for its tail when it has one --
            ("<bdi>x<bdo>bbb</bdo>ccc</bdi>", [TextSegment("bbb", {}), TextSegment("ccc", {})]),
            # -- nested phrasing recursively each produce a segment for text and tail, in order --
            (
                "<big>xxx<cite>aaa<code>bbb<data>ccc</data>ddd</code>eee</cite>fff</big>",
                [
                    TextSegment("aaa", {}),
                    TextSegment("bbb", {}),
                    TextSegment("ccc", {}),
                    TextSegment("ddd", {}),
                    TextSegment("eee", {}),
                    TextSegment("fff", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_for_its_children_and_their_tails(
        self, html_text: str, expected_value: list[TextSegment]
    ):
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        # -- empty string means no emphasis applies inside the element --
        assert list(phrasing._iter_child_text_segments("")) == expected_value

    @pytest.mark.parametrize(
        ("html_text", "inside_emphasis", "expected_value"),
        [
            # -- a phrasing element with no block children produces no elements --
            ("<dfn></dfn>", "", []),
            # -- a child block element produces an element --
            ("<kbd><p>aaa</p></kbd>", "", [Text("aaa")]),
            # -- a child block element with a tail also produces a text-segment for the tail --
            ("<kbd><p>aaa</p>bbb</kbd>", "", [Text("aaa"), TextSegment("bbb", {})]),
            # -- and also text-segments for phrasing following the tail --
            (
                "<kbd><p>aaa</p>bbb<mark>ccc</mark>ddd</kbd>",
                "",
                [
                    Text("aaa"),
                    TextSegment("bbb", {}),
                    TextSegment("ccc", {}),
                    TextSegment("ddd", {}),
                ],
            ),
            # -- and emphasis is applied before and after block-item --
            (
                "<strong><q>aaa</q><p>bbb</p>ccc<s>ddd</s>eee</strong>",
                "b",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    Text("bbb"),
                    TextSegment(
                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
        ],
    )
    def and_it_generates_elements_for_its_block_children(
        self, html_text: str, inside_emphasis: str, expected_value: list[TextSegment | Element]
    ):
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]

        assert list(phrasing._iter_child_text_segments(inside_emphasis)) == expected_value

    # -- ._iter_text_segments_from_block_tail_and_phrasing() --------------

    @pytest.mark.parametrize(
        ("html_text", "emphasis", "expected_value"),
        [
            # -- no tail and no contiguous phrasing produces no text-segments --
            ("<cite><p/></cite>", "", []),
            # -- tail produces a text-segment --
            ("<cite><p/>aaa</cite>", "", [TextSegment("aaa", {})]),
            # -- contiguous phrasing produces a text-segment --
            ("<cite><p/><s>aaa</s></cite>", "", [TextSegment("aaa", {})]),
            # -- tail of contiguous phrasing also produces a text-segment --
            ("<bdi><p/><s>aaa</s>bbb</bdi>", "", [TextSegment("aaa", {}), TextSegment("bbb", {})]),
            # -- nested phrasing produces a text-segment --
            (
                "<sub><p/>aaa<s>bbb<q>ccc</q>ddd</s>eee</sub>",
                "",
                [
                    TextSegment("aaa", {}),
                    TextSegment("bbb", {}),
                    TextSegment("ccc", {}),
                    TextSegment("ddd", {}),
                    TextSegment("eee", {}),
                ],
            ),
            # -- and emphasis is added to each text-segment when specified --
            (
                "<strong><p/>aaa<s>bbb<i>ccc</i>ddd</s>eee</strong>",
                "b",
                [
                    TextSegment(
                        "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "bi"}
                    ),
                    TextSegment(
                        "ddd", {"emphasized_text_contents": "ddd", "emphasized_text_tags": "b"}
                    ),
                    TextSegment(
                        "eee", {"emphasized_text_contents": "eee", "emphasized_text_tags": "b"}
                    ),
                ],
            ),
            # -- a block item nested in contiguous phrasing produces an Element --
            (
                "<cite><p/>aaa<abbr>bbb<p>ccc</p>ddd</abbr>eee</cite>",
                "",
                [
                    TextSegment("aaa", {}),
                    TextSegment("bbb", {}),
                    Text("ccc"),
                    TextSegment("ddd", {}),
                    TextSegment("eee", {}),
                ],
            ),
        ],
    )
    def it_generates_text_segments_from_the_tail_and_contiguous_phrasing(
        self, html_text: str, emphasis: str, expected_value: list[TextSegment | Element]
    ):
        body = etree.fromstring(html_text, html_parser).xpath(".//body")[0]
        phrasing = body[0]
        # -- the method is given the tail of the child `<p>` (empty string when it has none) and
        # -- a queue of the children after the first one --
        block = phrasing.xpath("./p")[0]
        block_tail = block.tail or ""
        following = deque(phrasing[1:])

        text_segments = list(
            phrasing._iter_text_segments_from_block_tail_and_phrasing(
                block_tail, following, emphasis
            )
        )

        assert text_segments == expected_value
|
|
|
|
|
|
class DescribeAnchor:
    """Unit tests exercising `unstructured.partition.html.parser.Anchor` in isolation.

    `Anchor` is the element class applied to `<a>` tags; it is the source of link metadata.
    """

    # -- .iter_text_segments() --------------------------------------------

    @pytest.mark.parametrize(
        ("html_text", "emphasis", "expected_value"),
        [
            # -- an anchor with no text yields neither a text-segment nor an annotation --
            ('<a href="http://abc.com"></a>', "", []),
            # -- although its tail, when present, still yields a text-segment --
            ('<a href="http://abc.com"></a> long tail ', "", [TextSegment(" long tail ", {})]),
            # -- whitespace-only anchor text yields a text-segment without an annotation --
            ('<a href="http://abc.com"> </a>', "", [TextSegment(" ", {})]),
            # -- real anchor text yields an annotated text-segment; the `link_texts`
            # -- value is whitespace-normalized while the text-segment text is left as-is
            (
                '<a href="http://abc.com"> click here </a>',
                "",
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    )
                ],
            ),
            # -- text and tail each get their own text-segment when both are present --
            (
                '<a href="http://abc.com"> click here </a> long tail',
                "",
                [
                    TextSegment(
                        " click here ",
                        {"link_texts": ["click here"], "link_urls": ["http://abc.com"]},
                    ),
                    TextSegment(" long tail", {}),
                ],
            ),
            # -- phrasing nested within the <a> element is folded into the link segment --
            (
                '<p>I am <a href="http://eie.io">one <u>with<i> the</i></u> Force</a>.</p>',
                "",
                [
                    TextSegment(
                        "one with the Force",
                        {
                            "emphasized_text_contents": ["the"],
                            "emphasized_text_tags": ["i"],
                            "link_texts": ["one with the Force"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(".", {}),
                ],
            ),
            # -- emphasis inherited from an enclosing element lands on every segment --
            (
                '<p>I am <strong><a href="http://eie.io">one with</a> the Force.</strong></p>',
                "b",
                [
                    TextSegment(
                        "one with",
                        {
                            "emphasized_text_contents": ["one with"],
                            "emphasized_text_tags": ["b"],
                            "link_texts": ["one with"],
                            "link_urls": ["http://eie.io"],
                        },
                    ),
                    TextSegment(
                        " the Force.",
                        {
                            "emphasized_text_contents": "the Force.",
                            "emphasized_text_tags": "b",
                        },
                    ),
                ],
            ),
        ],
    )
    def it_generates_link_annotated_text_segments_for_its_text_and_a_tail_text_segment(
        self, html_text: str, emphasis: str, expected_value: list[TextSegment]
    ):
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        generated = list(anchor.iter_text_segments(emphasis))

        assert generated == expected_value

    def it_generates_enclosed_block_items_as_separate_elements(self):
        html_text = """<a href="http://eie.io">I am <p>one with</p> the Force.</a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        # -- the phrase before the block item is link-annotated; the one after it is not --
        expected = [
            TextSegment(
                "I am ",
                {
                    "emphasized_text_contents": ["I am"],
                    "emphasized_text_tags": ["b"],
                    "link_texts": ["I am"],
                    "link_urls": ["http://eie.io"],
                },
            ),
            Text("one with"),
            TextSegment(
                " the Force.",
                {
                    "emphasized_text_contents": "the Force.",
                    "emphasized_text_tags": "b",
                },
            ),
        ]

        generated = list(anchor.iter_text_segments("b"))

        assert generated == expected

    def and_it_annotates_first_enclosed_block_Element_when_no_non_whitespace_phrase_appears_first(
        self,
    ):
        html_text = """<a href="http://eie.io"> \n <p>I am one with</p> the Force.</a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        generated = list(anchor.iter_text_segments("i"))

        assert generated == [
            TextSegment(" \n ", {}),
            NarrativeText("I am one with"),
            TextSegment(
                " the Force.",
                {
                    "emphasized_text_contents": "the Force.",
                    "emphasized_text_tags": "i",
                },
            ),
        ]
        # -- the link metadata ends up on the block element rather than on a text-segment --
        block_element = generated[1]
        assert block_element.metadata.link_texts == ["I am one with"]
        assert block_element.metadata.link_urls == ["http://eie.io"]

    # -- ._iter_phrases_and_elements() ------------------------------------

    def it_divides_the_anchor_contents_but_not_tail_into_phrases_and_elements(self):
        html_text = """
<a href="http://eie.io">But always <p>see first.</p> Otherwise you </a> will only see
"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        generated = list(anchor._iter_phrases_and_elements(emphasis=""))

        assert generated == [
            (TextSegment("But always ", {}),),
            NarrativeText("see first."),
            (TextSegment(" Otherwise you ", {}),),
        ]

    # -- ._iter_phrasing() ------------------------------------------------

    def it_generates_zero_items_when_both_text_and_q_are_empty(self):
        html_text = """<a href="http://eie.io"></a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        with pytest.raises(StopIteration):
            next(anchor._iter_phrasing(text="", q=deque(), emphasis=""))

    def it_generates_a_phrase_when_only_text_is_present(self):
        html_text = """<a href="http://eie.io">\n But always see first.\n</a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        generated = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert generated == [(TextSegment("\n But always see first.\n", {}),)]

    def and_it_generates_a_phrase_when_that_text_is_followed_by_a_phrasing_element(self):
        html_text = """<a href="http://eie.io">But always <b>see <i>first</i></b>. Otherwise</a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        # -- text, nested phrasing and trailing text all belong to one phrase --
        expected_phrase = (
            TextSegment("But always ", {}),
            TextSegment(
                "see ",
                {
                    "emphasized_text_contents": "see",
                    "emphasized_text_tags": "b",
                },
            ),
            TextSegment(
                "first",
                {
                    "emphasized_text_contents": "first",
                    "emphasized_text_tags": "bi",
                },
            ),
            TextSegment(". Otherwise", {}),
        )

        generated = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert generated == [expected_phrase]

    def it_ends_the_phrase_at_the_end_of_the_element(self):
        html_text = """<a href="http://eie.io">But always see first.</a> Otherwise you will """
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        generated = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert generated == [(TextSegment("But always see first.", {}),)]

    def but_it_ends_at_a_block_element_if_one_occurs_first(self):
        html_text = """<a href="http://eie.io">But always see first. <p>Otherwise you </p> </a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]

        generated = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert generated == [(TextSegment("But always see first. ", {}),)]

    def it_generates_an_element_for_a_block_item_nested_inside_phrasing(self):
        html_text = """
<a href="http://eie.io">But <strong>always <p>see first.</p>Otherwise</strong> you </a>
"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        # -- the nested block item splits the surrounding phrasing into two phrases --
        leading_phrase = (
            TextSegment("But ", {}),
            TextSegment(
                "always ",
                {
                    "emphasized_text_contents": "always",
                    "emphasized_text_tags": "b",
                },
            ),
        )
        trailing_phrase = (
            TextSegment(
                "Otherwise",
                {
                    "emphasized_text_contents": "Otherwise",
                    "emphasized_text_tags": "b",
                },
            ),
            TextSegment(" you ", {}),
        )

        generated = list(anchor._iter_phrasing(text=anchor.text, q=deque(anchor), emphasis=""))

        assert generated == [leading_phrase, NarrativeText("see first."), trailing_phrase]

    # -- ._link_annotate_element() ----------------------------------------

    def it_adds_link_metadata_to_an_element_to_help(self):
        html_text = """<a href="http://eie.io"></a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        element = Text("aaa")

        annotated = anchor._link_annotate_element(element)

        # -- the very same element object comes back, now carrying link metadata --
        assert annotated is element
        assert annotated.metadata.link_texts == ["aaa"]
        assert annotated.metadata.link_urls == ["http://eie.io"]

    def and_it_preserves_any_existing_link_metadata_on_the_element(self):
        # -- nested anchors shouldn't occur, but testing is easier than proving they can't --
        html_text = """<a href="http://eie.io"></a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        element = Text("bbb")
        element.metadata.link_texts = ["abc"]
        element.metadata.link_urls = ["http://abc.com"]

        annotated = anchor._link_annotate_element(element)

        assert annotated is element
        assert annotated.metadata.link_texts == ["abc", "bbb"]
        assert annotated.metadata.link_urls == ["http://abc.com", "http://eie.io"]

    def but_not_when_the_text_is_empty(self):
        html_text = """<a href="http://eie.io"/>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        element = Text("")

        annotated = anchor._link_annotate_element(element)

        assert annotated is element
        assert annotated.metadata.link_texts is None
        assert annotated.metadata.link_urls is None

    def and_not_when_there_is_no_url(self):
        html_text = """<a/>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        element = Text("zzz")

        annotated = anchor._link_annotate_element(element)

        assert annotated is element
        assert annotated.metadata.link_texts is None
        assert annotated.metadata.link_urls is None

    # -- ._link_text_segment() --------------------------------------------

    def it_consolidates_a_phrase_into_a_single_link_annotated_TextSegment_to_help(self):
        html_text = """<a href="http://eie.io"></a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        phrase = (
            TextSegment(
                "Otherwise you will only ",
                {
                    "emphasized_text_contents": ["Otherwise"],
                    "emphasized_text_tags": ["i"],
                },
            ),
            TextSegment(
                "see what you were expecting.\n",
                {
                    "emphasized_text_contents": "expecting",
                    "emphasized_text_tags": "b",
                },
            ),
        )
        # -- segment texts are concatenated and their annotations merged with the link --
        expected = TextSegment(
            "Otherwise you will only see what you were expecting.\n",
            {
                "emphasized_text_contents": ["Otherwise", "expecting"],
                "emphasized_text_tags": ["i", "b"],
                "link_texts": ["Otherwise you will only see what you were expecting."],
                "link_urls": ["http://eie.io"],
            },
        )

        consolidated = anchor._link_text_segment(phrase)

        assert consolidated == expected

    @pytest.mark.parametrize("text", ["", " \n \t "])
    def but_not_when_the_text_is_empty_or_whitespace_only(self, text: str):
        html_text = """<a href="http://eie.io"></a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        phrase = (TextSegment(text, {}), TextSegment(text, {}), TextSegment(text, {}))

        consolidated = anchor._link_text_segment(phrase)

        assert consolidated is None

    def and_not_when_the_anchor_has_no_href_url(self):
        html_text = """<a>foobar</a>"""
        root = etree.fromstring(html_text, html_parser)
        anchor = root.xpath(".//a")[0]
        phrase = (TextSegment("Otherwise", {}), TextSegment(" you will", {}))

        consolidated = anchor._link_text_segment(phrase)

        assert consolidated is None
|
|
|
|
|
|
class DescribeBold:
    """Unit tests exercising `unstructured.partition.html.parser.Bold` in isolation.

    `Bold` is the element class applied to `<b>` and `<strong>` tags; it contributes emphasis
    metadata.
    """

    def it_annotates_its_text_segment_with_bold_emphasis(self):
        root = etree.fromstring("<b>rhombus</b>", html_parser)
        bold = root.xpath(".//b")[0]

        segments = bold.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert segment_text == "rhombus"
        assert segment_annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }

    def and_its_children_are_also_annotated_with_bold_emphasis(self):
        root = etree.fromstring("<b>rhombus <i>pentagon</i></b>", html_parser)
        bold = root.xpath(".//b")[0]

        segments = bold.iter_text_segments()

        # -- first the element's own text --
        segment_text, segment_annotation = next(segments)
        assert segment_text == "rhombus "
        assert segment_annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }
        # -- then the nested <i>, whose tags combine bold and italic --
        segment_text, segment_annotation = next(segments)
        assert segment_text == "pentagon"
        assert segment_annotation == {
            "emphasized_text_contents": "pentagon",
            "emphasized_text_tags": "bi",
        }

    def but_not_its_tail(self):
        root = etree.fromstring("<b>rhombus</b> pentagon", html_parser)
        bold = root.xpath(".//b")[0]

        segments = bold.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert segment_text == "rhombus"
        assert segment_annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "b",
        }
        # -- the tail lies outside the <b> element so it carries no annotation --
        segment_text, segment_annotation = next(segments)
        assert segment_text == " pentagon"
        assert segment_annotation == {}
|
|
|
|
|
|
class DescribeItalic:
    """Unit tests exercising `unstructured.partition.html.parser.Italic` in isolation.

    `Italic` is the element class applied to `<i>` and `<em>` tags; it contributes emphasis
    metadata.
    """

    def it_annotates_its_text_segment_with_italic_emphasis(self):
        root = etree.fromstring("<i>rhombus</i>", html_parser)
        italic = root.xpath(".//i")[0]

        segments = italic.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert segment_text == "rhombus"
        assert segment_annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }

    def and_its_children_are_also_annotated_with_italic_emphasis(self):
        root = etree.fromstring("<em>rhombus <b>pentagon</b></em>", html_parser)
        italic = root.xpath(".//em")[0]

        segments = italic.iter_text_segments()

        # -- first the element's own text --
        segment_text, segment_annotation = next(segments)
        assert segment_text == "rhombus "
        assert segment_annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }
        # -- then the nested <b>, whose tags combine bold and italic --
        segment_text, segment_annotation = next(segments)
        assert segment_text == "pentagon"
        assert segment_annotation == {
            "emphasized_text_contents": "pentagon",
            "emphasized_text_tags": "bi",
        }

    def but_not_its_tail(self):
        root = etree.fromstring("<i>rhombus</i> pentagon", html_parser)
        italic = root.xpath(".//i")[0]

        segments = italic.iter_text_segments()

        segment_text, segment_annotation = next(segments)
        assert segment_text == "rhombus"
        assert segment_annotation == {
            "emphasized_text_contents": "rhombus",
            "emphasized_text_tags": "i",
        }
        # -- the tail lies outside the <i> element so it carries no annotation --
        segment_text, segment_annotation = next(segments)
        assert segment_text == " pentagon"
        assert segment_annotation == {}
|
|
|
|
|
|
class DescribeLineBreak:
    """Unit tests exercising `unstructured.partition.html.parser.LineBreak` in isolation.

    `LineBreak` backs `<br/>` elements. Its only special behavior is contributing whitespace so
    that phrasing butted up tight against both sides of the `<br/>` is not joined together, such
    that `abc<br/>def` becomes "abc def" rather than "abcdef".
    """

    def it_adds_a_newline_in_its_place(self):
        html_text = "<cite>spaceships of the<br/>Vogon Constructor Fleet</cite>"
        root = etree.fromstring(html_text, html_parser)
        cite = root.xpath(".//cite")[0]

        texts = [segment.text for segment in cite.iter_text_segments()]

        # -- the <br/> shows up as its own newline text-segment --
        assert texts == ["spaceships of the", "\n", "Vogon Constructor Fleet"]
        # -- which normalizes to a single space once the texts are joined --
        joined = "".join(texts)
        assert _normalize_text(joined) == "spaceships of the Vogon Constructor Fleet"
|
|
|
|
|
|
class DescribeRemovedPhrasing:
    """Unit tests exercising `unstructured.partition.html.parser.RemovedPhrasing` in isolation.

    `RemovedPhrasing` backs phrasing elements such as `<label>` that are to be skipped together
    with everything they enclose. The tail of such an element is still emitted.
    """

    def it_behaves_like_an_empty_element(self):
        html_text = (
            "<div>\n"
            " <label>Space<p>is big</p>, <b>mind-bogglingly</b> big.</label>\n"
            " Like vastly, hugely big.\n"
            "</div>"
        )
        root = etree.fromstring(html_text, html_parser)
        label = root.xpath(".//label")[0]

        # -- exactly one text-segment is expected; unpacking fails on any other count --
        segments = list(label.iter_text_segments())
        (only_segment,) = segments

        assert isinstance(label, RemovedPhrasing)
        assert label.is_phrasing is True
        assert only_segment.text == "\n Like vastly, hugely big.\n"
|
|
|
|
|
|
# -- DEFAULT ELEMENT -----------------------------------------------------------------------------
|
|
|
|
|
|
class DescribeDefaultElement:
|
|
"""Isolated unit-test suite for `unstructured.partition.html.parser.DefaultElement`.
|
|
|
|
Used for any element we haven't assigned a custom element-class to. This prominently includes
|
|
any non-HTML elements that can be embedded in the HTML.
|
|
|
|
It identifies as phrasing but it can behave as either a block-item or phrasing. Its behavior
|
|
is a combination of RemovedBlock and RemovedPhrasing. Namely, it iterates zero elements and only
|
|
iterates a text-segment for its tail.
|
|
"""
|
|
|
|
# -- .is_phrasing -----------------------------------------------------
|
|
|
|
def it_identifies_as_a_phrasing_element(self):
    # -- `<foobar>` is not an HTML tag, so the parser falls back to `DefaultElement` --
    root = etree.fromstring("<foobar>Vogon</foobar>", html_parser)
    unknown = root.xpath(".//foobar")[0]

    assert isinstance(unknown, DefaultElement)
    assert unknown.is_phrasing is True
|
|
|
|
# -- .iter_elements() -------------------------------------------------
|
|
|
|
def it_generates_zero_elements_as_a_block_item(self):
    """Not expected to be called in practice; covered anyway as belts and suspenders."""
    html_text = "<foobar>Space<p>is big</p>, <b>mind-bogglingly</b> big.</foobar>"
    root = etree.fromstring(html_text, html_parser)
    unknown = root.xpath(".//foobar")[0]

    generated = unknown.iter_elements()

    # -- the very first `next()` must find the iterator already exhausted --
    with pytest.raises(StopIteration):
        next(generated)
|
|
|
|
# -- .iter_text_segments() --------------------------------------------
|
|
|
|
def it_generates_its_tail_but_no_inner_text_segments_when_called_like_phrasing(self):
    html_text = (
        "<div>\n"
        " O Deep Thought computer, he said,\n"
        " <foobar>Vogon Constructor Fleet</foobar>\n"
        " The task we have designed you to perform is this.\n"
        " <p>We want you to tell us.... he paused,</p>\n"
        "</div>"
    )
    root = etree.fromstring(html_text, html_parser)
    unknown = root.xpath(".//foobar")[0]

    texts = [segment.text for segment in unknown.iter_text_segments()]

    # -- only the tail following `</foobar>` appears; the enclosed text is dropped --
    assert texts == ["\n The task we have designed you to perform is this.\n "]
|
|
|
|
def and_it_behaves_like_an_empty_phrasing_element_inside_a_block_element(self):
    html_text = (
        "<div>\n"
        " O Deep Thought computer, he said,\n"
        " <foobar>Vogon Constructor Fleet</foobar>\n"
        " The task we have designed you to perform is this.\n"
        " <p>We want you to tell us.... he paused,</p>\n"
        "</div>"
    )
    root = etree.fromstring(html_text, html_parser)
    div = root.xpath(".//div")[0]

    texts = [element.text for element in div.iter_elements()]

    # -- text on either side of `<foobar>` is joined; the `<foobar>` content is absent --
    expected_texts = [
        "O Deep Thought computer, he said, The task we have designed you to perform is this.",
        "We want you to tell us.... he paused,",
    ]
    assert texts == expected_texts
|