rfctr(html): refine HTML parser (#3351)
**Note** This refines the new HTML parser but _does not install it_. That is why no changes to ingest test expectations or other unit tests are required here. Installing the new parser will happen in the next PR #3218.

**Summary** The initial version of the parser (purposely) raised on a block element nested inside a phrasing element. While such nesting is not valid according to the HTML Standard, it is accepted by the browser and does happen in the wild. The refinements here handle this situation the way the browser does: phrasing is broken at the block-element boundaries and started up again after the block element. Unfortunately this adds complexity to the parser, but it makes the parser robust against pretty much any HTML we're likely to encounter, and it partitions that HTML consistently with how it would be rendered in the browser.
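For concreteness, a hedged sketch of the intended behavior (illustrative, not captured output; `partition_html` is the existing public entry point, and the new parser only takes effect once it is installed in the follow-on PR):

```python
from unstructured.partition.html import partition_html

# A block element (<p>) nested inside a phrasing element (<b>). This is invalid
# per the HTML Standard but accepted by browsers; the refined parser now breaks
# the phrasing run at the <p> boundary instead of raising.
html = "<div><b>For a <p>moment</p> nothing happened.</b></div>"

for element in partition_html(text=html):
    print(element)

# Expected shape of the output (illustrative): three elements, split where the
# browser would break the rendering:
#   For a
#   moment
#   nothing happened.
```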
This commit is contained in:
parent 7b25dfc337
commit 00e1d5c05b

CHANGELOG.md (17)
@@ -1,12 +1,21 @@
+## 0.14.11-dev0
+
+### Enhancements
+
+* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore the element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
+
+### Features
+
+### Fixes
+
 ## 0.14.10

 ### Enhancements

-* **Update unstructured-client dependency** Change unstructured-client dependency pin back to
-  greater than min version and updated tests that were failing given the update.
+* **Update unstructured-client dependency** Change unstructured-client dependency pin back to greater than min version and updated tests that were failing given the update.
 * **`.doc` files are now supported in the `arm64` image.** `libreoffice24` is added to the `arm64` image, meaning `.doc` files are now supported. We have follow-on work planned to investigate adding `.ppt` support for `arm64` as well.
-* Add table detection metrics: recall, precision and f1
-* Remove unused _with_spans metrics
+* **Add table detection metrics: recall, precision and f1.**
+* **Remove unused _with_spans metrics.**

 ### Features
File diff suppressed because it is too large
@@ -4,6 +4,7 @@ from __future__ import annotations

 from ._classlookup import ElementBase as ElementBase
 from ._classlookup import ElementDefaultClassLookup as ElementDefaultClassLookup
+from ._cleanup import strip_elements as strip_elements
 from ._element import _Element as _Element
 from ._element import _ElementTree as _ElementTree
 from ._module_func import fromstring as fromstring
typings/lxml/etree/_cleanup.pyi (new file, 21)
@@ -0,0 +1,21 @@
+# pyright: reportPrivateUsage=false
+
+from __future__ import annotations
+
+from typing import Collection, overload
+
+from .._types import _ElementOrTree, _TagSelector
+
+@overload
+def strip_elements(
+    __tree_or_elem: _ElementOrTree,
+    *tag_names: _TagSelector,
+    with_tail: bool = True,
+) -> None: ...
+@overload
+def strip_elements(
+    __tree_or_elem: _ElementOrTree,
+    __tag: Collection[_TagSelector],
+    /,
+    with_tail: bool = True,
+) -> None: ...
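The stub above types the real `lxml.etree.strip_elements` API. For reference, a typical call looks like this (standard lxml usage, shown here only to motivate the two overloads):

```python
from lxml import etree

root = etree.fromstring("<div><script>alert(1)</script><p>keep me</p></div>")

# Remove all <script> elements; with_tail=False preserves any tail text.
etree.strip_elements(root, "script", with_tail=False)

assert etree.tostring(root) == b"<div><p>keep me</p></div>"
```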
@@ -1 +1 @@
-__version__ = "0.14.10"  # pragma: no cover
+__version__ = "0.14.11-dev0"  # pragma: no cover
@@ -75,10 +75,9 @@ Other background

 from __future__ import annotations

-import itertools
 from collections import defaultdict, deque
 from types import MappingProxyType
-from typing import Any, Iterable, Iterator, Mapping, NamedTuple, cast
+from typing import Any, Iterable, Iterator, Mapping, NamedTuple, Sequence, cast

 from lxml import etree
 from typing_extensions import TypeAlias
@@ -102,7 +101,7 @@ from unstructured.partition.text_type import (
     is_possible_title,
     is_us_city_state_zip,
 )
-from unstructured.utils import htmlify_matrix_of_cell_texts
+from unstructured.utils import htmlify_matrix_of_cell_texts, lazyproperty

 # ------------------------------------------------------------------------------------------------
 # DOMAIN MODEL
@@ -117,14 +116,14 @@ differ between the individual (text-segment) and consolidated (Element) forms.
 """


-def _consolidate_annotations(text_segments: Iterable[TextSegment]) -> Annotation:
+def _consolidate_annotations(annotations: Iterable[Annotation]) -> Annotation:
     """Combine individual text-segment annotations into an element-level annotation.

     Sequence is significant.
     """
     combined_annotations = cast(defaultdict[str, list[str]], defaultdict(list))
-    for ts in text_segments:
-        for k, v in ts.annotation.items():
+    for a in annotations:
+        for k, v in a.items():
             if isinstance(v, list):
                 combined_annotations[k].extend(cast(list[Any], v))
             else:
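The consolidation rule is easiest to see in a hedged sketch (values illustrative; list values extend the combined list, and the `else` branch that falls outside this hunk presumably appends scalar values):

```python
# Illustrative only -- not part of this diff.
annotations = [
    {"link_texts": ["penguin"], "link_urls": ["https://example.com/a"]},
    {"emphasized_text_contents": ["did"], "emphasized_text_tags": ["b"]},
    {"link_texts": ["coconut"], "link_urls": ["https://example.com/b"]},
]

# _consolidate_annotations(annotations) would combine key-wise, preserving order:
# {
#     "link_texts": ["penguin", "coconut"],
#     "link_urls": ["https://example.com/a", "https://example.com/b"],
#     "emphasized_text_contents": ["did"],
#     "emphasized_text_tags": ["b"],
# }
```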
@@ -161,6 +160,171 @@ class TextSegment(NamedTuple):
     annotation: Annotation


+Phrase: TypeAlias = Sequence[TextSegment]
+"""Contiguous text-segments formed from text and contiguous phrasing.
+
+These occur within a block element as the element text and contiguous phrasing, or the tail and
+contiguous phrasing. For example, there are two phrases in this div, one before and one after the
+<p> child element:
+
+    <div>
+        Seagulls <b>gonna <i>come</i></b> and
+        <p>Poke me in the coconut</p>
+        And they <b>did</b>, they <i>did</i>
+    </div>
+
+The first is `div.text` and the phrasing (text and tail of phrasing elements) that follows it. A
+phrase terminates at a block element (`<p>` in this case) or at the end of the enclosing block
+(the `</div>` in this example).
+"""
+
+
+# ------------------------------------------------------------------------------------------------
+# PHRASING ACCUMULATORS
+# ------------------------------------------------------------------------------------------------
+
+
+class _PhraseAccumulator:
+    """Accumulates sequential `TextSegment`s, making them available as an iterable on flush().
+
+    - The accumulator starts empty.
+    - `.flush()` is a Phrase iterator and generates zero or one Phrase.
+    - `.flush()` generates zero items when no text-segments have been accumulated.
+    - `.flush()` resets the accumulator to its initial empty state.
+
+    So far, phrases are used only by the Anchor class.
+    """
+
+    def __init__(self):
+        self._text_segments: list[TextSegment] = []
+
+    def add(self, text_segment: TextSegment) -> None:
+        """Add `text_segment` to this collection."""
+        self._text_segments.append(text_segment)
+
+    def flush(self) -> Iterator[Phrase]:
+        """Generate the stored `TextSegment`s as a single Phrase and clear the accumulator."""
+        # -- harvest accumulated text-segments and empty the accumulator --
+        text_segments = self._text_segments[:]
+        self._text_segments.clear()
+
+        if not text_segments:
+            return
+
+        yield tuple(text_segments)
+
+
+class _ElementAccumulator:
+    """Accumulates sequential `TextSegment`s and forms them into an element on flush().
+
+    The text segments come from element text or tails and any contiguous phrasing elements that
+    follow that text or tail.
+
+    - The accumulator starts empty.
+    - `.flush()` is an element iterator and generates zero or one Element.
+    - `.flush()` generates zero elements when no text-segments have been accumulated or the ones
+      that have been accumulated contain only whitespace.
+    - `.flush()` resets the accumulator to its initial empty state.
+    """
+
+    def __init__(self, element: etree.ElementBase):
+        self._element = element
+        self._text_segments: list[TextSegment] = []
+
+    def add(self, text_segment: TextSegment) -> None:
+        """Add `text_segment` to this Element-under-construction."""
+        self._text_segments.append(text_segment)
+
+    def flush(self, ElementCls: type[Element] | None) -> Iterator[Element]:
+        """Generate zero-or-one document-`Element` object and clear the accumulator."""
+        # -- normalized-text must be computed before resetting the accumulator --
+        normalized_text = self._normalized_text
+
+        # -- harvest accumulated text-segments and empty the accumulator --
+        text_segments = self._text_segments[:]
+        self._text_segments.clear()
+
+        if not text_segments or not normalized_text:
+            return
+
+        # -- if we don't have a more specific element-class, choose one based on the text --
+        if ElementCls is None:
+            ElementCls = derive_element_type_from_text(normalized_text)
+            # -- normalized text that contains only a single character is skipped unless it
+            # -- identifies as a list-item --
+            if ElementCls is None:
+                return
+            # -- derived ListItem means text starts with a bullet character that needs removing --
+            if ElementCls is ListItem:
+                normalized_text = clean_bullets(normalized_text)
+                if not normalized_text:
+                    return
+
+        category_depth = self._category_depth(ElementCls)
+
+        yield ElementCls(
+            normalized_text,
+            metadata=ElementMetadata(
+                **_consolidate_annotations(ts.annotation for ts in text_segments),
+                category_depth=category_depth,
+            ),
+        )
+
+    def _category_depth(self, ElementCls: type[Element]) -> int | None:
+        """Not clear on concept. Something to do with hierarchy ..."""
+        if ElementCls is ListItem:
+            return (
+                len([e for e in self._element.iterancestors() if e.tag in ("dl", "ol", "ul")])
+                if self._element.tag in ("li", "dd")
+                else 0
+            )
+
+        if ElementCls is Title:
+            return (
+                int(self._element.tag[1]) - 1
+                if self._element.tag in ("h1", "h2", "h3", "h4", "h5", "h6")
+                else 0
+            )
+
+        return None
+
+    @property
+    def _normalized_text(self) -> str:
+        """Consolidate text-segment text values into a single whitespace-normalized string.
+
+        This normalization is suitable for text inside a block element, including any segments
+        from phrasing elements immediately following that text. The spec is:
+
+        - All text segments are concatenated (without adding or removing whitespace).
+        - Leading and trailing whitespace are removed.
+        - Each run of whitespace in the string is reduced to a single space.
+
+        For example:
+            " \n foo bar\nbaz bada \t bing\n "
+        becomes:
+            "foo bar baz bada bing"
+        """
+        return " ".join("".join(ts.text for ts in self._text_segments).split())
+
+
+class _PreElementAccumulator(_ElementAccumulator):
+    """Accumulator specific to `<pre>` element; preserves (most) whitespace in normalized text."""
+
+    @property
+    def _normalized_text(self) -> str:
+        """Consolidate `texts` into a single whitespace-normalized string.
+
+        This normalization is specific to the `<pre>` element. Only a leading and/or trailing
+        newline is removed. All other whitespace is preserved.
+        """
+        text = "".join(ts.text for ts in self._text_segments)
+
+        start = 1 if text.startswith("\n") else 0
+        end = -1 if text.endswith("\n") else len(text)
+
+        return text[start:end]
+
+
 # ------------------------------------------------------------------------------------------------
 # CUSTOM ELEMENT-CLASSES
 # ------------------------------------------------------------------------------------------------
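The accumulator protocol that the rewritten `Flow` and `Anchor` methods below rely on, as a hedged usage sketch (illustrative, not part of the diff):

```python
# _PhraseAccumulator collects TextSegments and emits them as one Phrase (a tuple).
accum = _PhraseAccumulator()
accum.add(TextSegment("Seagulls ", {}))
accum.add(TextSegment("gonna come", {"emphasized_text_tags": ["b"]}))

phrases = list(accum.flush())
assert phrases == [
    (
        TextSegment("Seagulls ", {}),
        TextSegment("gonna come", {"emphasized_text_tags": ["b"]}),
    )
]

# flush() also resets the accumulator, so a second flush generates nothing.
assert list(accum.flush()) == []
```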
@@ -195,19 +359,10 @@ class Flow(etree.ElementBase):
             yield from block_item.iter_elements()
             yield from self._element_from_text_or_tail(block_item.tail or "", q)

-    def _category_depth(self, ElementCls: type[Element]) -> int | None:
-        """Not clear on concept. Something to do with hierarchy ..."""
-        if ElementCls is ListItem:
-            return (
-                len([e for e in self.iterancestors() if e.tag in ("dl", "ol", "ul")])
-                if self.tag in ("li", "dd")
-                else 0
-            )
-
-        if ElementCls is Title:
-            return int(self.tag[1]) - 1 if self.tag in ("h1", "h2", "h3", "h4", "h5", "h6") else 0
-
-        return None
+    @lazyproperty
+    def _element_accum(self) -> _ElementAccumulator:
+        """Text-segment accumulator suitable for this block-element."""
+        return _ElementAccumulator(self)

     def _element_from_text_or_tail(
         self, text: str, q: deque[Flow | Phrasing], ElementCls: type[Element] | None = None
@@ -216,37 +371,34 @@ class Flow(etree.ElementBase):

         Note this mutates `q` by popping phrasing elements off as they are processed.
         """
-        text_segments = tuple(self._iter_text_segments(text, q))
-        normalized_text = " ".join("".join(ts.text for ts in text_segments).split())
-
-        if not normalized_text:
-            return
-
-        # -- if we don't have a more specific element-class, choose one based on the text --
-        if ElementCls is None:
-            ElementCls = derive_element_type_from_text(normalized_text)
-            # -- normalized text that contains only a bullet character is skipped --
-            if ElementCls is None:
-                return
-            # -- derived ListItem means text starts with a bullet character that needs removing --
-            if ElementCls is ListItem:
-                normalized_text = clean_bullets(normalized_text)
-                if not normalized_text:
-                    return
-
-        category_depth = self._category_depth(ElementCls)
-
-        yield ElementCls(
-            normalized_text,
-            metadata=ElementMetadata(
-                **_consolidate_annotations(text_segments), category_depth=category_depth
-            ),
-        )
+        element_accum = self._element_accum
+
+        for node in self._iter_text_segments(text, q):
+            if isinstance(node, TextSegment):
+                element_accum.add(node)
+            else:
+                # -- otherwise node is an Element, which terminates any accumulating Element --
+                yield from element_accum.flush(ElementCls)
+                yield node
+
+        yield from element_accum.flush(ElementCls)

-    def _iter_text_segments(self, text: str, q: deque[Flow | Phrasing]) -> Iterator[TextSegment]:
-        """Generate zero-or-more `TextSegment`s from text and leading phrasing elements.
-
-        This is used to process the text or tail of a flow element. For example, this <div>:
+    def _iter_text_segments(
+        self, text: str, q: deque[Flow | Phrasing]
+    ) -> Iterator[TextSegment | Element]:
+        """Generate zero-or-more `TextSegment`s or `Element`s from text and leading phrasing.
+
+        Note that while this method is named "._iter_text_segments()", it can also generate
+        `Element` objects when a block item is nested within a phrasing element. This is not
+        technically valid HTML, but folks write some wacky HTML and the browser is pretty
+        forgiving, so we try to do the right thing (what the browser does) when that happens:
+        generally, interpret each nested block as its own paragraph and generate a separate
+        `Element` object for each.
+
+        This method is used to process the text or tail of a block element, including any
+        phrasing elements immediately following the text or tail.
+
+        For example, this <div>:

             <div>
                 For a <b>moment, <i>nothing</i> happened.</b>
@@ -254,8 +406,13 @@ class Flow(etree.ElementBase):
                 The dolphins had always believed that <em>they</em> were far more intelligent.
             </div>

-        Should generate three distinct elements, one for each contained line. This method is
-        invoked to process the first beginning "For a" and the third line beginning "The dolphins".
+        Should generate three distinct elements:
+        - One for the div's text "For a " and the <b> phrasing element after it,
+        - one for the <p> element, and
+        - one for the tail of the <p> and the phrasing <em> element that follows it.
+
+        This method is invoked to process the first line beginning "For a" and the third line
+        beginning "The dolphins", in two separate calls.

         Note this method mutates `q` by popping phrasing elements off as they are processed.
         """
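A hedged sketch of the split the docstring above describes (element types illustrative; these lines would plausibly classify as `NarrativeText`):

```python
# Illustrative only -- expected grouping for the <div> example above:
#   NarrativeText("For a moment, nothing happened.")
#   NarrativeText("Then, after a second or so, nothing continued to happen.")
#   NarrativeText("The dolphins had always believed that they were far more intelligent.")
```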
@@ -314,33 +471,10 @@ class Pre(BlockItem):
     Can only contain phrasing content.
     """

-    def iter_elements(self) -> Iterator[Element]:
-        """Generate zero or one document element for the entire `<pre>` element.
-
-        Whitespace is preserved just as it appears in the source HTML.
-        """
-        pre_text = self.text or ""
-
-        # -- this is pretty subtle, but in a browser, if the opening `<pre>` is immediately
-        # -- followed by a newline, that newline is removed from the rendered text.
-        if pre_text.startswith("\n"):
-            pre_text = pre_text[1:]
-
-        text_segments = tuple(self._iter_text_segments(pre_text, deque(self)))
-        text = "".join(ts.text for ts in text_segments)
-
-        # -- also subtle, but in a browser, if the closing `</pre>` tag is immediately preceded
-        # -- by a newline (starts in column 1), that preceding newline is removed too.
-        if text.endswith("\n"):
-            text = text[:-1]
-
-        if not text:
-            return
-
-        ElementCls = derive_element_type_from_text(text)
-        if not ElementCls:
-            return
-
-        yield ElementCls(text, metadata=ElementMetadata(**_consolidate_annotations(text_segments)))
+    @lazyproperty
+    def _element_accum(self) -> _ElementAccumulator:
+        """Text-segment accumulator suitable for this block-element."""
+        return _PreElementAccumulator(self)


 class TableBlock(Flow):
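The whitespace rule `_PreElementAccumulator` implements matches browser rendering of `<pre>`: only a single leading and/or trailing newline is trimmed and all other whitespace is preserved. A hedged sketch of just that trimming logic:

```python
# Illustrative only -- mirrors the slicing in _PreElementAccumulator._normalized_text.
text = "\ndef foo():\n    return 42\n"

start = 1 if text.startswith("\n") else 0
end = -1 if text.endswith("\n") else len(text)

assert text[start:end] == "def foo():\n    return 42"
```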
@@ -404,7 +538,7 @@ class Phrasing(etree.ElementBase):
     def is_phrasing(self) -> bool:
         return True

-    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
+    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment | Element]:
         """Generate text segments for text, children, and tail of this element."""
         inside_emphasis = self._inside_emphasis(enclosing_emphasis)
@@ -445,14 +579,25 @@ class Phrasing(etree.ElementBase):
         """
         return enclosing_emphasis

-    def _iter_child_text_segments(self, emphasis: str) -> Iterator[TextSegment]:
+    def _iter_child_text_segments(self, emphasis: str) -> Iterator[TextSegment | Element]:
         """Generate zero-or-more text-segments for phrasing children of this element.

         All generated text segments will be annotated with `emphasis` when it is other than the
         empty string.
         """
-        for child in self:
-            yield from child.iter_text_segments(emphasis)
+        q: deque[Flow | Phrasing] = deque(self)
+        # -- Recurse into any nested tags. Phrasing children contribute `TextSegment`s to the
+        # -- stream. Block children contribute document `Element`s. Note however that a phrasing
+        # -- child can also produce an `Element` from any nested block element.
+        while q:
+            child = q.popleft()
+            if child.is_phrasing:
+                yield from cast(Phrasing, child).iter_text_segments(emphasis)
+            else:
+                yield from cast(Flow, child).iter_elements()
+                yield from self._iter_text_segments_from_block_tail_and_phrasing(
+                    child.tail or "", q, emphasis
+                )

     def _iter_tail_segment(self, emphasis: str) -> Iterator[TextSegment]:
         """Generate zero-or-one text-segment for tail of this element.
@@ -472,6 +617,150 @@ class Phrasing(etree.ElementBase):
         if text := self.text:
             yield TextSegment(text, self._annotation(text, emphasis))

+    def _iter_text_segments_from_block_tail_and_phrasing(
+        self, tail: str, q: deque[Flow | Phrasing], emphasis: str
+    ) -> Iterator[TextSegment | Element]:
+        """Generate zero-or-more `TextSegment`s or `Element`s from tail+phrasing of a block child.
+
+        When this phrasing element contains a block child (not valid HTML, but accepted by
+        browsers), the tail of that block child and any phrasing elements contiguous with that
+        tail also need to contribute their text. This method takes care of that job.
+
+        Note this mutates `q` by popping phrasing elements off as they are processed.
+        """
+        if tail:
+            yield TextSegment(tail, self._annotation(tail, emphasis))
+        while q and q[0].is_phrasing:
+            e = cast(Phrasing, q.popleft())
+            yield from e.iter_text_segments(emphasis)
+
+
+class Anchor(Phrasing):
+    """Custom element-class for `<a>` element.
+
+    Provides link annotations.
+    """
+
+    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment | Element]:
+        """Generate text segments for contents and tail of this element, when they exist.
+
+        Phrasing is emitted as `TextSegment` objects. Any nested block items (not valid HTML but
+        accepted by browsers, so they can occur) are emitted as `Element` objects.
+
+        When an anchor contains a nested block element, there can be multiple phrases and/or
+        elements. Link annotation is only added to the first phrase or element; otherwise the
+        link annotation would span multiple document-elements.
+        """
+        q: deque[Phrase | Element] = deque(self._iter_phrases_and_elements(enclosing_emphasis))
+
+        # -- the first non-whitespace phrase or element gets the link annotation --
+        while q:
+            x = q.popleft()
+            if isinstance(x, Element):
+                yield self._link_annotate_element(x)
+                break
+            else:
+                # -- a whitespace-only phrase will not receive the link annotation (no link text) --
+                if lts := self._link_text_segment(x):
+                    yield lts
+                    break
+                else:
+                    yield from x
+
+        # -- whatever phrases or elements remain are emitted without link annotation --
+        while q:
+            x = q.popleft()
+            if isinstance(x, Element):
+                yield x
+            else:
+                yield from x
+
+        # -- a tail is emitted when present, whether the anchor itself was emitted or not --
+        yield from self._iter_tail_segment(enclosing_emphasis)
+
+    def _iter_phrases_and_elements(self, emphasis: str) -> Iterator[Phrase | Element]:
+        """Divide contents (text+children, but not tail) into phrases and document-elements."""
+        # -- place child elements in a queue; method calls use some and leave the rest --
+        q: deque[Flow | Phrasing] = deque(self)
+
+        yield from self._iter_phrasing(self.text or "", q, emphasis)
+
+        while q:
+            assert not q[0].is_phrasing
+            block_item = cast(Flow, q.popleft())
+            yield from block_item.iter_elements()
+            yield from self._iter_phrasing(block_item.tail or "", q, emphasis)
+
+    def _iter_phrasing(
+        self, text: str, q: deque[Flow | Phrasing], emphasis: str
+    ) -> Iterator[Phrase | Element]:
+        """Generate zero-or-more `Phrase`s or `Element`s from text and leading phrasing.
+
+        Note that while this method is named "._iter_phrasing()", it can also generate `Element`
+        objects when a block item is nested within a phrasing element. This is not technically
+        valid HTML, but folks write some wacky HTML and the browser is pretty forgiving, so we
+        try to do the right thing (what the browser does) when that happens: generally, interpret
+        each nested block as its own paragraph and generate a separate `Element` object for each.
+
+        This method is used to process the text or tail of a block element, including any
+        phrasing elements immediately following the text or tail.
+
+        Note this method mutates `q` by popping phrasing elements off as they are processed.
+        """
+        phrase_accum = _PhraseAccumulator()
+
+        if text:
+            phrase_accum.add(TextSegment(text, self._annotation(text, emphasis)))
+
+        while q and q[0].is_phrasing:
+            e = cast(Phrasing, q.popleft())
+            for x in e.iter_text_segments(emphasis):
+                if isinstance(x, TextSegment):
+                    phrase_accum.add(x)
+                # -- otherwise x is an `Element`, which terminates the accumulating phrase --
+                else:
+                    yield from phrase_accum.flush()
+                    yield x

+        # -- emit any phrase remaining in the accumulator --
+        yield from phrase_accum.flush()
+
+    def _link_annotate_element(self, element: Element) -> Element:
+        """Apply this link's annotation to `element` and return it."""
+        link_text = element.text
+        link_url = self.get("href")
+
+        if not link_text or not link_url:
+            return element
+
+        element.metadata.link_texts = (element.metadata.link_texts or []) + [link_text]
+        element.metadata.link_urls = (element.metadata.link_urls or []) + [link_url]
+
+        return element
+
+    def _link_text_segment(self, phrase: Phrase) -> TextSegment | None:
+        """Consolidate `phrase` into a single text-segment with link annotation.
+
+        Returns None if the phrase contains only whitespace.
+        """
+        consolidated_text = "".join(text_segment.text for text_segment in phrase)
+        link_text = _normalize_text(consolidated_text)
+        link_url = self.get("href")
+
+        if not link_text or not link_url:
+            return None
+
+        # -- the emphasis annotations must come from the individual text segments in the phrase --
+        consolidated_annotations = _consolidate_annotations(
+            (
+                {"link_texts": [link_text], "link_urls": [link_url]},
+                *(text_segment.annotation for text_segment in phrase),
+            )
+        )
+
+        return TextSegment(consolidated_text, consolidated_annotations)
+
+
 class Bold(Phrasing):
     """Provides annotations for bold/strong text."""
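A hedged sketch of the first-phrase-only link annotation implemented by `Anchor.iter_text_segments()` above (element boundaries illustrative):

```python
# Illustrative only -- an anchor with an (invalid but browser-tolerated) nested block.
html = '<a href="https://eie.io">Ford <p>You are turning into a penguin.</p> Stop it.</a>'

# - "Ford" is the first non-whitespace phrase, so it alone receives the
#   link_texts/link_urls annotation for https://eie.io.
# - The nested <p> becomes its own document Element, emitted without link annotation.
# - "Stop it." is a trailing phrase, also emitted without link annotation.
```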
@@ -526,75 +815,6 @@ class RemovedPhrasing(Phrasing):
         yield from self._iter_tail_segment(enclosing_emphasis)


-# -- DUAL-ROLE ELEMENTS --------------------------------------------------------------------------
-
-
-class Anchor(Phrasing, Flow):
-    """Custom element-class for `<a>` element.
-
-    Provides link annotations.
-    """
-
-    @property
-    def is_phrasing(self) -> bool:
-        """False when the `<a>` element contains any block items, True otherwise."""
-        return all(e.is_phrasing for e in self)
-
-    def iter_text_segments(self, enclosing_emphasis: str = "") -> Iterator[TextSegment]:
-        """Generate text segments for text and tail of this element, when they exist.
-
-        The behavior for an anchor element is slightly different because link annotations are only
-        added to the text, not the tail. Also an anchor can have no children.
-        """
-        # -- the text of the link is everything inside the `<a>` element, text and child text --
-        text_segments = tuple(
-            itertools.chain(
-                self._iter_text_segment(enclosing_emphasis),
-                self._iter_child_text_segments(enclosing_emphasis),
-            )
-        )
-
-        link_text = "".join("".join(ts.text for ts in text_segments))
-
-        # -- the link_text and link_url annotation refers to the entire text inside the `<a>` --
-        link_text_segment = TextSegment(
-            link_text, self._link_annotations(link_text, enclosing_emphasis)
-        )
-
-        # -- but the emphasis annotations must come from the individual text segments within --
-        consolidated_annotations = _consolidate_annotations((link_text_segment, *text_segments))
-
-        # -- generate at most one text-segment for the `<a>` element, the full enclosed text with
-        # -- consolidated emphasis and link annotations.
-        if link_text:
-            yield TextSegment(link_text, consolidated_annotations)
-
-        # -- A tail is emitted when present whether anchor itself was or not --
-        yield from self._iter_tail_segment(enclosing_emphasis)
-
-    def _link_annotations(self, text: str, emphasis: str) -> Annotation:
-        """Link and emphasis annotations that apply to the text of this anchor.
-
-        An anchor element does not add any emphasis but uses any introduced by enclosing elements.
-        """
-        normalized_text = _normalize_text(text)
-
-        if not normalized_text:
-            return {}
-
-        def iter_annotation_pairs() -> Iterator[tuple[str, Any]]:
-            # -- emphasis annotation is only added when there is enclosing emphasis --
-            if emphasis:
-                yield "emphasized_text_contents", normalized_text
-                yield "emphasized_text_tags", emphasis
-
-            if href := self.get("href"):
-                yield "link_texts", normalized_text
-                yield "link_urls", href
-
-        return MappingProxyType(dict(iter_annotation_pairs()))
-
-
 # -- DEFAULT ELEMENT -----------------------------------------------------------------------------