Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-12-04 11:10:22 +00:00
rfctr(chunking): split oversized chunks on word boundary (#2297)
The text of an oversized chunk is currently split on an arbitrary character boundary (mid-word). The `chunk_by_character()` strategy introduces the idea of allowing the user to specify a separator to use for chunk-splitting. In `langchain` this is typically "\n\n", "\n", or " ": blank-line, newline, and word boundaries respectively.

Even when the user is allowed to specify a separator, we must provide a fall-back for when a chunk contains no such character. This can be done incrementally: a blank-line split is preferable to a newline split, a newline split is preferable to a word split, and a word split is preferable to an arbitrary-character split.

Further, there is nothing particular to `chunk_by_character()` about providing such a fall-back text-splitting strategy. It would be preferable for all strategies to split oversized chunks on even-word boundaries, for example.

Note that while a "blank-line" ("\n\n") may be common in plain text, it is unlikely to appear in the text of an element because it would have been interpreted as an element boundary during partitioning.

Add `_TextSplitter` with basic separator preferences and fall-back behavior and apply it to chunk-splitting for all strategies. The `by_character` chunking strategy may enhance this behavior by adding an option for the user to specify a particular separator suited to their use case.
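To make the intended behavior concrete, here is a minimal usage sketch assembled from the tests and docstrings in the diff below. It assumes the `unstructured.chunking.base` names shown there (`ChunkingOptions.new()`, `_TextSplitter`); `_TextSplitter` is a private helper normally reached via the `ChunkingOptions.split` property, so the direct import is shown for illustration only.

```python
from unstructured.chunking.base import ChunkingOptions, _TextSplitter

# Prefer a newline split, then a word (space) split; an arbitrary-character
# split is the built-in last resort when neither separator occurs in-window.
opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=("\n", " "))
split = _TextSplitter(opts)

remainder = (
    "Lorem ipsum dolor amet consectetur adipiscing.\n"
    "In rhoncus ipsum sed lectus porta volutpat."
)
while remainder:
    fragment, remainder = split(remainder)  # remainder == "" once fully consumed
    print(repr(fragment))                   # each fragment is at most 50 characters
```

Because the separator itself is discarded and surrounding whitespace is stripped, only whitespace separators are suitable targets.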
parent 4533bdac98
commit 093a11d058
@@ -1,3 +1,11 @@
+## 0.11.7-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
 ## 0.11.6

 ### Enhancements
@@ -4,7 +4,7 @@

 from __future__ import annotations

-from typing import List
+from typing import List, Sequence

 import pytest

@@ -16,6 +16,7 @@ from unstructured.chunking.base import (
     TablePreChunk,
     TextPreChunk,
     TextPreChunkAccumulator,
+    _TextSplitter,
     is_in_next_section,
     is_on_next_page,
     is_title,
@@ -39,7 +40,7 @@ from unstructured.documents.elements import (


 class DescribeChunkingOptions:
-    """Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects."""
+    """Unit-test suite for `unstructured.chunking.base.ChunkingOptions` objects."""

     @pytest.mark.parametrize("max_characters", [0, -1, -42])
     def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int):
@@ -144,6 +145,97 @@ class DescribeChunkingOptions:
         assert ChunkingOptions.new().text_separator == "\n\n"


+class Describe_TextSplitter:
+    """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
+
+    def it_splits_on_a_preferred_separator_when_it_can(self):
+        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=("\n", " "))
+        split = _TextSplitter(opts)
+        text = (
+            "Lorem ipsum dolor amet consectetur adipiscing.\n"
+            "In rhoncus ipsum sed lectus porta volutpat."
+        )
+
+        s, remainder = split(text)
+        assert s == "Lorem ipsum dolor amet consectetur adipiscing."
+        assert remainder == "In rhoncus ipsum sed lectus porta volutpat."
+        # --
+        s, remainder = split(remainder)
+        assert s == "In rhoncus ipsum sed lectus porta volutpat."
+        assert remainder == ""
+
+    def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
+        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
+        split = _TextSplitter(opts)
+        text = (
+            "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
+            " volutpat."
+        )
+
+        s, remainder = split(text)
+        assert s == "Lorem ipsum dolor amet consectetur"
+        assert remainder == "adipiscing. In rhoncus ipsum sed lectus porta volutpat."
+        # --
+        s, remainder = split(remainder)
+        assert s == "adipiscing. In rhoncus ipsum sed lectus"
+        assert remainder == "porta volutpat."
+        # --
+        s, remainder = split(remainder)
+        assert s == "porta volutpat."
+        assert remainder == ""
+
+    def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
+        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
+        split = _TextSplitter(opts)
+        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
+
+        s, remainder = split(text)
+        assert s == "Loremipsumdolorametconsecteturadipiscing"
+        assert remainder == "elit. In rhoncus ipsum sed lectus porta."
+        # --
+        s, remainder = split(remainder)
+        assert s == "elit. In rhoncus ipsum sed lectus porta."
+        assert remainder == ""
+
+    @pytest.mark.parametrize(
+        "text",
+        [
+            "Lorem ipsum dolor amet consectetur adipiscing.",  # 46-chars
+            "Lorem ipsum dolor.",  # 18-chars
+        ],
+    )
+    def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
+        opts = ChunkingOptions.new(max_characters=46)
+        split = _TextSplitter(opts)
+
+        s, remainder = split(text)
+
+        assert s == text
+        assert remainder == ""
+
+    def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
+        opts = ChunkingOptions.new(max_characters=38)
+        split = _TextSplitter(opts)
+        text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
+
+        s, _ = split(text)
+
+        assert s == "Loremipsumdolorametconsecteturadipisci"
+        assert len(s) == 38
+
+    @pytest.mark.parametrize("separators", [("\n", " "), ()])
+    def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
+        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=separators)
+        split = _TextSplitter(opts)
+        text = "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus."
+        # |------------------------------------------------^ 50-chars
+
+        s, remainder = split(text)
+
+        assert s == "Lorem ipsum dolor amet consectetur adipiscing."
+        assert remainder == "In rhoncus ipsum sed lectus."
+
+
 # ================================================================================================
 # BASE PRE-CHUNKER
 # ================================================================================================
@@ -263,7 +355,7 @@ class DescribeTablePreChunk:
         )
         pre_chunk = TablePreChunk(
             Table(text_table, metadata=ElementMetadata(text_as_html=html_table)),
-            opts=ChunkingOptions.new(max_characters=100),
+            opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")),
         )

         chunk_iter = pre_chunk.iter_chunks()
@@ -273,8 +365,7 @@ class DescribeTablePreChunk:
         assert chunk.text == (
             "Header Col 1 Header Col 2\n"
             "Lorem ipsum dolor sit amet\n"
-            "Consectetur adipiscing elit\n"
-            "Nunc aliqua"
+            "Consectetur adipiscing elit"
         )
         assert chunk.metadata.text_as_html == (
             "<table>\n"
@@ -287,8 +378,8 @@ class DescribeTablePreChunk:
         # --
         chunk = next(chunk_iter)
         assert isinstance(chunk, TableChunk)
-        assert (
-            chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
+        assert chunk.text == (
+            "Nunc aliquam id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum"
         )
         assert chunk.metadata.text_as_html == (
             "rem ipsum </td><td>A Link example</td></tr>\n"
@@ -399,7 +490,7 @@ class DescribeTextPreChunk:
                     " commodo consequat."
                 ),
             ],
-            opts=ChunkingOptions.new(max_characters=200),
+            opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")),
         )

         chunk_iter = pre_chunk.iter_chunks()
@@ -408,12 +499,12 @@ class DescribeTextPreChunk:
         assert chunk == CompositeElement(
             "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
             " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim"
-            " veniam, quis nostrud exercitation ullamco laboris nisi ut a"
+            " veniam, quis nostrud exercitation ullamco laboris nisi ut"
         )
         assert chunk.metadata is pre_chunk._consolidated_metadata
         # --
         chunk = next(chunk_iter)
-        assert chunk == CompositeElement("liquip ex ea commodo consequat.")
+        assert chunk == CompositeElement("aliquip ex ea commodo consequat.")
         assert chunk.metadata is pre_chunk._consolidated_metadata
         # --
         with pytest.raises(StopIteration):
@@ -35,7 +35,7 @@ def test_it_splits_a_large_element_into_multiple_chunks():

     assert chunks == [
         CompositeElement("Introduction"),
-        CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing "),
+        CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing"),
         CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
     ]

@@ -1100,7 +1100,10 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
     assert len(partitioned_table_elements_5_chars) != len(table_elements)
     assert len(partitioned_table_elements_200_chars) != len(table_elements)

-    assert len(partitioned_table_elements_5_chars[0].text) == 5
+    # trailing whitespace is stripped from the first chunk, leaving only a checkbox character
+    assert len(partitioned_table_elements_5_chars[0].text) == 1
+    # but the second chunk is the full 5 characters
+    assert len(partitioned_table_elements_5_chars[1].text) == 5
     assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5

     # the first table element is under 200 chars so doesn't get chunked!
@@ -1 +1 @@
-__version__ = "0.11.6"  # pragma: no cover
+__version__ = "0.11.7-dev0"  # pragma: no cover
@@ -18,6 +18,7 @@ from typing import (
     cast,
 )

+import regex
 from typing_extensions import Self, TypeAlias

 from unstructured.documents.elements import (
@@ -45,7 +46,46 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk"


 class ChunkingOptions:
-    """Specifies parameters of optional chunking behaviors."""
+    """Specifies parameters of optional chunking behaviors.
+
+    Parameters
+    ----------
+    max_characters
+        Hard-maximum text-length of chunk. A chunk longer than this will be split mid-text and be
+        emitted as two or more chunks.
+    new_after_n_chars
+        Preferred approximate chunk size. A chunk composed of elements totalling this size or
+        greater is considered "full" and will not be enlarged by adding another element, even if it
+        will fit within the remaining `max_characters` for that chunk. Defaults to `max_characters`
+        when not specified, which effectively disables this behavior. Specifying 0 for this
+        argument causes each element to appear in a chunk by itself (although an element with text
+        longer than `max_characters` will be still be split into two or more chunks).
+    multipage_sections
+        Indicates that page-boundaries should not be respected while chunking, i.e. elements
+        appearing on two different pages can appear in the same chunk.
+    combine_text_under_n_chars
+        Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only
+        relevant for a chunking strategy that specifies higher-level semantic boundaries to be
+        respected, like "section" or "page". Recursively combines two adjacent pre-chunks when the
+        first pre-chunk is smaller than this threshold. "Recursively" here means the resulting
+        pre-chunk can be combined with the next pre-chunk if it is still under the length threshold.
+        Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for
+        this argument suppresses combining of small chunks. Note this value is "capped" at the
+        `new_after_n_chars` value since a value higher than that would not change this parameter's
+        effect.
+    overlap
+        Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the
+        next chunk as a context-preserving mechanism. By default, this only applies to split-chunks
+        where an oversized element is divided into multiple chunks by text-splitting.
+    text_splitting_separators
+        A sequence of strings like `("\n", " ")` to be used as target separators during
+        text-splitting. Text-splitting only applies to splitting an oversized element into two or
+        more chunks. These separators are tried in the specified order until one is found in the
+        string to be split. The default separator is `""` which matches between any two characters.
+        This separator should not be specified in this sequence because it is always the separator
+        of last-resort. Note that because the separator is removed during text-splitting, only
+        whitespace character sequences are suitable.
+    """

     def __init__(
         self,
@@ -54,12 +94,14 @@ class ChunkingOptions:
         multipage_sections: bool = True,
         new_after_n_chars: Optional[int] = None,
         overlap: int = 0,
+        text_splitting_separators: Sequence[str] = (),
     ):
         self._combine_text_under_n_chars_arg = combine_text_under_n_chars
         self._max_characters = max_characters
         self._multipage_sections = multipage_sections
         self._new_after_n_chars_arg = new_after_n_chars
         self._overlap = overlap
+        self._text_splitting_separators = text_splitting_separators

     @classmethod
     def new(
@@ -69,6 +111,7 @@ class ChunkingOptions:
         multipage_sections: bool = True,
         new_after_n_chars: Optional[int] = None,
         overlap: int = 0,
+        text_splitting_separators: Sequence[str] = (),
     ) -> Self:
         """Construct validated instance.

@@ -80,6 +123,7 @@ class ChunkingOptions:
             multipage_sections,
             new_after_n_chars,
             overlap,
+            text_splitting_separators,
         )
         self._validate()
         return self
@@ -144,6 +188,15 @@ class ChunkingOptions:
             else new_after_n_chars
         )

+    @lazyproperty
+    def split(self) -> Callable[[str], Tuple[str, str]]:
+        """A text-splitting function suitable for splitting the text of an oversized pre-chunk.
+
+        The function is pre-configured with the chosen chunking window size and any other applicable
+        options specified by the caller as part of this chunking-options instance.
+        """
+        return _TextSplitter(self)
+
     @lazyproperty
     def text_separator(self) -> str:
         """The string to insert between elements when concatenating their text for a chunk.
@@ -154,6 +207,11 @@ class ChunkingOptions:
         """
         return "\n\n"

+    @lazyproperty
+    def text_splitting_separators(self) -> Tuple[str, ...]:
+        """Sequence of text-splitting target strings to be used in order of preference."""
+        return tuple(self._text_splitting_separators)
+
     def _validate(self) -> None:
         """Raise ValueError if requestion option-set is invalid."""
         max_characters = self._max_characters
@@ -187,6 +245,90 @@ class ChunkingOptions:
             raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}")


+class _TextSplitter:
+    """Provides a text-splitting function configured on construction.
+
+    Text is split on the best-available separator, falling-back from the preferred separator
+    through a sequence of alternate separators.
+
+    - The separator is removed by splitting so only whitespace strings are suitable separators.
+    - A "blank-line" ("\n\n") is unlikely to occur in an element as it would have been used as an
+      element boundary during partitioning.
+
+    This is a *callable* object. Constructing it essentially produces a function:
+
+        split = _TextSplitter(opts)
+        fragment, remainder = split(s)
+
+    This allows it to be configured with length-options etc. on construction and used throughout a
+    chunking operation on a given element-stream.
+    """

+    def __init__(self, opts: ChunkingOptions):
+        self._opts = opts
+
+    def __call__(self, s: str) -> Tuple[str, str]:
+        """Return pair of strings split from `s` on the best match of configured patterns.
+
+        The first string is the split, the second is the remainder of the string. The split string
+        will never be longer than `maxlen`. The separators are tried in order until a match is
+        found. The last separator is "" which matches between any two characters so there will
+        always be a split.
+
+        The separator is removed and does not appear in the split or remainder.
+
+        An `s` that is already less than the maximum length is returned unchanged with no remainder.
+        This allows this function to be called repeatedly with the remainder until it is consumed
+        and returns a remainder of "".
+        """
+        maxlen = self._opts.hard_max
+
+        if len(s) <= maxlen:
+            return s, ""
+
+        for p, length in self._patterns:
+            # -- length of separator must be added to include that separator when it happens to be
+            # -- located exactly at maxlen. Otherwise the search-from-end regex won't find it.
+            fragment, remainder = self._split_from_maxlen(p, maxlen + length, s)
+            if not fragment:
+                continue
+            return fragment.rstrip(), remainder.lstrip()
+
+        # -- the terminal "" pattern is not actually executed via regex since its implementation is
+        # -- trivial and provides a hard back-stop here in this method.
+        return s[:maxlen].rstrip(), s[maxlen:].lstrip()
+
+    @lazyproperty
+    def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]:
+        """Sequence of (pattern, len) pairs to match against.
+
+        Patterns appear in order of preference, those following are "fall-back" patterns to be used
+        if no match of a prior pattern is found.
+
+        NOTE these regexes search *from the end of the string*, which is what the "(?r)" bit
+        specifies. This is much more efficient than starting at the beginning of the string which
+        could result in hundreds of matches before the desired one.
+        """
+        separators = self._opts.text_splitting_separators
+        return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators)
+
+    @staticmethod
+    def _split_from_maxlen(pattern: regex.Pattern[str], maxlen: int, s: str) -> Tuple[str, str]:
+        """Return (split, remainder) pair split from `s` on the right-most match before `maxlen`.
+
+        Returns `"", s` if no suitable match was found. The first string in the pair will never be
+        longer than `maxlen` and there is no longer split available using `pattern`.
+
+        The separator is removed and does not appear in either the split or remainder.
+        """
+        match = pattern.search(s[:maxlen])
+        if match is None:
+            return "", s
+        start: int = match.start()
+        end: int = match.end()
+        return s[:start], s[end:]
+
+
 # ================================================================================================
 # BASE PRE-CHUNKER
 # ================================================================================================
@@ -276,27 +418,28 @@ class TablePreChunk:

     def iter_chunks(self) -> Iterator[Table | TableChunk]:
         """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller."""
-        text = self._table.text
-        html = self._table.metadata.text_as_html or ""
+        split = self._opts.split
+        text_remainder = self._table.text
+        html_remainder = self._table.metadata.text_as_html or ""
         maxlen = self._opts.hard_max

         # -- only chunk a table when it's too big to swallow whole --
-        if len(text) <= maxlen and len(html) <= maxlen:
+        if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen:
             yield self._table
             return

         is_continuation = False

-        while text or html:
-            # -- split off the next maxchars into the next TableChunk --
-            text_chunk, text = text[:maxlen], text[maxlen:]
-            table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata))
+        while text_remainder or html_remainder:
+            # -- split off the next chunk-worth of characters into a TableChunk --
+            chunk_text, text_remainder = split(text_remainder)
+            table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata))

             # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the
             # -- HTML elements that *correspond* to the TextChunk.text fragment.
-            if html:
-                html_chunk, html = html[:maxlen], html[maxlen:]
-                table_chunk.metadata.text_as_html = html_chunk
+            if html_remainder:
+                chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:]
+                table_chunk.metadata.text_as_html = chunk_html

             # -- mark second and later chunks as a continuation --
             if is_continuation:
@@ -332,17 +475,14 @@ class TextPreChunk:

     def iter_chunks(self) -> Iterator[CompositeElement]:
         """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller."""
-        text = self._text
-        text_len = len(text)
-        maxlen = self._opts.hard_max
-        start = 0
-        remaining = text_len
+        split = self._opts.split
+        metadata = self._consolidated_metadata

-        while remaining > 0:
-            end = min(start + maxlen, text_len)
-            yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata)
-            start = end
-            remaining = text_len - end
+        remainder = self._text
+
+        while remainder:
+            s, remainder = split(remainder)
+            yield CompositeElement(text=s, metadata=metadata)

     @lazyproperty
     def text_length(self) -> int: