diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dc1da4ca..623c00d67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.11.7-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.11.6 ### Enhancements diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index e91bf2b22..89858f143 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import List +from typing import List, Sequence import pytest @@ -16,6 +16,7 @@ from unstructured.chunking.base import ( TablePreChunk, TextPreChunk, TextPreChunkAccumulator, + _TextSplitter, is_in_next_section, is_on_next_page, is_title, @@ -39,7 +40,7 @@ from unstructured.documents.elements import ( class DescribeChunkingOptions: - """Unit-test suite for `unstructured.chunking.base.ChunkingOptions objects.""" + """Unit-test suite for `unstructured.chunking.base.ChunkingOptions` objects.""" @pytest.mark.parametrize("max_characters", [0, -1, -42]) def it_rejects_max_characters_not_greater_than_zero(self, max_characters: int): @@ -144,6 +145,97 @@ class DescribeChunkingOptions: assert ChunkingOptions.new().text_separator == "\n\n" +class Describe_TextSplitter: + """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects.""" + + def it_splits_on_a_preferred_separator_when_it_can(self): + opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=("\n", " ")) + split = _TextSplitter(opts) + text = ( + "Lorem ipsum dolor amet consectetur adipiscing.\n" + "In rhoncus ipsum sed lectus porta volutpat." + ) + + s, remainder = split(text) + assert s == "Lorem ipsum dolor amet consectetur adipiscing." + assert remainder == "In rhoncus ipsum sed lectus porta volutpat." + # -- + s, remainder = split(remainder) + assert s == "In rhoncus ipsum sed lectus porta volutpat." 
+ assert remainder == "" + + def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self): + opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " ")) + split = _TextSplitter(opts) + text = ( + "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta" + " volutpat." + ) + + s, remainder = split(text) + assert s == "Lorem ipsum dolor amet consectetur" + assert remainder == "adipiscing. In rhoncus ipsum sed lectus porta volutpat." + # -- + s, remainder = split(remainder) + assert s == "adipiscing. In rhoncus ipsum sed lectus" + assert remainder == "porta volutpat." + # -- + s, remainder = split(remainder) + assert s == "porta volutpat." + assert remainder == "" + + def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self): + opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " ")) + split = _TextSplitter(opts) + text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta." + + s, remainder = split(text) + assert s == "Loremipsumdolorametconsecteturadipiscing" + assert remainder == "elit. In rhoncus ipsum sed lectus porta." + # -- + s, remainder = split(remainder) + assert s == "elit. In rhoncus ipsum sed lectus porta." + assert remainder == "" + + @pytest.mark.parametrize( + "text", + [ + "Lorem ipsum dolor amet consectetur adipiscing.", # 46-chars + "Lorem ipsum dolor.", # 18-chars + ], + ) + def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str): + opts = ChunkingOptions.new(max_characters=46) + split = _TextSplitter(opts) + + s, remainder = split(text) + + assert s == text + assert remainder == "" + + def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self): + opts = ChunkingOptions.new(max_characters=38) + split = _TextSplitter(opts) + text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta." 
+ + s, _ = split(text) + + assert s == "Loremipsumdolorametconsecteturadipisci" + assert len(s) == 38 + + @pytest.mark.parametrize("separators", [("\n", " "), ()]) + def it_strips_whitespace_around_the_split(self, separators: Sequence[str]): + opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=separators) + split = _TextSplitter(opts) + text = "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus." + # |------------------------------------------------^ 50-chars + + s, remainder = split(text) + + assert s == "Lorem ipsum dolor amet consectetur adipiscing." + assert remainder == "In rhoncus ipsum sed lectus." + + # ================================================================================================ # BASE PRE-CHUNKER # ================================================================================================ @@ -263,7 +355,7 @@ class DescribeTablePreChunk: ) pre_chunk = TablePreChunk( Table(text_table, metadata=ElementMetadata(text_as_html=html_table)), - opts=ChunkingOptions.new(max_characters=100), + opts=ChunkingOptions.new(max_characters=100, text_splitting_separators=("\n", " ")), ) chunk_iter = pre_chunk.iter_chunks() @@ -273,8 +365,7 @@ class DescribeTablePreChunk: assert chunk.text == ( "Header Col 1 Header Col 2\n" "Lorem ipsum dolor sit amet\n" - "Consectetur adipiscing elit\n" - "Nunc aliqua" + "Consectetur adipiscing elit" ) assert chunk.metadata.text_as_html == ( "\n" @@ -287,8 +378,8 @@ class DescribeTablePreChunk: # -- chunk = next(chunk_iter) assert isinstance(chunk, TableChunk) - assert ( - chunk.text == "m id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum" + assert chunk.text == ( + "Nunc aliquam id enim nec molestie\nVivamus quis nunc ipsum donec ac fermentum" ) assert chunk.metadata.text_as_html == ( "rem ipsum \n" @@ -399,7 +490,7 @@ class DescribeTextPreChunk: " commodo consequat." 
), ], - opts=ChunkingOptions.new(max_characters=200), + opts=ChunkingOptions.new(max_characters=200, text_splitting_separators=("\n", " ")), ) chunk_iter = pre_chunk.iter_chunks() @@ -408,12 +499,12 @@ class DescribeTextPreChunk: assert chunk == CompositeElement( "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut a" + " veniam, quis nostrud exercitation ullamco laboris nisi ut" ) assert chunk.metadata is pre_chunk._consolidated_metadata # -- chunk = next(chunk_iter) - assert chunk == CompositeElement("liquip ex ea commodo consequat.") + assert chunk == CompositeElement("aliquip ex ea commodo consequat.") assert chunk.metadata is pre_chunk._consolidated_metadata # -- with pytest.raises(StopIteration): diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 82646a921..2ee2b060f 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -35,7 +35,7 @@ def test_it_splits_a_large_element_into_multiple_chunks(): assert chunks == [ CompositeElement("Introduction"), - CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing "), + CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing"), CompositeElement("elit. 
In rhoncus ipsum sed lectus porta volutpat."), ] diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index c1d30a292..a9070a3b9 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1100,7 +1100,10 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars(): assert len(partitioned_table_elements_5_chars) != len(table_elements) assert len(partitioned_table_elements_200_chars) != len(table_elements) - assert len(partitioned_table_elements_5_chars[0].text) == 5 + # trailing whitespace is stripped from the first chunk, leaving only a checkbox character + assert len(partitioned_table_elements_5_chars[0].text) == 1 + # but the second chunk is the full 5 characters + assert len(partitioned_table_elements_5_chars[1].text) == 5 assert len(partitioned_table_elements_5_chars[0].metadata.text_as_html) == 5 # the first table element is under 200 chars so doesn't get chunked! diff --git a/unstructured/__version__.py b/unstructured/__version__.py index bd98cf8f2..491d1fa3d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.6" # pragma: no cover +__version__ = "0.11.7-dev0" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index d08f3c4f3..b85c4e14f 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -18,6 +18,7 @@ from typing import ( cast, ) +import regex from typing_extensions import Self, TypeAlias from unstructured.documents.elements import ( @@ -45,7 +46,46 @@ PreChunk: TypeAlias = "TablePreChunk | TextPreChunk" class ChunkingOptions: - """Specifies parameters of optional chunking behaviors.""" + """Specifies parameters of optional chunking behaviors. + + Parameters + ---------- + max_characters + Hard-maximum text-length of chunk. A chunk longer than this will be split mid-text and be + emitted as two or more chunks. 
+ new_after_n_chars + Preferred approximate chunk size. A chunk composed of elements totalling this size or + greater is considered "full" and will not be enlarged by adding another element, even if it + will fit within the remaining `max_characters` for that chunk. Defaults to `max_characters` + when not specified, which effectively disables this behavior. Specifying 0 for this + argument causes each element to appear in a chunk by itself (although an element with text + longer than `max_characters` will still be split into two or more chunks). + multipage_sections + Indicates that page-boundaries should not be respected while chunking, i.e. elements + appearing on two different pages can appear in the same chunk. + combine_text_under_n_chars + Provides a way to "recombine" small chunks formed by breaking on a semantic boundary. Only + relevant for a chunking strategy that specifies higher-level semantic boundaries to be + respected, like "section" or "page". Recursively combines two adjacent pre-chunks when the + first pre-chunk is smaller than this threshold. "Recursively" here means the resulting + pre-chunk can be combined with the next pre-chunk if it is still under the length threshold. + Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for + this argument suppresses combining of small chunks. Note this value is "capped" at the + `new_after_n_chars` value since a value higher than that would not change this parameter's + effect. + overlap + Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the + next chunk as a context-preserving mechanism. By default, this only applies to split-chunks + where an oversized element is divided into multiple chunks by text-splitting. + text_splitting_separators + A sequence of strings like `("\n", " ")` to be used as target separators during + text-splitting. Text-splitting only applies to splitting an oversized element into two or + more chunks.
These separators are tried in the specified order until one is found in the + string to be split. The default separator is `""` which matches between any two characters. + This separator should not be specified in this sequence because it is always the separator + of last-resort. Note that because the separator is removed during text-splitting, only + whitespace character sequences are suitable. + """ def __init__( self, @@ -54,12 +94,14 @@ class ChunkingOptions: multipage_sections: bool = True, new_after_n_chars: Optional[int] = None, overlap: int = 0, + text_splitting_separators: Sequence[str] = (), ): self._combine_text_under_n_chars_arg = combine_text_under_n_chars self._max_characters = max_characters self._multipage_sections = multipage_sections self._new_after_n_chars_arg = new_after_n_chars self._overlap = overlap + self._text_splitting_separators = text_splitting_separators @classmethod def new( @@ -69,6 +111,7 @@ class ChunkingOptions: multipage_sections: bool = True, new_after_n_chars: Optional[int] = None, overlap: int = 0, + text_splitting_separators: Sequence[str] = (), ) -> Self: """Construct validated instance. @@ -80,6 +123,7 @@ class ChunkingOptions: multipage_sections, new_after_n_chars, overlap, + text_splitting_separators, ) self._validate() return self @@ -144,6 +188,15 @@ class ChunkingOptions: else new_after_n_chars ) + @lazyproperty + def split(self) -> Callable[[str], Tuple[str, str]]: + """A text-splitting function suitable for splitting the text of an oversized pre-chunk. + + The function is pre-configured with the chosen chunking window size and any other applicable + options specified by the caller as part of this chunking-options instance. + """ + return _TextSplitter(self) + @lazyproperty def text_separator(self) -> str: """The string to insert between elements when concatenating their text for a chunk. 
@@ -154,6 +207,11 @@ class ChunkingOptions: """ return "\n\n" + @lazyproperty + def text_splitting_separators(self) -> Tuple[str, ...]: + """Sequence of text-splitting target strings to be used in order of preference.""" + return tuple(self._text_splitting_separators) + def _validate(self) -> None: """Raise ValueError if requestion option-set is invalid.""" max_characters = self._max_characters @@ -187,6 +245,90 @@ class ChunkingOptions: raise ValueError(f"'overlap' must be less than max_characters," f" got {self._overlap}") +class _TextSplitter: + """Provides a text-splitting function configured on construction. + + Text is split on the best-available separator, falling-back from the preferred separator + through a sequence of alternate separators. + + - The separator is removed by splitting so only whitespace strings are suitable separators. + - A "blank-line" ("\n\n") is unlikely to occur in an element as it would have been used as an + element boundary during partitioning. + + This is a *callable* object. Constructing it essentially produces a function: + + split = _TextSplitter(opts) + fragment, remainder = split(s) + + This allows it to be configured with length-options etc. on construction and used throughout a + chunking operation on a given element-stream. + """ + + def __init__(self, opts: ChunkingOptions): + self._opts = opts + + def __call__(self, s: str) -> Tuple[str, str]: + """Return pair of strings split from `s` on the best match of configured patterns. + + The first string is the split, the second is the remainder of the string. The split string + will never be longer than `maxlen`. The separators are tried in order until a match is + found. The last separator is "" which matches between any two characters so there will + always be a split. + + The separator is removed and does not appear in the split or remainder. + + An `s` that is no longer than the maximum length is returned unchanged with no remainder.
+ This allows this function to be called repeatedly with the remainder until it is consumed + and returns a remainder of "". + """ + maxlen = self._opts.hard_max + + if len(s) <= maxlen: + return s, "" + + for p, length in self._patterns: + # -- length of separator must be added to include that separator when it happens to be + # -- located exactly at maxlen. Otherwise the search-from-end regex won't find it. + fragment, remainder = self._split_from_maxlen(p, maxlen + length, s) + if not fragment: + continue + return fragment.rstrip(), remainder.lstrip() + + # -- the terminal "" pattern is not actually executed via regex since its implementation is + # -- trivial and provides a hard back-stop here in this method. + return s[:maxlen].rstrip(), s[maxlen:].lstrip() + + @lazyproperty + def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]: + """Sequence of (pattern, len) pairs to match against. + + Patterns appear in order of preference, those following are "fall-back" patterns to be used + if no match of a prior pattern is found. + + NOTE these regexes search *from the end of the string*, which is what the "(?r)" bit + specifies. This is much more efficient than starting at the beginning of the string which + could result in hundreds of matches before the desired one. + """ + separators = self._opts.text_splitting_separators + return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators) + + @staticmethod + def _split_from_maxlen(pattern: regex.Pattern[str], maxlen: int, s: str) -> Tuple[str, str]: + """Return (split, remainder) pair split from `s` on the right-most match before `maxlen`. + + Returns `"", s` if no suitable match was found. The first string in the pair will never be + longer than `maxlen` and there is no longer split available using `pattern`. + + The separator is removed and does not appear in either the split or remainder. 
+ """ + match = pattern.search(s[:maxlen]) + if match is None: + return "", s + start: int = match.start() + end: int = match.end() + return s[:start], s[end:] + + # ================================================================================================ # BASE PRE-CHUNKER # ================================================================================================ @@ -276,27 +418,28 @@ class TablePreChunk: def iter_chunks(self) -> Iterator[Table | TableChunk]: """Split this pre-chunk into `Table` or `TableChunk` objects maxlen or smaller.""" - text = self._table.text - html = self._table.metadata.text_as_html or "" + split = self._opts.split + text_remainder = self._table.text + html_remainder = self._table.metadata.text_as_html or "" maxlen = self._opts.hard_max # -- only chunk a table when it's too big to swallow whole -- - if len(text) <= maxlen and len(html) <= maxlen: + if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen: yield self._table return is_continuation = False - while text or html: - # -- split off the next maxchars into the next TableChunk -- - text_chunk, text = text[:maxlen], text[maxlen:] - table_chunk = TableChunk(text=text_chunk, metadata=copy.deepcopy(self._table.metadata)) + while text_remainder or html_remainder: + # -- split off the next chunk-worth of characters into a TableChunk -- + chunk_text, text_remainder = split(text_remainder) + table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata)) # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the # -- HTML elements that *correspond* to the TextChunk.text fragment. 
- if html: - html_chunk, html = html[:maxlen], html[maxlen:] - table_chunk.metadata.text_as_html = html_chunk + if html_remainder: + chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:] + table_chunk.metadata.text_as_html = chunk_html # -- mark second and later chunks as a continuation -- if is_continuation: @@ -332,17 +475,14 @@ class TextPreChunk: def iter_chunks(self) -> Iterator[CompositeElement]: """Split this pre-chunk into one or more `CompositeElement` objects maxlen or smaller.""" - text = self._text - text_len = len(text) - maxlen = self._opts.hard_max - start = 0 - remaining = text_len + split = self._opts.split + metadata = self._consolidated_metadata - while remaining > 0: - end = min(start + maxlen, text_len) - yield CompositeElement(text=text[start:end], metadata=self._consolidated_metadata) - start = end - remaining = text_len - end + remainder = self._text + + while remainder: + s, remainder = split(remainder) + yield CompositeElement(text=s, metadata=metadata) @lazyproperty def text_length(self) -> int:
A Link example