From 1af41d5f90e7b7ac0cfe588f3aa6bd545ff13f70 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 18 Mar 2024 12:27:39 -0700 Subject: [PATCH] feat(chunking): add .orig_elements behavior to chunking (#2656) **Summary** Add the actual behavior to populate `.metadata.orig_elements` during chunking, when so instructed by the `include_orig_elements` option. **Additional Context** The underlying structures to support this, namely the `.metadata.orig_elements` field and the `include_orig_elements` chunking option, were added in closely prior PRs. This PR adds the behavior to actually populate that metadata field during chunking when the option is set. --- CHANGELOG.md | 2 + test_unstructured/chunking/test_base.py | 191 ++++++++++++++++++++--- test_unstructured/chunking/test_basic.py | 21 +++ test_unstructured/chunking/test_title.py | 42 +++-- unstructured/chunking/base.py | 67 +++++++- unstructured/documents/elements.py | 3 + 6 files changed, 271 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd4bf4177..a09b57c57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Features +* **Chunking populates `.metadata.orig_elements` for each chunk.** This behavior allows the text and metadata of the elements combined to make each chunk to be accessed. This can be important for example to recover metadata such as `.coordinates` that cannot be consolidated across elements and so is dropped from chunks. This option is controlled by the `include_orig_elements` parameter to `partition_*()` or to the chunking functions. This option defaults to `True` so original-elements are preserved by default. This behavior is not yet supported via the REST APIs or SDKs but will be in a closely subsequent PR to other `unstructured` repositories. The original elements will also not serialize or deserialize yet; this will also be added in a closely subsequent PR. + ### Fixes * **Clarify IAM Role Requirement for GCS Platform Connectors**. 
The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles. diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index ab16e74f4..dd9a68fa7 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -352,7 +352,30 @@ class DescribeTablePreChunk: with pytest.raises(StopIteration): next(chunk_iter) - def but_it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self): + def and_it_includes_the_original_table_element_in_metadata_when_so_instructed(self): + table = Table("foo bar", metadata=ElementMetadata(text_as_html="foo bar
")) + opts = ChunkingOptions(include_orig_elements=True) + pre_chunk = TablePreChunk(table, "", opts) + + chunk_iter = pre_chunk.iter_chunks() + + chunk = next(chunk_iter) + assert isinstance(chunk, Table) + assert chunk.metadata.orig_elements == [table] + assert chunk.metadata.text_as_html == "foo bar
" + # -- + with pytest.raises(StopIteration): + next(chunk_iter) + + def but_not_when_instructed_not_to(self): + pre_chunk = TablePreChunk(Table("foobar"), "", ChunkingOptions(include_orig_elements=False)) + + chunk = next(pre_chunk.iter_chunks()) + + assert isinstance(chunk, Table) + assert chunk.metadata.orig_elements is None + + def it_splits_its_table_into_TableChunks_when_the_table_text_exceeds_the_window(self): # fixed-overhead = 8+8+9+8+9+8 = 50 # per-row overhead = 27 html_table = ( @@ -398,6 +421,7 @@ class DescribeTablePreChunk: "\n" "Lo" ) + assert not chunk.metadata.is_continuation # -- chunk = next(chunk_iter) assert isinstance(chunk, TableChunk) @@ -408,6 +432,7 @@ class DescribeTablePreChunk: "rem ipsum A Link example\n" "Consectetur adipiscing elit<" ) + assert chunk.metadata.is_continuation # -- note that text runs out but HTML continues because it's significantly longer. So two # -- of these chunks have HTML but no text. chunk = next(chunk_iter) @@ -418,6 +443,7 @@ class DescribeTablePreChunk: "Nunc aliquam id enim nec molestie\n" "Vivamus quis " ) + assert chunk.metadata.is_continuation # -- chunk = next(chunk_iter) assert isinstance(chunk, TableChunk) @@ -425,10 +451,34 @@ class DescribeTablePreChunk: assert chunk.metadata.text_as_html == ( "nunc ipsum donec ac fermentum\n\n" ) + assert chunk.metadata.is_continuation # -- with pytest.raises(StopIteration): next(chunk_iter) + def and_it_includes_the_whole_original_Table_in_each_metadata_when_so_instructed(self): + """Even though text and html are split, the orig_elements metadata is not.""" + table = Table( + "Header Col 1 Header Col 2\nLorem ipsum dolor sit amet", + metadata=ElementMetadata(text_as_html=""), + ) + opts = ChunkingOptions(max_characters=30, include_orig_elements=True) + pre_chunk = TablePreChunk(table, overlap_prefix="", opts=opts) + + chunk_iter = pre_chunk.iter_chunks() + + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == "Header Col 1 
Header Col 2" + assert chunk.metadata.orig_elements == [table] + assert not chunk.metadata.is_continuation + # -- + chunk = next(chunk_iter) + assert isinstance(chunk, TableChunk) + assert chunk.text == "Lorem ipsum dolor sit amet" + assert chunk.metadata.orig_elements == [table] + assert chunk.metadata.is_continuation + @pytest.mark.parametrize( ("text", "expected_value"), [ @@ -469,6 +519,50 @@ class DescribeTablePreChunk: ) assert pre_chunk._text == expected_value + def it_computes_metadata_for_each_chunk_to_help(self): + table = Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="
")) + pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions()) + + metadata = pre_chunk._metadata + + assert metadata.text_as_html == "
" + # -- opts.include_orig_elements is True by default -- + assert metadata.orig_elements == [table] + # -- it produces a new instance each time it is called so changing one chunk's metadata does + # -- not change that of any other chunk. + assert pre_chunk._metadata is not metadata + + def but_it_omits_orig_elements_from_metadata_when_so_instructed(self): + pre_chunk = TablePreChunk( + Table("Lorem ipsum", metadata=ElementMetadata(text_as_html="
")), + overlap_prefix="", + opts=ChunkingOptions(include_orig_elements=False), + ) + + assert pre_chunk._metadata.orig_elements is None + + def it_computes_the_original_elements_list_to_help(self): + table = Table( + "Lorem ipsum", + metadata=ElementMetadata(text_as_html="
", orig_elements=[Table("Lorem Ipsum")]), + ) + pre_chunk = TablePreChunk(table, overlap_prefix="", opts=ChunkingOptions()) + + orig_elements = pre_chunk._orig_elements + + # -- a TablePreChunk always has exactly one original (Table) element -- + assert len(orig_elements) == 1 + orig_element = orig_elements[0] + # -- each item in orig_elements is a copy of the original element so we can mutate it + # -- without changing user's data. + assert orig_element == table + assert orig_element is not table + # -- it strips any .metadata.orig_elements from each element to prevent a recursive data + # -- structure + assert orig_element.metadata.orig_elements is None + # -- computation is only on first call, all chunks get exactly the same orig-elements -- + assert pre_chunk._orig_elements is orig_elements + class DescribeTextPreChunk: """Unit-test suite for `unstructured.chunking.base.TextPreChunk` objects.""" @@ -599,17 +693,15 @@ class DescribeTextPreChunk: ) def it_generates_a_single_chunk_from_its_elements_if_they_together_fit_in_window(self): - pre_chunk = TextPreChunk( - [ - Title("Introduction"), - Text( - "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" - " lectus porta volutpat.", - ), - ], - overlap_prefix="e feugiat efficitur.", - opts=ChunkingOptions(max_characters=200), - ) + elements = [ + Title("Introduction"), + Text( + "Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed" + " lectus porta volutpat.", + ), + ] + opts = ChunkingOptions(max_characters=200, include_orig_elements=True) + pre_chunk = TextPreChunk(elements, overlap_prefix="e feugiat efficitur.", opts=opts) chunk_iter = pre_chunk.iter_chunks() @@ -619,25 +711,31 @@ class DescribeTextPreChunk: " adipiscing elit. 
In rhoncus ipsum sed lectus porta volutpat.", ) assert chunk.metadata is pre_chunk._consolidated_metadata + assert chunk.metadata.orig_elements == elements + # -- + with pytest.raises(StopIteration): + next(chunk_iter) def but_it_generates_split_chunks_when_its_single_element_exceeds_window_size(self): # -- Chunk-splitting only occurs when a *single* element is too big to fit in the window. # -- The pre-chunker will isolate that element in a pre_chunk of its own. - pre_chunk = TextPreChunk( - [ - Text( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" - " commodo consequat." - ), - ], - overlap_prefix="", - opts=ChunkingOptions(max_characters=200, text_splitting_separators=("\n", " ")), - ) + elements = [ + Text( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" + " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" + " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea" + " commodo consequat." + ) + ] + opts = ChunkingOptions(max_characters=200, include_orig_elements=True) + pre_chunk = TextPreChunk(elements, overlap_prefix="", opts=opts) chunk_iter = pre_chunk.iter_chunks() + # -- Note that .metadata.orig_elements is the same single original element, "repeated" for + # -- each text-split chunk. This behavior emerges without explicit command as a consequence + # -- of using `._consolidated_metadata` (and `._continuation_metadata` which extends + # -- `._consolidated_metadata`) for each text-split chunk. 
chunk = next(chunk_iter) assert chunk == CompositeElement( "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" @@ -645,10 +743,12 @@ class DescribeTextPreChunk: " veniam, quis nostrud exercitation ullamco laboris nisi ut" ) assert chunk.metadata is pre_chunk._consolidated_metadata + assert chunk.metadata.orig_elements == elements # -- chunk = next(chunk_iter) assert chunk == CompositeElement("aliquip ex ea commodo consequat.") assert chunk.metadata is pre_chunk._continuation_metadata + assert chunk.metadata.orig_elements == elements # -- with pytest.raises(StopIteration): next(chunk_iter) @@ -762,6 +862,23 @@ class DescribeTextPreChunk: "parent_id": ["f87731e0"], } + def and_it_adds_the_pre_chunk_elements_to_metadata_when_so_instructed(self): + opts = ChunkingOptions(include_orig_elements=True) + metadata = ElementMetadata(filename="foo.pdf") + element = Title("Lorem Ipsum", metadata=metadata) + element_2 = Text("'Lorem ipsum dolor' means 'Thank you very much'.", metadata=metadata) + pre_chunk = TextPreChunk([element, element_2], overlap_prefix="", opts=opts) + + consolidated_metadata = pre_chunk._consolidated_metadata + + # -- pre-chunk elements are included as metadata -- + orig_elements = consolidated_metadata.orig_elements + assert orig_elements is not None + assert orig_elements == [element, element_2] + # -- and they are the exact instances, not copies -- + assert orig_elements[0] is element + assert orig_elements[1] is element_2 + def it_consolidates_regex_metadata_in_a_field_specific_way(self): """regex_metadata of chunk is combined regex_metadatas of its elements. 
@@ -868,6 +985,32 @@ class DescribeTextPreChunk: }, } + def it_computes_the_original_elements_list_to_help(self): + element = Title("Introduction") + element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") + element_3 = CompositeElement( + "In rhoncus ipsum sed lectus porta volutpat.", + metadata=ElementMetadata(orig_elements=[Text("Porta volupat.")]), + ) + pre_chunk = TextPreChunk( + [element, element_2, element_3], + overlap_prefix="", + opts=ChunkingOptions(include_orig_elements=True), + ) + + orig_elements = pre_chunk._orig_elements + + # -- all elements of pre-chunk are included -- + assert orig_elements == [element, element_2, element_3] + # -- orig_elements that are chunks (having orig-elements of their own) are copied and the + # -- copy is stripped of its `.metadata.orig_elements` to prevent a recursive data + # -- structure that nests orig_elements within orig_elements. + assert orig_elements[0] is element + assert orig_elements[2] is not element_3 + assert orig_elements[2].metadata.orig_elements is None + # -- computation is only on first call, all chunks get exactly the same orig-elements -- + assert pre_chunk._orig_elements is orig_elements + @pytest.mark.parametrize( ("elements", "overlap_prefix", "expected_value"), [ diff --git a/test_unstructured/chunking/test_basic.py b/test_unstructured/chunking/test_basic.py index b5f5adf97..85f807b6f 100644 --- a/test_unstructured/chunking/test_basic.py +++ b/test_unstructured/chunking/test_basic.py @@ -113,6 +113,27 @@ def test_it_chunks_elements_when_the_user_already_has_them(): ] +def test_it_includes_original_elements_as_metadata_when_requested(): + element = Title("Introduction") + element_2 = Text("Lorem ipsum dolor sit amet consectetur adipiscing elit.") + element_3 = Text("In rhoncus ipsum sed lectus porta volutpat.") + + chunks = chunk_elements( + [element, element_2, element_3], max_characters=70, include_orig_elements=True + ) + + assert len(chunks) == 2 + chunk = chunks[0] + 
assert chunk == CompositeElement( + "Introduction\n\nLorem ipsum dolor sit amet consectetur adipiscing elit." + ) + assert chunk.metadata.orig_elements == [element, element_2] + # -- + chunk = chunks[1] + assert chunk == CompositeElement("In rhoncus ipsum sed lectus porta volutpat.") + assert chunk.metadata.orig_elements == [element_3] + + # ------------------------------------------------------------------------------------------------ # UNIT TESTS # ------------------------------------------------------------------------------------------------ diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index 923d10a89..369588d68 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -9,12 +9,7 @@ from typing import Any, Optional import pytest from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock -from unstructured.chunking.base import ( - CHUNK_MULTI_PAGE_DEFAULT, - PreChunker, - TablePreChunk, - TextPreChunk, -) +from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title from unstructured.documents.coordinates import CoordinateSystem from unstructured.documents.elements import ( @@ -57,7 +52,7 @@ def test_it_splits_a_large_element_into_multiple_chunks(): ] -def test_split_elements_by_title_and_table(): +def test_it_splits_elements_by_title_and_table(): elements: list[Element] = [ Title("A Great Day"), Text("Today is a great day."), @@ -72,39 +67,38 @@ def test_split_elements_by_title_and_table(): CheckBox(), ] - pre_chunks = PreChunker.iter_pre_chunks(elements, opts=_ByTitleChunkingOptions.new()) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True) - pre_chunk = next(pre_chunks) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ + assert len(chunks) == 4 + # -- + chunk = chunks[0] + assert 
isinstance(chunk, CompositeElement) + assert chunk.metadata.orig_elements == [ Title("A Great Day"), Text("Today is a great day."), Text("It is sunny outside."), ] # -- - pre_chunk = next(pre_chunks) - assert isinstance(pre_chunk, TablePreChunk) - assert pre_chunk._table == Table("Heading\nCell text") + chunk = chunks[1] + assert isinstance(chunk, Table) + assert chunk.metadata.orig_elements == [Table("Heading\nCell text")] # == - pre_chunk = next(pre_chunks) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ + chunk = chunks[2] + assert isinstance(chunk, CompositeElement) + assert chunk.metadata.orig_elements == [ Title("An Okay Day"), Text("Today is an okay day."), Text("It is rainy outside."), ] # -- - pre_chunk = next(pre_chunks) - assert isinstance(pre_chunk, TextPreChunk) - assert pre_chunk._elements == [ + chunk = chunks[3] + assert isinstance(chunk, CompositeElement) + assert chunk.metadata.orig_elements == [ Title("A Bad Day"), Text("Today is a bad day."), Text("It is storming outside."), CheckBox(), ] - # -- - with pytest.raises(StopIteration): - next(pre_chunks) def test_chunk_by_title(): @@ -127,7 +121,7 @@ def test_chunk_by_title(): CheckBox(), ] - chunks = chunk_by_title(elements, combine_text_under_n_chars=0) + chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False) assert chunks == [ CompositeElement( diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index f360f97bf..ec368f85d 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -447,10 +447,10 @@ class TablePreChunk: text_remainder = self._text html_remainder = self._table.metadata.text_as_html or "" - # -- only chunk a table when it's too big to swallow whole -- + # -- only text-split a table when it's longer than the chunking window -- if len(text_remainder) <= maxlen and len(html_remainder) <= maxlen: # -- but the overlap-prefix must be added to its text -- - yield 
Table(text=text_remainder, metadata=copy.deepcopy(self._table.metadata)) + yield Table(text=text_remainder, metadata=self._metadata) return split = self._opts.split @@ -459,19 +459,19 @@ class TablePreChunk: while text_remainder or html_remainder: # -- split off the next chunk-worth of characters into a TableChunk -- chunk_text, text_remainder = split(text_remainder) - table_chunk = TableChunk(text=chunk_text, metadata=copy.deepcopy(self._table.metadata)) + metadata = self._metadata # -- Attach maxchars of the html to the chunk. Note no attempt is made to add only the # -- HTML elements that *correspond* to the TextChunk.text fragment. if html_remainder: chunk_html, html_remainder = html_remainder[:maxlen], html_remainder[maxlen:] - table_chunk.metadata.text_as_html = chunk_html + metadata.text_as_html = chunk_html # -- mark second and later chunks as a continuation -- if is_continuation: - table_chunk.metadata.is_continuation = True + metadata.is_continuation = True - yield table_chunk + yield TableChunk(text=chunk_text, metadata=metadata) is_continuation = True @@ -486,6 +486,37 @@ class TablePreChunk: overlap = self._opts.inter_chunk_overlap return self._text[-overlap:].strip() if overlap else "" + @property + def _metadata(self) -> ElementMetadata: + """The base `.metadata` value for chunks formed from this pre-chunk. + + The term "base" here means that other metadata fields will be added, depending on the chunk. + In particular, `.metadata.text_as_html` will be different for each text-split chunk and + `.metadata.is_continuation` must be added for second-and-later text-split chunks. + + Note this is a fresh copy of the metadata on each call since it will need to be mutated + differently for each chunk formed from this pre-chunk. 
+ """ + metadata = copy.deepcopy(self._table.metadata) + if self._opts.include_orig_elements: + metadata.orig_elements = self._orig_elements + return metadata + + @lazyproperty + def _orig_elements(self) -> list[Element]: + """The `.metadata.orig_elements` value for chunks formed from this pre-chunk. + + Note this is not just the `Table` element, it must be adjusted to strip out any + `.metadata.orig_elements` value it may have when it is itself a chunk and not a direct + product of partitioning. + """ + # -- make a copy because we're going to mutate the `Table` element and it doesn't belong to + # -- us (the user may have downstream purposes for it). + orig_table = copy.deepcopy(self._table) + # -- prevent recursive .orig_elements when `Table` element is a chunk -- + orig_table.metadata.orig_elements = None + return [orig_table] + @lazyproperty def _text(self) -> str: """The text for this chunk, including the overlap-prefix when present.""" @@ -615,7 +646,10 @@ class TextPreChunk: to a single-element pre-chunk too, even though metadata for such a pre-chunk is already "consolidated". """ - return ElementMetadata(**self._meta_kwargs) + consolidated_metadata = ElementMetadata(**self._meta_kwargs) + if self._opts.include_orig_elements: + consolidated_metadata.orig_elements = self._orig_elements + return consolidated_metadata @lazyproperty def _continuation_metadata(self) -> ElementMetadata: @@ -717,6 +751,25 @@ class TextPreChunk: return dict(iter_kwarg_pairs()) + @lazyproperty + def _orig_elements(self) -> list[Element]: + """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.""" + + def iter_orig_elements(): + for e in self._elements: + if e.metadata.orig_elements is None: + yield e + continue + # -- make copy of any element we're going to mutate because these elements don't + # -- belong to us (the user may have downstream purposes for them). 
+ orig_element = copy.copy(e) + # -- prevent recursive .orig_elements when element is a chunk (has orig-elements of + # -- its own) + orig_element.metadata.orig_elements = None + yield orig_element + + return list(iter_orig_elements()) + @lazyproperty def _text(self) -> str: """The concatenated text of all elements in this pre-chunk. diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 7cfc5c83f..1a9406be9 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -378,6 +378,9 @@ class ElementMetadata: for field_name in self.DEBUG_FIELD_NAMES: meta_dict.pop(field_name, None) + # -- remove `.orig_elements` for now as that won't serialize -- + meta_dict.pop("orig_elements", None) + # -- don't serialize empty lists -- meta_dict: dict[str, Any] = { field_name: value