mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-29 16:08:38 +00:00
Fixes incorrect ID generation for identical chunks in RecursiveDocumentSplitter (#9517)
* fix(preprocessor): ensure RecursiveDocumentSplitter generates unique chunk IDs
* fix: update meta handling in RecursiveDocumentSplitter to ensure correct overlap information

---------

Co-authored-by: Michele Pangrazzi <xmikex83@gmail.com>
This commit is contained in:
parent
7570f6b769
commit
7dbac5b3c9
@ -423,10 +423,12 @@ class RecursiveDocumentSplitter:
|
||||
new_docs: List[Document] = []
|
||||
|
||||
for split_nr, chunk in enumerate(chunks):
|
||||
new_doc = Document(content=chunk, meta=deepcopy(doc.meta))
|
||||
new_doc.meta["split_id"] = split_nr
|
||||
new_doc.meta["split_idx_start"] = current_position
|
||||
new_doc.meta["_split_overlap"] = [] if self.split_overlap > 0 else None
|
||||
meta = deepcopy(doc.meta)
|
||||
meta["parent_id"] = doc.id
|
||||
meta["split_id"] = split_nr
|
||||
meta["split_idx_start"] = current_position
|
||||
meta["_split_overlap"] = [] if self.split_overlap > 0 else None
|
||||
new_doc = Document(content=chunk, meta=meta)
|
||||
|
||||
# add overlap information to the previous and current doc
|
||||
if split_nr > 0 and self.split_overlap > 0:
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
**RecursiveDocumentSplitter** now generates a unique `Document.id` for every chunk. The meta fields (`split_id`, `parent_id`, etc.) are populated _before_ `Document` creation, so they are part of the hash used for `id` generation and chunks with identical content no longer receive the same `id`.
|
||||
@ -990,3 +990,21 @@ def test_run_complex_text_with_multiple_separators():
|
||||
assert len(chunks[3].content) == 152
|
||||
assert chunks[3].content.startswith("C")
|
||||
assert chunks[3].content.endswith("D" * 50)
|
||||
|
||||
|
||||
def test_recursive_splitter_generates_unique_ids_and_correct_meta():
    """Check that every chunk gets its own id plus correct `parent_id` / `split_id` meta."""
    # Highly repetitive input, intended to make several chunks share the same content.
    repeated_sentence = "Haystack is awesome. "
    source_doc = Document(content=repeated_sentence * 5)

    splitter = RecursiveDocumentSplitter(split_length=3)
    splitter.warm_up()

    result = splitter.run([source_doc])
    chunks = result["documents"]

    # No two chunks may share an id.
    chunk_ids = [piece.id for piece in chunks]
    assert len(set(chunk_ids)) == len(chunk_ids)

    # Each chunk must point back at the source document and carry its own position.
    for position, piece in enumerate(chunks):
        piece_meta = piece.meta
        assert piece_meta["parent_id"] == source_doc.id
        assert piece_meta["split_id"] == position
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user