Fixes incorrect ID generation for identical chunks in RecursiveDocumentSplitter (#9517)

* fix(preprocessor): ensure RecursiveDocumentSplitter generates unique chunk IDs

* fix: update meta handling in RecursiveDocumentSplitter to ensure correct overlap information

---------

Co-authored-by: Michele Pangrazzi <xmikex83@gmail.com>
This commit is contained in:
baki gul 2025-06-16 22:49:00 +03:00 committed by GitHub
parent 7570f6b769
commit 7dbac5b3c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 28 additions and 4 deletions

View File

@ -423,10 +423,12 @@ class RecursiveDocumentSplitter:
new_docs: List[Document] = []
for split_nr, chunk in enumerate(chunks):
new_doc = Document(content=chunk, meta=deepcopy(doc.meta))
new_doc.meta["split_id"] = split_nr
new_doc.meta["split_idx_start"] = current_position
new_doc.meta["_split_overlap"] = [] if self.split_overlap > 0 else None
meta = deepcopy(doc.meta)
meta["parent_id"] = doc.id
meta["split_id"] = split_nr
meta["split_idx_start"] = current_position
meta["_split_overlap"] = [] if self.split_overlap > 0 else None
new_doc = Document(content=chunk, meta=meta)
# add overlap information to the previous and current doc
if split_nr > 0 and self.split_overlap > 0:

View File

@ -0,0 +1,4 @@
---
fixes:
- |
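**RecursiveDocumentSplitter** now generates a unique `Document.id` for every chunk, even when two chunks have identical content. The meta fields (`split_id`, `parent_id`, etc.) are now populated _before_ the `Document` is created, so they are part of the data hashed to generate the `id` and chunks with identical content no longer end up with the same `id`.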

View File

@ -990,3 +990,21 @@ def test_run_complex_text_with_multiple_separators():
assert len(chunks[3].content) == 152
assert chunks[3].content.startswith("C")
assert chunks[3].content.endswith("D" * 50)
def test_recursive_splitter_generates_unique_ids_and_correct_meta():
    """Check that chunks of a repetitive document get distinct ids and correct parent/split metadata."""
    # A repeated sentence makes the splitter produce chunks with identical content.
    repeated_sentence = "Haystack is awesome. "
    source_doc = Document(content=repeated_sentence * 5)

    splitter = RecursiveDocumentSplitter(split_length=3)
    splitter.warm_up()
    result = splitter.run([source_doc])
    chunks = result["documents"]

    # No two chunks may share an id.
    seen_ids = set()
    for chunk in chunks:
        seen_ids.add(chunk.id)
    assert len(seen_ids) == len(chunks)

    # Every chunk points back to the source document and is numbered in output order.
    expected_split_id = 0
    for chunk in chunks:
        assert chunk.meta["parent_id"] == source_doc.id
        assert chunk.meta["split_id"] == expected_split_id
        expected_split_id += 1