fix: update metadata before initializing a new Document in DocumentSplitter (#8745)

* updated DocumentSplitter

issue #8741

* release note

* updated DocumentSplitter

in the _create_docs_from_splits function, initialize a new variable copied_meta instead of overwriting meta

* added test

test_duplicate_pages_get_different_doc_id

* fix fmt

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
Nicola Procopio 2025-01-20 09:51:47 +01:00 committed by GitHub


@@ -323,11 +323,11 @@ class DocumentSplitter:
         documents: List[Document] = []
         for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
-            meta = deepcopy(meta)
-            doc = Document(content=txt, meta=meta)
-            doc.meta["page_number"] = splits_pages[i]
-            doc.meta["split_id"] = i
-            doc.meta["split_idx_start"] = split_idx
+            copied_meta = deepcopy(meta)
+            copied_meta["page_number"] = splits_pages[i]
+            copied_meta["split_id"] = i
+            copied_meta["split_idx_start"] = split_idx
+            doc = Document(content=txt, meta=copied_meta)
             documents.append(doc)
             if self.split_overlap <= 0:
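
For context on why the order matters: in Haystack 2.x the Document id is a hash computed at construction time from the content and meta (among other fields), so meta values assigned after Document(...) returns never influence the id. The snippet below is a minimal sketch of the difference, assuming the stock haystack Document dataclass and no explicit id being passed.

# Minimal sketch, assuming Haystack 2.x where Document.id is derived from
# content and meta at construction time (no explicit id passed).
from haystack import Document

# Old pattern: meta mutated after init, so page_number never reaches the id hash.
a = Document(content="This is some text.")
a.meta["page_number"] = 1
b = Document(content="This is some text.")
b.meta["page_number"] = 2
print(a.id == b.id)  # True -> duplicate ids for identical content on different pages

# New pattern: meta populated before init, so the id reflects page_number.
c = Document(content="This is some text.", meta={"page_number": 1})
d = Document(content="This is some text.", meta={"page_number": 2})
print(c.id == d.id)  # False -> each split gets its own id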


@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    In DocumentSplitter, the Document's metadata is now set before the new Document is initialized, as requested in issue #8741.


@@ -827,3 +827,11 @@ class TestSplittingNLTKSentenceSplitter:
         assert deserialized.respect_sentence_boundary == True
         assert hasattr(deserialized, "sentence_splitter")
         assert deserialized.language == "de"
+
+    def test_duplicate_pages_get_different_doc_id(self):
+        splitter = DocumentSplitter(split_by="page", split_length=1)
+        doc1 = Document(content="This is some text.\fThis is some text.\fThis is some text.\fThis is some text.")
+        splitter.warm_up()
+        result = splitter.run(documents=[doc1])
+        assert len({doc.id for doc in result["documents"]}) == 4
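
As a usage note, the sketch below runs the splitter the same way the new test does and prints what each split now carries; it assumes the standard haystack-ai 2.x import paths for Document and DocumentSplitter.

# Usage sketch, assuming haystack-ai 2.x import paths.
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

splitter = DocumentSplitter(split_by="page", split_length=1)
splitter.warm_up()
doc = Document(content="Same text.\fSame text.\fSame text.")
result = splitter.run(documents=[doc])

for d in result["documents"]:
    # With the fix, identical page contents still yield distinct ids,
    # because page_number, split_id and split_idx_start are placed in meta
    # before the Document (and therefore its id) is created.
    print(d.id, d.meta["page_number"], d.meta["split_id"])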