mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-28 16:08:37 +00:00
fix: update metadata before initializing new Document in DocumentSplitter (#8745)
* updated DocumentSplitter issue #8741 * release note * updated DocumentSplitter: in the _create_docs_from_splits function, initialize a new variable copied_meta instead of overwriting meta * added test test_duplicate_pages_get_different_doc_id * fix fmt --------- Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
This commit is contained in:
parent
242138c68b
commit
542a7f7ef5
@ -323,11 +323,11 @@ class DocumentSplitter:
|
||||
documents: List[Document] = []
|
||||
|
||||
for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
|
||||
meta = deepcopy(meta)
|
||||
doc = Document(content=txt, meta=meta)
|
||||
doc.meta["page_number"] = splits_pages[i]
|
||||
doc.meta["split_id"] = i
|
||||
doc.meta["split_idx_start"] = split_idx
|
||||
copied_meta = deepcopy(meta)
|
||||
copied_meta["page_number"] = splits_pages[i]
|
||||
copied_meta["split_id"] = i
|
||||
copied_meta["split_idx_start"] = split_idx
|
||||
doc = Document(content=txt, meta=copied_meta)
|
||||
documents.append(doc)
|
||||
|
||||
if self.split_overlap <= 0:
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Updated the Document's metadata before initializing the Document in DocumentSplitter, as requested in issue #8741
|
||||
@ -827,3 +827,11 @@ class TestSplittingNLTKSentenceSplitter:
|
||||
assert deserialized.respect_sentence_boundary == True
|
||||
assert hasattr(deserialized, "sentence_splitter")
|
||||
assert deserialized.language == "de"
|
||||
|
||||
def test_duplicate_pages_get_different_doc_id(self):
|
||||
splitter = DocumentSplitter(split_by="page", split_length=1)
|
||||
doc1 = Document(content="This is some text.\fThis is some text.\fThis is some text.\fThis is some text.")
|
||||
splitter.warm_up()
|
||||
result = splitter.run(documents=[doc1])
|
||||
|
||||
assert len({doc.id for doc in result["documents"]}) == 4
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user