diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py
index 200fa8aa9..ca3812464 100644
--- a/haystack/components/preprocessors/document_splitter.py
+++ b/haystack/components/preprocessors/document_splitter.py
@@ -133,7 +133,6 @@ class DocumentSplitter:
         text_splits: List[str] = []
         splits_pages = []
         splits_start_idxs = []
-        split_at_len = len(self.split_at)
         cur_start_idx = 0
         cur_page = 1
         segments = windowed(elements, n=split_length, step=split_length - split_overlap)
@@ -153,7 +152,7 @@ class DocumentSplitter:
                 splits_start_idxs.append(cur_start_idx)

             processed_units = current_units[: split_length - split_overlap]
-            cur_start_idx += len("".join(processed_units)) + split_at_len
+            cur_start_idx += len("".join(processed_units))

             if self.split_by == "page":
                 num_page_breaks = len(processed_units)
@@ -207,7 +206,7 @@ class DocumentSplitter:
         :param previous_doc: The Document that was split before the current Document.
         :param previous_doc_start_idx: The starting index of the previous Document.
         """
-        overlapping_range = (current_doc_start_idx - previous_doc_start_idx - 1, len(previous_doc.content) - 1)  # type: ignore
+        overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content))  # type: ignore

         if overlapping_range[0] < overlapping_range[1]:
             overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]]  # type: ignore
diff --git a/haystack/components/retrievers/sentence_window_retrieval.py b/haystack/components/retrievers/sentence_window_retrieval.py
index 7e460028b..383f95c4b 100644
--- a/haystack/components/retrievers/sentence_window_retrieval.py
+++ b/haystack/components/retrievers/sentence_window_retrieval.py
@@ -85,7 +85,6 @@ class SentenceWindowRetrieval:
             start = max(start, last_idx_end)

             # append the non-overlapping part to the merged text
-            merged_text = merged_text.strip()
             merged_text += doc.content[start - doc.meta["split_idx_start"] :]  # type: ignore

             # update the last end index
diff --git a/releasenotes/notes/fix-document-splitter-split-info-1704f16c8b0f374a.yaml b/releasenotes/notes/fix-document-splitter-split-info-1704f16c8b0f374a.yaml
new file mode 100644
index 000000000..87108281b
--- /dev/null
+++ b/releasenotes/notes/fix-document-splitter-split-info-1704f16c8b0f374a.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    The DocumentSplitter was calculating the `split_idx_start` and `_split_overlap` metadata incorrectly due to small errors in the index calculations.
+    This fixes those calculations so that the `split_idx_start` and `_split_overlap` information is now correct.
diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py
index d6fcaa9d1..d9fa85f00 100644
--- a/test/components/preprocessors/test_document_splitter.py
+++ b/test/components/preprocessors/test_document_splitter.py
@@ -20,7 +20,6 @@ def merge_documents(documents):
         start = last_idx_end

         # append the non-overlapping part to the merged text
-        merged_text = merged_text.strip()
         merged_text += doc.content[start - doc.meta["split_idx_start"] :]

         # update the last end index
@@ -61,16 +60,16 @@ class TestDocumentSplitter:

     def test_split_by_word(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
-        result = splitter.run(
-            documents=[
-                Document(
-                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
-                )
-            ]
-        )
-        assert len(result["documents"]) == 2
-        assert result["documents"][0].content == "This is a text with some words. There is a "
-        assert result["documents"][1].content == "second sentence. And there is a third sentence."
+        text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        result = splitter.run(documents=[Document(content=text)])
+        docs = result["documents"]
+        assert len(docs) == 2
+        assert docs[0].content == "This is a text with some words. There is a "
+        assert docs[0].meta["split_id"] == 0
+        assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
+        assert docs[1].content == "second sentence. And there is a third sentence."
+        assert docs[1].meta["split_id"] == 1
+        assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)

     def test_split_by_word_with_threshold(self):
         splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
@@ -89,77 +88,101 @@ class TestDocumentSplitter:

     def test_split_by_word_multiple_input_docs(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
-        result = splitter.run(
-            documents=[
-                Document(
-                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
-                ),
-                Document(
-                    content="This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
-                ),
-            ]
-        )
-        assert len(result["documents"]) == 5
-        assert result["documents"][0].content == "This is a text with some words. There is a "
-        assert result["documents"][1].content == "second sentence. And there is a third sentence."
-        assert result["documents"][2].content == "This is a different text with some words. There is "
-        assert result["documents"][3].content == "a second sentence. And there is a third sentence. And "
-        assert result["documents"][4].content == "there is a fourth sentence."
+        text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+        result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
+        docs = result["documents"]
+        assert len(docs) == 5
+        # doc 0
+        assert docs[0].content == "This is a text with some words. There is a "
+        assert docs[0].meta["split_id"] == 0
+        assert docs[0].meta["split_idx_start"] == text1.index(docs[0].content)
+        # doc 1
+        assert docs[1].content == "second sentence. And there is a third sentence."
+        assert docs[1].meta["split_id"] == 1
+        assert docs[1].meta["split_idx_start"] == text1.index(docs[1].content)
+        # doc 2
+        assert docs[2].content == "This is a different text with some words. There is "
+        assert docs[2].meta["split_id"] == 0
+        assert docs[2].meta["split_idx_start"] == text2.index(docs[2].content)
+        # doc 3
+        assert docs[3].content == "a second sentence. And there is a third sentence. And "
+        assert docs[3].meta["split_id"] == 1
+        assert docs[3].meta["split_idx_start"] == text2.index(docs[3].content)
+        # doc 4
+        assert docs[4].content == "there is a fourth sentence."
+        assert docs[4].meta["split_id"] == 2
+        assert docs[4].meta["split_idx_start"] == text2.index(docs[4].content)

     def test_split_by_sentence(self):
         splitter = DocumentSplitter(split_by="sentence", split_length=1)
-        result = splitter.run(
-            documents=[
-                Document(
-                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
-                )
-            ]
-        )
-        assert len(result["documents"]) == 3
-        assert result["documents"][0].content == "This is a text with some words."
-        assert result["documents"][1].content == " There is a second sentence."
-        assert result["documents"][2].content == " And there is a third sentence."
+        text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        result = splitter.run(documents=[Document(content=text)])
+        docs = result["documents"]
+        assert len(docs) == 3
+        assert docs[0].content == "This is a text with some words."
+        assert docs[0].meta["split_id"] == 0
+        assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
+        assert docs[1].content == " There is a second sentence."
+        assert docs[1].meta["split_id"] == 1
+        assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
+        assert docs[2].content == " And there is a third sentence."
+        assert docs[2].meta["split_id"] == 2
+        assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)

     def test_split_by_passage(self):
         splitter = DocumentSplitter(split_by="passage", split_length=1)
-        result = splitter.run(
-            documents=[
-                Document(
-                    content="This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
-                )
-            ]
-        )
-        assert len(result["documents"]) == 3
-        assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\n\n"
-        assert result["documents"][1].content == "And there is a third sentence.\n\n"
-        assert result["documents"][2].content == " And another passage."
+        text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+        result = splitter.run(documents=[Document(content=text)])
+        docs = result["documents"]
+        assert len(docs) == 3
+        assert docs[0].content == "This is a text with some words. There is a second sentence.\n\n"
+        assert docs[0].meta["split_id"] == 0
+        assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
+        assert docs[1].content == "And there is a third sentence.\n\n"
+        assert docs[1].meta["split_id"] == 1
+        assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
+        assert docs[2].content == " And another passage."
+        assert docs[2].meta["split_id"] == 2
+        assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)

     def test_split_by_page(self):
         splitter = DocumentSplitter(split_by="page", split_length=1)
-        result = splitter.run(
-            documents=[
-                Document(
-                    content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
-                )
-            ]
-        )
-        assert len(result["documents"]) == 3
-        assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\x0c"
-        assert result["documents"][1].content == " And there is a third sentence.\x0c"
-        assert result["documents"][2].content == " And another passage."
+        text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        result = splitter.run(documents=[Document(content=text)])
+        docs = result["documents"]
+        assert len(docs) == 3
+        assert docs[0].content == "This is a text with some words. There is a second sentence.\f"
+        assert docs[0].meta["split_id"] == 0
+        assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
+        assert docs[0].meta["page_number"] == 1
+        assert docs[1].content == " And there is a third sentence.\f"
+        assert docs[1].meta["split_id"] == 1
+        assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
+        assert docs[1].meta["page_number"] == 2
+        assert docs[2].content == " And another passage."
+        assert docs[2].meta["split_id"] == 2
+        assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)
+        assert docs[2].meta["page_number"] == 3

     def test_split_by_word_with_overlap(self):
         splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
-        result = splitter.run(
-            documents=[
-                Document(
-                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
-                )
-            ]
-        )
-        assert len(result["documents"]) == 2
-        assert result["documents"][0].content == "This is a text with some words. There is a "
-        assert result["documents"][1].content == "is a second sentence. And there is a third sentence."
+        text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        result = splitter.run(documents=[Document(content=text)])
+        docs = result["documents"]
+        assert len(docs) == 2
+        # doc 0
+        assert docs[0].content == "This is a text with some words. There is a "
+        assert docs[0].meta["split_id"] == 0
+        assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
+        assert docs[0].meta["_split_overlap"][0]["range"] == (0, 5)
+        assert docs[1].content[0:5] == "is a "
+        # doc 1
+        assert docs[1].content == "is a second sentence. And there is a third sentence."
+        assert docs[1].meta["split_id"] == 1
+        assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
+        assert docs[1].meta["_split_overlap"][0]["range"] == (38, 43)
+        assert docs[0].content[38:43] == "is a "

     def test_source_id_stored_in_metadata(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
@@ -277,13 +300,32 @@ class TestDocumentSplitter:

     def test_add_split_overlap_information(self):
         splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
+        text = "This is a text with some words. There is a second sentence. And a third sentence."
         doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
-        docs = splitter.run(documents=[doc])
+        docs = splitter.run(documents=[doc])["documents"]

         # check split_overlap is added to all the documents
-        assert len(docs["documents"]) == 3
-        for d in docs["documents"]:
-            assert "_split_overlap" in d.meta
+        assert len(docs) == 3
+        # doc 0
+        assert docs[0].content == "This is a text with some words. There is a "
+        assert docs[0].meta["split_id"] == 0
+        assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)  # 0
+        assert docs[0].meta["_split_overlap"][0]["range"] == (0, 23)
+        assert docs[1].content[0:23] == "some words. There is a "
+        # doc 1
+        assert docs[1].content == "some words. There is a second sentence. And a third "
+        assert docs[1].meta["split_id"] == 1
+        assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)  # 20
+        assert docs[1].meta["_split_overlap"][0]["range"] == (20, 43)
+        assert docs[1].meta["_split_overlap"][1]["range"] == (0, 29)
+        assert docs[0].content[20:43] == "some words. There is a "
+        assert docs[2].content[0:29] == "second sentence. And a third "
+        # doc 2
+        assert docs[2].content == "second sentence. And a third sentence."
+        assert docs[2].meta["split_id"] == 2
+        assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)  # 43
+        assert docs[2].meta["_split_overlap"][0]["range"] == (23, 52)
+        assert docs[1].content[23:52] == "second sentence. And a third "

         # reconstruct the original document content from the split documents
-        assert doc.content == merge_documents(docs["documents"])
+        assert doc.content == merge_documents(docs)
diff --git a/test/components/retrievers/test_sentence_window_retriever.py b/test/components/retrievers/test_sentence_window_retriever.py
index be2a52e6b..ad9a3bddb 100644
--- a/test/components/retrievers/test_sentence_window_retriever.py
+++ b/test/components/retrievers/test_sentence_window_retriever.py
@@ -29,7 +29,7 @@ class TestSentenceWindowRetrieval:
                 "page_number": 1,
                 "split_id": 0,
                 "split_idx_start": 0,
-                "_split_overlap": [{"doc_id": "doc_1", "range": (0, 22)}],
+                "_split_overlap": [{"doc_id": "doc_1", "range": (0, 23)}],
             },
             {
                 "id": "doc_1",
@@ -37,8 +37,8 @@ class TestSentenceWindowRetrieval:
                 "source_id": "c5d7c632affc486d0cfe7b3c0f4dc1d3896ea720da2b538d6d10b104a3df5f99",
                 "page_number": 1,
                 "split_id": 1,
-                "split_idx_start": 21,
-                "_split_overlap": [{"doc_id": "doc_0", "range": (20, 42)}, {"doc_id": "doc_2", "range": (0, 29)}],
+                "split_idx_start": 20,
+                "_split_overlap": [{"doc_id": "doc_0", "range": (20, 43)}, {"doc_id": "doc_2", "range": (0, 29)}],
             },
             {
                 "id": "doc_2",
@@ -46,7 +46,7 @@ class TestSentenceWindowRetrieval:
                 "source_id": "c5d7c632affc486d0cfe7b3c0f4dc1d3896ea720da2b538d6d10b104a3df5f99",
                 "page_number": 1,
                 "split_id": 2,
-                "split_idx_start": 45,
+                "split_idx_start": 43,
                 "_split_overlap": [{"doc_id": "doc_1", "range": (23, 52)}],
            },
        ]