mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-30 00:30:09 +00:00
fix: Fix split_start_idx and _split_overlap information in DocumentSplitter (#8046)
* Fix bug in DocumentSplitter and expand tests to catch said bug * Fix split overlap information calc and actually test it * Add release notes * Remove comments * Same fix in SentenceWindowRetrieval --------- Co-authored-by: Vladimir Blagojevic <dovlex@gmail.com>
This commit is contained in:
parent
b36ec0a38c
commit
baed478f23
@ -133,7 +133,6 @@ class DocumentSplitter:
|
||||
text_splits: List[str] = []
|
||||
splits_pages = []
|
||||
splits_start_idxs = []
|
||||
split_at_len = len(self.split_at)
|
||||
cur_start_idx = 0
|
||||
cur_page = 1
|
||||
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
|
||||
@ -153,7 +152,7 @@ class DocumentSplitter:
|
||||
splits_start_idxs.append(cur_start_idx)
|
||||
|
||||
processed_units = current_units[: split_length - split_overlap]
|
||||
cur_start_idx += len("".join(processed_units)) + split_at_len
|
||||
cur_start_idx += len("".join(processed_units))
|
||||
|
||||
if self.split_by == "page":
|
||||
num_page_breaks = len(processed_units)
|
||||
@ -207,7 +206,7 @@ class DocumentSplitter:
|
||||
:param previous_doc: The Document that was split before the current Document.
|
||||
:param previous_doc_start_idx: The starting index of the previous Document.
|
||||
"""
|
||||
overlapping_range = (current_doc_start_idx - previous_doc_start_idx - 1, len(previous_doc.content) - 1) # type: ignore
|
||||
overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content)) # type: ignore
|
||||
|
||||
if overlapping_range[0] < overlapping_range[1]:
|
||||
overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] # type: ignore
|
||||
|
||||
@ -85,7 +85,6 @@ class SentenceWindowRetrieval:
|
||||
start = max(start, last_idx_end)
|
||||
|
||||
# append the non-overlapping part to the merged text
|
||||
merged_text = merged_text.strip()
|
||||
merged_text += doc.content[start - doc.meta["split_idx_start"] :] # type: ignore
|
||||
|
||||
# update the last end index
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
The DocumentSplitter was incorrectly calculating the `split_start_idx` and `_split_overlap` information due to slight miscalculations of appropriate indices.
|
||||
This fixes those so the `split_start_idx` and `_split_overlap` information is correct.
|
||||
@ -20,7 +20,6 @@ def merge_documents(documents):
|
||||
start = last_idx_end
|
||||
|
||||
# append the non-overlapping part to the merged text
|
||||
merged_text = merged_text.strip()
|
||||
merged_text += doc.content[start - doc.meta["split_idx_start"] :]
|
||||
|
||||
# update the last end index
|
||||
@ -61,16 +60,16 @@ class TestDocumentSplitter:
|
||||
|
||||
def test_split_by_word(self):
|
||||
splitter = DocumentSplitter(split_by="word", split_length=10)
|
||||
result = splitter.run(
|
||||
documents=[
|
||||
Document(
|
||||
content="This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
)
|
||||
]
|
||||
)
|
||||
assert len(result["documents"]) == 2
|
||||
assert result["documents"][0].content == "This is a text with some words. There is a "
|
||||
assert result["documents"][1].content == "second sentence. And there is a third sentence."
|
||||
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
result = splitter.run(documents=[Document(content=text)])
|
||||
docs = result["documents"]
|
||||
assert len(docs) == 2
|
||||
assert docs[0].content == "This is a text with some words. There is a "
|
||||
assert docs[0].meta["split_id"] == 0
|
||||
assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
|
||||
assert docs[1].content == "second sentence. And there is a third sentence."
|
||||
assert docs[1].meta["split_id"] == 1
|
||||
assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
|
||||
|
||||
def test_split_by_word_with_threshold(self):
|
||||
splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
|
||||
@ -89,77 +88,101 @@ class TestDocumentSplitter:
|
||||
|
||||
def test_split_by_word_multiple_input_docs(self):
|
||||
splitter = DocumentSplitter(split_by="word", split_length=10)
|
||||
result = splitter.run(
|
||||
documents=[
|
||||
Document(
|
||||
content="This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
),
|
||||
Document(
|
||||
content="This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
|
||||
),
|
||||
]
|
||||
)
|
||||
assert len(result["documents"]) == 5
|
||||
assert result["documents"][0].content == "This is a text with some words. There is a "
|
||||
assert result["documents"][1].content == "second sentence. And there is a third sentence."
|
||||
assert result["documents"][2].content == "This is a different text with some words. There is "
|
||||
assert result["documents"][3].content == "a second sentence. And there is a third sentence. And "
|
||||
assert result["documents"][4].content == "there is a fourth sentence."
|
||||
text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
|
||||
result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
|
||||
docs = result["documents"]
|
||||
assert len(docs) == 5
|
||||
# doc 0
|
||||
assert docs[0].content == "This is a text with some words. There is a "
|
||||
assert docs[0].meta["split_id"] == 0
|
||||
assert docs[0].meta["split_idx_start"] == text1.index(docs[0].content)
|
||||
# doc 1
|
||||
assert docs[1].content == "second sentence. And there is a third sentence."
|
||||
assert docs[1].meta["split_id"] == 1
|
||||
assert docs[1].meta["split_idx_start"] == text1.index(docs[1].content)
|
||||
# doc 2
|
||||
assert docs[2].content == "This is a different text with some words. There is "
|
||||
assert docs[2].meta["split_id"] == 0
|
||||
assert docs[2].meta["split_idx_start"] == text2.index(docs[2].content)
|
||||
# doc 3
|
||||
assert docs[3].content == "a second sentence. And there is a third sentence. And "
|
||||
assert docs[3].meta["split_id"] == 1
|
||||
assert docs[3].meta["split_idx_start"] == text2.index(docs[3].content)
|
||||
# doc 4
|
||||
assert docs[4].content == "there is a fourth sentence."
|
||||
assert docs[4].meta["split_id"] == 2
|
||||
assert docs[4].meta["split_idx_start"] == text2.index(docs[4].content)
|
||||
|
||||
def test_split_by_sentence(self):
|
||||
splitter = DocumentSplitter(split_by="sentence", split_length=1)
|
||||
result = splitter.run(
|
||||
documents=[
|
||||
Document(
|
||||
content="This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
)
|
||||
]
|
||||
)
|
||||
assert len(result["documents"]) == 3
|
||||
assert result["documents"][0].content == "This is a text with some words."
|
||||
assert result["documents"][1].content == " There is a second sentence."
|
||||
assert result["documents"][2].content == " And there is a third sentence."
|
||||
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
result = splitter.run(documents=[Document(content=text)])
|
||||
docs = result["documents"]
|
||||
assert len(docs) == 3
|
||||
assert docs[0].content == "This is a text with some words."
|
||||
assert docs[0].meta["split_id"] == 0
|
||||
assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
|
||||
assert docs[1].content == " There is a second sentence."
|
||||
assert docs[1].meta["split_id"] == 1
|
||||
assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
|
||||
assert docs[2].content == " And there is a third sentence."
|
||||
assert docs[2].meta["split_id"] == 2
|
||||
assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)
|
||||
|
||||
def test_split_by_passage(self):
|
||||
splitter = DocumentSplitter(split_by="passage", split_length=1)
|
||||
result = splitter.run(
|
||||
documents=[
|
||||
Document(
|
||||
content="This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
|
||||
)
|
||||
]
|
||||
)
|
||||
assert len(result["documents"]) == 3
|
||||
assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\n\n"
|
||||
assert result["documents"][1].content == "And there is a third sentence.\n\n"
|
||||
assert result["documents"][2].content == " And another passage."
|
||||
text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
|
||||
result = splitter.run(documents=[Document(content=text)])
|
||||
docs = result["documents"]
|
||||
assert len(docs) == 3
|
||||
assert docs[0].content == "This is a text with some words. There is a second sentence.\n\n"
|
||||
assert docs[0].meta["split_id"] == 0
|
||||
assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
|
||||
assert docs[1].content == "And there is a third sentence.\n\n"
|
||||
assert docs[1].meta["split_id"] == 1
|
||||
assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
|
||||
assert docs[2].content == " And another passage."
|
||||
assert docs[2].meta["split_id"] == 2
|
||||
assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)
|
||||
|
||||
def test_split_by_page(self):
|
||||
splitter = DocumentSplitter(split_by="page", split_length=1)
|
||||
result = splitter.run(
|
||||
documents=[
|
||||
Document(
|
||||
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
|
||||
)
|
||||
]
|
||||
)
|
||||
assert len(result["documents"]) == 3
|
||||
assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\x0c"
|
||||
assert result["documents"][1].content == " And there is a third sentence.\x0c"
|
||||
assert result["documents"][2].content == " And another passage."
|
||||
text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
|
||||
result = splitter.run(documents=[Document(content=text)])
|
||||
docs = result["documents"]
|
||||
assert len(docs) == 3
|
||||
assert docs[0].content == "This is a text with some words. There is a second sentence.\f"
|
||||
assert docs[0].meta["split_id"] == 0
|
||||
assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
|
||||
assert docs[0].meta["page_number"] == 1
|
||||
assert docs[1].content == " And there is a third sentence.\f"
|
||||
assert docs[1].meta["split_id"] == 1
|
||||
assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
|
||||
assert docs[1].meta["page_number"] == 2
|
||||
assert docs[2].content == " And another passage."
|
||||
assert docs[2].meta["split_id"] == 2
|
||||
assert docs[2].meta["split_idx_start"] == text.index(docs[2].content)
|
||||
assert docs[2].meta["page_number"] == 3
|
||||
|
||||
def test_split_by_word_with_overlap(self):
|
||||
splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
|
||||
result = splitter.run(
|
||||
documents=[
|
||||
Document(
|
||||
content="This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
)
|
||||
]
|
||||
)
|
||||
assert len(result["documents"]) == 2
|
||||
assert result["documents"][0].content == "This is a text with some words. There is a "
|
||||
assert result["documents"][1].content == "is a second sentence. And there is a third sentence."
|
||||
text = "This is a text with some words. There is a second sentence. And there is a third sentence."
|
||||
result = splitter.run(documents=[Document(content=text)])
|
||||
docs = result["documents"]
|
||||
assert len(docs) == 2
|
||||
# doc 0
|
||||
assert docs[0].content == "This is a text with some words. There is a "
|
||||
assert docs[0].meta["split_id"] == 0
|
||||
assert docs[0].meta["split_idx_start"] == text.index(docs[0].content)
|
||||
assert docs[0].meta["_split_overlap"][0]["range"] == (0, 5)
|
||||
assert docs[1].content[0:5] == "is a "
|
||||
# doc 1
|
||||
assert docs[1].content == "is a second sentence. And there is a third sentence."
|
||||
assert docs[1].meta["split_id"] == 1
|
||||
assert docs[1].meta["split_idx_start"] == text.index(docs[1].content)
|
||||
assert docs[1].meta["_split_overlap"][0]["range"] == (38, 43)
|
||||
assert docs[0].content[38:43] == "is a "
|
||||
|
||||
def test_source_id_stored_in_metadata(self):
|
||||
splitter = DocumentSplitter(split_by="word", split_length=10)
|
||||
@ -277,13 +300,32 @@ class TestDocumentSplitter:
|
||||
|
||||
def test_add_split_overlap_information(self):
|
||||
splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
|
||||
text = "This is a text with some words. There is a second sentence. And a third sentence."
|
||||
doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
|
||||
docs = splitter.run(documents=[doc])
|
||||
docs = splitter.run(documents=[doc])["documents"]
|
||||
|
||||
# check split_overlap is added to all the documents
|
||||
assert len(docs["documents"]) == 3
|
||||
for d in docs["documents"]:
|
||||
assert "_split_overlap" in d.meta
|
||||
assert len(docs) == 3
|
||||
# doc 0
|
||||
assert docs[0].content == "This is a text with some words. There is a "
|
||||
assert docs[0].meta["split_id"] == 0
|
||||
assert docs[0].meta["split_idx_start"] == text.index(docs[0].content) # 0
|
||||
assert docs[0].meta["_split_overlap"][0]["range"] == (0, 23)
|
||||
assert docs[1].content[0:23] == "some words. There is a "
|
||||
# doc 1
|
||||
assert docs[1].content == "some words. There is a second sentence. And a third "
|
||||
assert docs[1].meta["split_id"] == 1
|
||||
assert docs[1].meta["split_idx_start"] == text.index(docs[1].content) # 20
|
||||
assert docs[1].meta["_split_overlap"][0]["range"] == (20, 43)
|
||||
assert docs[1].meta["_split_overlap"][1]["range"] == (0, 29)
|
||||
assert docs[0].content[20:43] == "some words. There is a "
|
||||
assert docs[2].content[0:29] == "second sentence. And a third "
|
||||
# doc 2
|
||||
assert docs[2].content == "second sentence. And a third sentence."
|
||||
assert docs[2].meta["split_id"] == 2
|
||||
assert docs[2].meta["split_idx_start"] == text.index(docs[2].content) # 43
|
||||
assert docs[2].meta["_split_overlap"][0]["range"] == (23, 52)
|
||||
assert docs[1].content[23:52] == "second sentence. And a third "
|
||||
|
||||
# reconstruct the original document content from the split documents
|
||||
assert doc.content == merge_documents(docs["documents"])
|
||||
assert doc.content == merge_documents(docs)
|
||||
|
||||
@ -29,7 +29,7 @@ class TestSentenceWindowRetrieval:
|
||||
"page_number": 1,
|
||||
"split_id": 0,
|
||||
"split_idx_start": 0,
|
||||
"_split_overlap": [{"doc_id": "doc_1", "range": (0, 22)}],
|
||||
"_split_overlap": [{"doc_id": "doc_1", "range": (0, 23)}],
|
||||
},
|
||||
{
|
||||
"id": "doc_1",
|
||||
@ -37,8 +37,8 @@ class TestSentenceWindowRetrieval:
|
||||
"source_id": "c5d7c632affc486d0cfe7b3c0f4dc1d3896ea720da2b538d6d10b104a3df5f99",
|
||||
"page_number": 1,
|
||||
"split_id": 1,
|
||||
"split_idx_start": 21,
|
||||
"_split_overlap": [{"doc_id": "doc_0", "range": (20, 42)}, {"doc_id": "doc_2", "range": (0, 29)}],
|
||||
"split_idx_start": 20,
|
||||
"_split_overlap": [{"doc_id": "doc_0", "range": (20, 43)}, {"doc_id": "doc_2", "range": (0, 29)}],
|
||||
},
|
||||
{
|
||||
"id": "doc_2",
|
||||
@ -46,7 +46,7 @@ class TestSentenceWindowRetrieval:
|
||||
"source_id": "c5d7c632affc486d0cfe7b3c0f4dc1d3896ea720da2b538d6d10b104a3df5f99",
|
||||
"page_number": 1,
|
||||
"split_id": 2,
|
||||
"split_idx_start": 45,
|
||||
"split_idx_start": 43,
|
||||
"_split_overlap": [{"doc_id": "doc_1", "range": (23, 52)}],
|
||||
},
|
||||
]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user