feat: adding split_id and split_overlap to DocumentSplitter (#7933)

* wip: adding _split_overlap

* fixing join issue for _split_overlap

* adding tests

* adding release notes

* cleaning and fixing tests

* making mypy happy

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* adding docstrings

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
David S. Batista 2024-06-27 15:07:43 +02:00 committed by GitHub
parent 569b2a87cb
commit 91f57015c0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 107 additions and 16 deletions

View File

@ -90,38 +90,38 @@ class DocumentSplitter:
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
)
units = self._split_into_units(doc.content, self.split_by)
text_splits, splits_pages = self._concatenate_units(
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
units, self.split_length, self.split_overlap, self.split_threshold
)
metadata = deepcopy(doc.meta)
metadata["source_id"] = doc.id
split_docs += self._create_docs_from_splits(
text_splits=text_splits, splits_pages=splits_pages, meta=metadata
text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
)
return {"documents": split_docs}
def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
if split_by == "page":
split_at = "\f"
self.split_at = "\f"
elif split_by == "passage":
split_at = "\n\n"
self.split_at = "\n\n"
elif split_by == "sentence":
split_at = "."
self.split_at = "."
elif split_by == "word":
split_at = " "
self.split_at = " "
else:
raise NotImplementedError(
"DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
)
units = text.split(split_at)
units = text.split(self.split_at)
# Add the delimiter back to all units except the last one
for i in range(len(units) - 1):
units[i] += split_at
units[i] += self.split_at
return units
def _concatenate_units(
    self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
) -> Tuple[List[str], List[int], List[int]]:
    """
    Concatenates the elements into parts of split_length units.

    Keeps track of the page number each split starts on and of the character index,
    within the original document content, where each split starts. Splits with fewer
    than `split_threshold` units are merged into the previous split instead of
    creating a new one.

    :param elements: The units (words, sentences, passages or pages) to concatenate.
    :param split_length: The maximum number of units per split.
    :param split_overlap: The number of units each split shares with the previous one.
    :param split_threshold: The minimum number of units a split must have to stand alone.
    :returns: A tuple of (split texts, starting page number of each split,
        starting character index of each split in the original text).
    """
    text_splits: List[str] = []
    splits_pages = []
    splits_start_idxs = []
    split_at_len = len(self.split_at)
    cur_start_idx = 0
    cur_page = 1
    segments = windowed(elements, n=split_length, step=split_length - split_overlap)

    for seg in segments:
        current_units = [unit for unit in seg if unit is not None]
        txt = "".join(current_units)

        # check if length of current units is below split_threshold
        if len(current_units) < split_threshold and len(text_splits) > 0:
            # concatenate the last split with the current one
            text_splits[-1] += txt
        elif len(txt) > 0:
            text_splits.append(txt)
            splits_pages.append(cur_page)
            splits_start_idxs.append(cur_start_idx)

        # advance the start index past the units this step consumes; the extra
        # split_at_len places the index just past the trailing delimiter
        processed_units = current_units[: split_length - split_overlap]
        cur_start_idx += len("".join(processed_units)) + split_at_len

        if self.split_by == "page":
            num_page_breaks = len(processed_units)
        else:
            num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
        cur_page += num_page_breaks

    return text_splits, splits_pages, splits_start_idxs
def _create_docs_from_splits(
    self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict
) -> List[Document]:
    """
    Creates Document objects from splits enriching them with page number and the metadata of the original document.

    :param text_splits: The content of each split.
    :param splits_pages: The page number each split starts on.
    :param splits_start_idxs: The starting character index of each split in the original content.
    :param meta: The metadata of the original document; a deep copy is attached to every split.
    :returns: The list of split Documents.
    """
    documents: List[Document] = []

    for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
        # deep-copy per split so splits don't share (and mutate) the same meta dict
        meta = deepcopy(meta)
        doc = Document(content=txt, meta=meta)
        doc.meta["page_number"] = splits_pages[i]
        doc.meta["split_id"] = i
        doc.meta["split_idx_start"] = split_idx
        documents.append(doc)

        if self.split_overlap <= 0:
            continue

        doc.meta["_split_overlap"] = []

        # the first split has no previous split to overlap with
        if i == 0:
            continue

        doc_start_idx = splits_start_idxs[i]
        previous_doc = documents[i - 1]
        previous_doc_start_idx = splits_start_idxs[i - 1]
        self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)

    return documents
@staticmethod
def _add_split_overlap_information(
current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
):
"""
Adds split overlap information to the current and previous Document's meta.
:param current_doc: The Document that is being split.
:param current_doc_start_idx: The starting index of the current Document.
:param previous_doc: The Document that was split before the current Document.
:param previous_doc_start_idx: The starting index of the previous Document.
"""
overlapping_range = (current_doc_start_idx - previous_doc_start_idx - 1, len(previous_doc.content) - 1) # type: ignore
if overlapping_range[0] < overlapping_range[1]:
overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] # type: ignore
if current_doc.content.startswith(overlapping_str): # type: ignore
# add split overlap information to this Document regarding the previous Document
current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})
# add split overlap information to previous Document regarding this Document
overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})

View File

@ -0,0 +1,4 @@
---
features:
- |
The `DocumentSplitter` now supports `split_id` and `split_overlap`, allowing for more control over the splitting process.

View File

@ -7,6 +7,28 @@ from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
def merge_documents(documents):
    """
    Merge a list of doc chunks into a single doc by concatenating their content, eliminating overlapping content.

    :param documents: Split documents carrying `meta["split_idx_start"]` (their start
        index in the original text); may be given in any order.
    :returns: The reconstructed original text.
    """
    sorted_docs = sorted(documents, key=lambda doc: doc.meta["split_idx_start"])
    merged_text = ""
    last_idx_end = 0
    for doc in sorted_docs:
        start = doc.meta["split_idx_start"]  # start of the current content
        # if the start of the current content is before the end of the last appended content, adjust it
        if start < last_idx_end:
            start = last_idx_end
        # append the non-overlapping part to the merged text
        merged_text = merged_text.strip()
        merged_text += doc.content[start - doc.meta["split_idx_start"] :]
        # update the last end index
        last_idx_end = doc.meta["split_idx_start"] + len(doc.content)
    return merged_text
class TestDocumentSplitter:
def test_non_text_document(self):
with pytest.raises(
@ -219,7 +241,6 @@ class TestDocumentSplitter:
expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
@ -230,7 +251,6 @@ class TestDocumentSplitter:
expected_pages = [1, 1, 1, 2, 1, 1]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_passage_split(self):
@ -254,3 +274,16 @@ class TestDocumentSplitter:
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
def test_add_split_overlap_information(self):
    """Splitting with overlap records `_split_overlap` meta and remains lossless."""
    splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
    doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
    docs = splitter.run(documents=[doc])

    # check split_overlap is added to all the documents
    assert len(docs["documents"]) == 3
    for d in docs["documents"]:
        assert "_split_overlap" in d.meta

    # reconstruct the original document content from the split documents
    assert doc.content == merge_documents(docs["documents"])