feat: adding split_id and split_overlap to DocumentSplitter (#7933)

* wip: adding _split_overlap

* fixing join issue for _split_overlap

* adding tests

* adding release notes

* cleaning and fixing tests

* making mypy happy

* Update haystack/components/preprocessors/document_splitter.py

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* adding docstrings

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in:
David S. Batista 2024-06-27 15:07:43 +02:00 committed by GitHub
parent 569b2a87cb
commit 91f57015c0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 107 additions and 16 deletions

View File

@ -90,38 +90,38 @@ class DocumentSplitter:
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
)
units = self._split_into_units(doc.content, self.split_by)
text_splits, splits_pages = self._concatenate_units(
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
units, self.split_length, self.split_overlap, self.split_threshold
)
metadata = deepcopy(doc.meta)
metadata["source_id"] = doc.id
split_docs += self._create_docs_from_splits(
text_splits=text_splits, splits_pages=splits_pages, meta=metadata
text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
)
return {"documents": split_docs}
def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
if split_by == "page":
split_at = "\f"
self.split_at = "\f"
elif split_by == "passage":
split_at = "\n\n"
self.split_at = "\n\n"
elif split_by == "sentence":
split_at = "."
self.split_at = "."
elif split_by == "word":
split_at = " "
self.split_at = " "
else:
raise NotImplementedError(
"DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
)
units = text.split(split_at)
units = text.split(self.split_at)
# Add the delimiter back to all units except the last one
for i in range(len(units) - 1):
units[i] += split_at
units[i] += self.split_at
return units
def _concatenate_units(
    self, elements: List[str], split_length: int, split_overlap: int, split_threshold: int
) -> Tuple[List[str], List[int], List[int]]:
    """
    Concatenates the elements into parts of split_length units.

    Keeps track of the page number each split starts on and of the character index,
    within the original document content, where each split starts. Splits with fewer
    than `split_threshold` units are merged into the previous split instead of
    creating a new one.

    :param elements: The units (words, sentences, passages or pages) to concatenate.
    :param split_length: The maximum number of units per split.
    :param split_overlap: The number of units each split shares with the previous one.
    :param split_threshold: The minimum number of units a split must have to stand alone.
    :returns: A tuple of (split texts, starting page number of each split,
        starting character index of each split in the original text).
    """
    text_splits: List[str] = []
    splits_pages = []
    splits_start_idxs = []
    split_at_len = len(self.split_at)
    cur_start_idx = 0
    cur_page = 1
    segments = windowed(elements, n=split_length, step=split_length - split_overlap)

    for seg in segments:
        current_units = [unit for unit in seg if unit is not None]
        txt = "".join(current_units)

        # check if length of current units is below split_threshold
        if len(current_units) < split_threshold and len(text_splits) > 0:
            # concatenate the last split with the current one
            text_splits[-1] += txt
        elif len(txt) > 0:
            text_splits.append(txt)
            splits_pages.append(cur_page)
            splits_start_idxs.append(cur_start_idx)

        # advance the start index past the units this step consumes; the extra
        # split_at_len places the index just past the trailing delimiter
        processed_units = current_units[: split_length - split_overlap]
        cur_start_idx += len("".join(processed_units)) + split_at_len

        if self.split_by == "page":
            num_page_breaks = len(processed_units)
        else:
            num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
        cur_page += num_page_breaks

    return text_splits, splits_pages, splits_start_idxs
def _create_docs_from_splits(
    self, text_splits: List[str], splits_pages: List[int], splits_start_idxs: List[int], meta: Dict
) -> List[Document]:
    """
    Creates Document objects from splits enriching them with page number and the metadata of the original document.

    :param text_splits: The content of each split.
    :param splits_pages: The page number each split starts on.
    :param splits_start_idxs: The starting character index of each split in the original content.
    :param meta: The metadata of the original document; a deep copy is attached to every split.
    :returns: The list of split Documents.
    """
    documents: List[Document] = []

    for i, (txt, split_idx) in enumerate(zip(text_splits, splits_start_idxs)):
        # deep-copy per split so splits don't share (and mutate) the same meta dict
        meta = deepcopy(meta)
        doc = Document(content=txt, meta=meta)
        doc.meta["page_number"] = splits_pages[i]
        doc.meta["split_id"] = i
        doc.meta["split_idx_start"] = split_idx
        documents.append(doc)

        if self.split_overlap <= 0:
            continue

        doc.meta["_split_overlap"] = []

        # the first split has no previous split to overlap with
        if i == 0:
            continue

        doc_start_idx = splits_start_idxs[i]
        previous_doc = documents[i - 1]
        previous_doc_start_idx = splits_start_idxs[i - 1]
        self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)

    return documents
@staticmethod
def _add_split_overlap_information(
current_doc: Document, current_doc_start_idx: int, previous_doc: Document, previous_doc_start_idx: int
):
"""
Adds split overlap information to the current and previous Document's meta.
:param current_doc: The Document that is being split.
:param current_doc_start_idx: The starting index of the current Document.
:param previous_doc: The Document that was split before the current Document.
:param previous_doc_start_idx: The starting index of the previous Document.
"""
overlapping_range = (current_doc_start_idx - previous_doc_start_idx - 1, len(previous_doc.content) - 1) # type: ignore
if overlapping_range[0] < overlapping_range[1]:
overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]] # type: ignore
if current_doc.content.startswith(overlapping_str): # type: ignore
# add split overlap information to this Document regarding the previous Document
current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})
# add split overlap information to previous Document regarding this Document
overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})

View File

@ -0,0 +1,4 @@
---
features:
- |
The `DocumentSplitter` now supports `split_id` and `split_overlap`, allowing for more control over the splitting process.

View File

@ -7,6 +7,28 @@ from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
def merge_documents(documents):
    """
    Merge a list of doc chunks into a single doc by concatenating their content, eliminating overlapping content.

    :param documents: Split documents carrying `meta["split_idx_start"]` (their start
        index in the original text); may be given in any order.
    :returns: The reconstructed original text.
    """
    sorted_docs = sorted(documents, key=lambda doc: doc.meta["split_idx_start"])
    merged_text = ""
    last_idx_end = 0
    for doc in sorted_docs:
        start = doc.meta["split_idx_start"]  # start of the current content
        # if the start of the current content is before the end of the last appended content, adjust it
        if start < last_idx_end:
            start = last_idx_end
        # append the non-overlapping part to the merged text
        merged_text = merged_text.strip()
        merged_text += doc.content[start - doc.meta["split_idx_start"] :]
        # update the last end index
        last_idx_end = doc.meta["split_idx_start"] + len(doc.content)
    return merged_text
class TestDocumentSplitter:
def test_non_text_document(self):
with pytest.raises(
@ -219,7 +241,6 @@ class TestDocumentSplitter:
expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
@ -230,7 +251,6 @@ class TestDocumentSplitter:
expected_pages = [1, 1, 1, 2, 1, 1]
for doc, p in zip(result["documents"], expected_pages):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p
def test_add_page_number_to_metadata_with_overlap_passage_split(self):
@ -254,3 +274,16 @@ class TestDocumentSplitter:
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p
def test_add_split_overlap_information(self):
    """Splitting with overlap records `_split_overlap` meta and remains lossless."""
    splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
    doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
    docs = splitter.run(documents=[doc])

    # check split_overlap is added to all the documents
    assert len(docs["documents"]) == 3
    for d in docs["documents"]:
        assert "_split_overlap" in d.meta

    # reconstruct the original document content from the split documents
    assert doc.content == merge_documents(docs["documents"])