feat: added split by page to DocumentSplitter (#6753)

* feat-added-split-by-page-to-DocumentSplitter
* added test case and the suggested changes
* Update document_splitter.py
* Update haystack/components/preprocessors/document_splitter.py
* Update test_document_splitter.py

---------

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>
2025-12-12 23:37:36 +00:00 · 2024-01-17 20:06:29 +05:30 · 2024-01-17 20:06:29 +05:30 · a7ac4edd07
commit a7ac4edd07
parent 6a1514550e
3 changed files with 31 additions and 8 deletions
--- a/haystack/components/preprocessors/document_splitter.py
+++ b/haystack/components/preprocessors/document_splitter.py
@ -14,18 +14,21 @@ class DocumentSplitter:
    """

    def __init__(
-        self, split_by: Literal["word", "sentence", "passage"] = "word", split_length: int = 200, split_overlap: int = 0
+        self,
+        split_by: Literal["word", "sentence", "page", "passage"] = "word",
+        split_length: int = 200,
+        split_overlap: int = 0,
    ):
        """
        :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
-        "sentence" for splitting by ".", or "passage" for splitting by "\\n\\n".
+        "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n".
        :param split_length: The maximum number of units in each split.
        :param split_overlap: The number of units that each split should overlap.
        """

        self.split_by = split_by
-        if split_by not in ["word", "sentence", "passage"]:
-            raise ValueError("split_by must be one of 'word', 'sentence' or 'passage'.")
+        if split_by not in ["word", "sentence", "page", "passage"]:
+            raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
        if split_length <= 0:
            raise ValueError("split_length must be greater than 0.")
        self.split_length = split_length
@ -60,8 +63,10 @@ class DocumentSplitter:
            split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
        return {"documents": split_docs}

-    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]:
-        if split_by == "passage":
+    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
+        if split_by == "page":
+            split_at = "\f"
+        elif split_by == "passage":
            split_at = "\n\n"
        elif split_by == "sentence":
            split_at = "."
@ -69,7 +74,7 @@ class DocumentSplitter:
            split_at = " "
        else:
            raise NotImplementedError(
-                "DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
+                "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
            )
        units = text.split(split_at)
        # Add the delimiter back to all units except the last one
--- a/releasenotes/notes/add-split-by-page-to-DocumentSplitter-63232c17d858d787.yaml
+++ b/releasenotes/notes/add-split-by-page-to-DocumentSplitter-63232c17d858d787.yaml
@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Added a "page" option for `split_by` in DocumentSplitter, which splits the document at form feed characters ("\f").
--- a/test/components/preprocessors/test_document_splitter.py
+++ b/test/components/preprocessors/test_document_splitter.py
@ -23,7 +23,7 @@ class TestDocumentSplitter:
        assert res == {"documents": []}

    def test_unsupported_split_by(self):
-        with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
+        with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence', 'page' or 'passage'."):
            DocumentSplitter(split_by="unsupported")

    def test_unsupported_split_length(self):
@ -94,6 +94,20 @@ class TestDocumentSplitter:
        assert result["documents"][1].content == "And there is a third sentence.\n\n"
        assert result["documents"][2].content == " And another passage."

+    def test_split_by_page(self):
+        splitter = DocumentSplitter(split_by="page", split_length=1)
+        result = splitter.run(
+            documents=[
+                Document(
+                    content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+                )
+            ]
+        )
+        assert len(result["documents"]) == 3
+        assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\x0c"
+        assert result["documents"][1].content == " And there is a third sentence.\x0c"
+        assert result["documents"][2].content == " And another passage."
+
    def test_split_by_word_with_overlap(self):
        splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
        result = splitter.run(