feat: add page_number to metadata in DocumentSplitter (#7599)

* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705. * Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x. Solve some minor bugs spotted by tests. * Update docstrings. * Add reno. * Update haystack/components/preprocessors/document_splitter.py Update docstring from suggestion Co-authored-by: David S. Batista <dsbatista@gmail.com> * solve suggestion to improve readability * fragment tests * Update haystack/components/preprocessors/document_splitter.py Co-authored-by: David S. Batista <dsbatista@gmail.com> * Update .gitignore * Update .gitignore * Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml * blackening --------- Co-authored-by: David S. Batista <dsbatista@gmail.com>
2026-01-06 12:07:04 +00:00 · 2024-04-29 12:51:18 +02:00 · 2024-04-29 12:51:18 +02:00 · d2c87b2fd9
commit d2c87b2fd9
parent 8d04e530da
3 changed files with 136 additions and 7 deletions
--- a/haystack/components/preprocessors/document_splitter.py
+++ b/haystack/components/preprocessors/document_splitter.py
@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import List, Literal
+from typing import Dict, List, Literal, Tuple

 from more_itertools import windowed

@ -53,7 +53,7 @@ class DocumentSplitter:

        :returns: A dictionary with the following key:
            - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
-            document to keep track of the original document that was split. Other metadata are copied from the original
+            document to keep track of the original document that was split. Another metadata field "page_number" is added to each document to keep track of the page it belonged to in the original document. Other metadata are copied from the original
            document.

        :raises TypeError: if the input is not a list of Documents.
@ -70,10 +70,12 @@ class DocumentSplitter:
                    f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None."
                )
            units = self._split_into_units(doc.content, self.split_by)
-            text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
+            text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap)
            metadata = deepcopy(doc.meta)
            metadata["source_id"] = doc.id
-            split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
+            split_docs += self._create_docs_from_splits(
+                text_splits=text_splits, splits_pages=splits_pages, meta=metadata
+            )
        return {"documents": split_docs}

    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
@ -95,15 +97,40 @@ class DocumentSplitter:
            units[i] += split_at
        return units

-    def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]:
+    def _concatenate_units(
+        self, elements: List[str], split_length: int, split_overlap: int
+    ) -> Tuple[List[str], List[int]]:
        """
-        Concatenates the elements into parts of split_length units.
+        Concatenates the elements into parts of split_length units keeping track of the original page number that each element belongs to.
        """
        text_splits = []
+        splits_pages = []
+        cur_page = 1
        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
        for seg in segments:
            current_units = [unit for unit in seg if unit is not None]
            txt = "".join(current_units)
            if len(txt) > 0:
                text_splits.append(txt)
-        return text_splits
+                splits_pages.append(cur_page)
+                processed_units = current_units[: split_length - split_overlap]
+                if self.split_by == "page":
+                    num_page_breaks = len(processed_units)
+                else:
+                    num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
+                cur_page += num_page_breaks
+        return text_splits, splits_pages
+
+    @staticmethod
+    def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]:
+        """
+        Creates Document objects from text splits enriching them with page number and the metadata of the original document.
+        """
+        documents: List[Document] = []
+
+        for i, txt in enumerate(text_splits):
+            meta = deepcopy(meta)
+            doc = Document(content=txt, meta=meta)
+            doc.meta["page_number"] = splits_pages[i]
+            documents.append(doc)
+        return documents
--- a/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml
+++ b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml
@ -0,0 +1,7 @@
+---
+highlights: >
+    Add the "page_number" field to the metadata of all output documents.
+
+enhancements:
+  - |
+    Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to.
--- a/test/components/preprocessors/test_document_splitter.py
+++ b/test/components/preprocessors/test_document_splitter.py
@ -141,3 +141,98 @@ class TestDocumentSplitter:
        for doc, split_doc in zip(documents, result["documents"]):
            assert doc.meta.items() <= split_doc.meta.items()
            assert split_doc.content == "Text."
+
+    def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
+        splitter = DocumentSplitter(split_by="word", split_length=2)
+        doc1 = Document(content="This is some text.\f This text is on another page.")
+        doc2 = Document(content="This content has two.\f\f page brakes.")
+        result = splitter.run(documents=[doc1, doc2])
+
+        expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
+        for doc, p in zip(result["documents"], expected_pages):
+            assert doc.meta["page_number"] == p
+
+    def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
+        splitter = DocumentSplitter(split_by="sentence", split_length=1)
+        doc1 = Document(content="This is some text.\f This text is on another page.")
+        doc2 = Document(content="This content has two.\f\f page brakes.")
+        result = splitter.run(documents=[doc1, doc2])
+
+        expected_pages = [1, 1, 1, 1]
+        for doc, p in zip(result["documents"], expected_pages):
+            assert doc.meta["page_number"] == p
+
+    def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
+        splitter = DocumentSplitter(split_by="passage", split_length=1)
+        doc1 = Document(
+            content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
+        )
+        result = splitter.run(documents=[doc1])
+
+        expected_pages = [1, 2, 2, 2]
+        for doc, p in zip(result["documents"], expected_pages):
+            assert doc.meta["page_number"] == p
+
+    def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
+        splitter = DocumentSplitter(split_by="page", split_length=1)
+        doc1 = Document(
+            content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        )
+        result = splitter.run(documents=[doc1])
+        expected_pages = [1, 2, 3]
+        for doc, p in zip(result["documents"], expected_pages):
+            assert doc.meta["page_number"] == p
+
+        splitter = DocumentSplitter(split_by="page", split_length=2)
+        doc1 = Document(
+            content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        )
+        result = splitter.run(documents=[doc1])
+        expected_pages = [1, 3]
+
+        for doc, p in zip(result["documents"], expected_pages):
+            assert doc.meta["page_number"] == p
+
+    def test_add_page_number_to_metadata_with_overlap_word_split(self):
+        splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
+        doc1 = Document(content="This is some text. And\f this text is on another page.")
+        doc2 = Document(content="This content has two.\f\f page brakes.")
+        result = splitter.run(documents=[doc1, doc2])
+
+        expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
+        for doc, p in zip(result["documents"], expected_pages):
+            print(doc.content, doc.meta, p)
+            assert doc.meta["page_number"] == p
+
+    def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
+        splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
+        doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
+        doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+        result = splitter.run(documents=[doc1, doc2])
+
+        expected_pages = [1, 1, 1, 2, 1, 1]
+        for doc, p in zip(result["documents"], expected_pages):
+            print(doc.content, doc.meta, p)
+            assert doc.meta["page_number"] == p
+
+    def test_add_page_number_to_metadata_with_overlap_passage_split(self):
+        splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
+        doc1 = Document(
+            content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
+        )
+        result = splitter.run(documents=[doc1])
+
+        expected_pages = [1, 2, 2]
+        for doc, p in zip(result["documents"], expected_pages):
+            assert doc.meta["page_number"] == p
+
+    def test_add_page_number_to_metadata_with_overlap_page_split(self):
+        splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
+        doc1 = Document(
+            content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        )
+        result = splitter.run(documents=[doc1])
+        expected_pages = [1, 2, 3]
+
+        for doc, p in zip(result["documents"], expected_pages):
+            assert doc.meta["page_number"] == p