diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index adea7cc3c..033f55a89 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import List, Literal +from typing import Dict, List, Literal, Tuple from more_itertools import windowed @@ -53,7 +53,7 @@ class DocumentSplitter: :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each - document to keep track of the original document that was split. Other metadata are copied from the original + document to keep track of the original document that was split. Another metadata field "page_number" is added to each number to keep track of the page it belonged to in the original document. Other metadata are copied from the original document. :raises TypeError: if the input is not a list of Documents. @@ -70,10 +70,12 @@ class DocumentSplitter: f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None." ) units = self._split_into_units(doc.content, self.split_by) - text_splits = self._concatenate_units(units, self.split_length, self.split_overlap) + text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap) metadata = deepcopy(doc.meta) metadata["source_id"] = doc.id - split_docs += [Document(content=txt, meta=metadata) for txt in text_splits] + split_docs += self._create_docs_from_splits( + text_splits=text_splits, splits_pages=splits_pages, meta=metadata + ) return {"documents": split_docs} def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]: @@ -95,15 +97,40 @@ class DocumentSplitter: units[i] += split_at return units - def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]: + def _concatenate_units( + self, elements: List[str], split_length: int, split_overlap: int + ) -> Tuple[List[str], List[int]]: """ - Concatenates the elements into parts of split_length units. + Concatenates the elements into parts of split_length units keeping track of the original page number that each element belongs. """ text_splits = [] + splits_pages = [] + cur_page = 1 segments = windowed(elements, n=split_length, step=split_length - split_overlap) for seg in segments: current_units = [unit for unit in seg if unit is not None] txt = "".join(current_units) if len(txt) > 0: text_splits.append(txt) - return text_splits + splits_pages.append(cur_page) + processed_units = current_units[: split_length - split_overlap] + if self.split_by == "page": + num_page_breaks = len(processed_units) + else: + num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units) + cur_page += num_page_breaks + return text_splits, splits_pages + + @staticmethod + def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]: + """ + Creates Document objects from text splits enriching them with page number and the metadata of the original document. + """ + documents: List[Document] = [] + + for i, txt in enumerate(text_splits): + meta = deepcopy(meta) + doc = Document(content=txt, meta=meta) + doc.meta["page_number"] = splits_pages[i] + documents.append(doc) + return documents diff --git a/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml new file mode 100644 index 000000000..8c97663cf --- /dev/null +++ b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml @@ -0,0 +1,7 @@ +--- +highlights: > + Add the "page_number" field to the metadata of all output documents. + +enhancements: + - | + Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to. diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 479f0d50c..4874c25be 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -141,3 +141,98 @@ class TestDocumentSplitter: for doc, split_doc in zip(documents, result["documents"]): assert doc.meta.items() <= split_doc.meta.items() assert split_doc.content == "Text." + + def test_add_page_number_to_metadata_with_no_overlap_word_split(self): + splitter = DocumentSplitter(split_by="word", split_length=2) + doc1 = Document(content="This is some text.\f This text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 2, 2, 2, 1, 1, 3] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self): + splitter = DocumentSplitter(split_by="sentence", split_length=1) + doc1 = Document(content="This is some text.\f This text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 1] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_no_overlap_passage_split(self): + splitter = DocumentSplitter(split_by="passage", split_length=1) + doc1 = Document( + content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." + ) + result = splitter.run(documents=[doc1]) + + expected_pages = [1, 2, 2, 2] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_no_overlap_page_split(self): + splitter = DocumentSplitter(split_by="page", split_length=1) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." + ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 2, 3] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + splitter = DocumentSplitter(split_by="page", split_length=2) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." + ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 3] + + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_overlap_word_split(self): + splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1) + doc1 = Document(content="This is some text. And\f this text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 2, 2, 1, 1, 3] + for doc, p in zip(result["documents"], expected_pages): + print(doc.content, doc.meta, p) + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_overlap_sentence_split(self): + splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1) + doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.") + doc2 = Document(content="This content has two.\f\f page brakes. More text.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 2, 1, 1] + for doc, p in zip(result["documents"], expected_pages): + print(doc.content, doc.meta, p) + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_overlap_passage_split(self): + splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1) + doc1 = Document( + content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." + ) + result = splitter.run(documents=[doc1]) + + expected_pages = [1, 2, 2] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_overlap_page_split(self): + splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." + ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 2, 3] + + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p