diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 57531b73c..4649c7f5d 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -14,18 +14,21 @@ class DocumentSplitter: """ def __init__( - self, split_by: Literal["word", "sentence", "passage"] = "word", split_length: int = 200, split_overlap: int = 0 + self, + split_by: Literal["word", "sentence", "page", "passage"] = "word", + split_length: int = 200, + split_overlap: int = 0, ): """ :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ", - "sentence" for splitting by ".", or "passage" for splitting by "\\n\\n". + "sentence" for splitting by ".", "page" for splitting by "\\f" or "passage" for splitting by "\\n\\n". :param split_length: The maximum number of units in each split. :param split_overlap: The number of units that each split should overlap. """ self.split_by = split_by - if split_by not in ["word", "sentence", "passage"]: - raise ValueError("split_by must be one of 'word', 'sentence' or 'passage'.") + if split_by not in ["word", "sentence", "page", "passage"]: + raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.") if split_length <= 0: raise ValueError("split_length must be greater than 0.") self.split_length = split_length @@ -60,8 +63,10 @@ class DocumentSplitter: split_docs += [Document(content=txt, meta=metadata) for txt in text_splits] return {"documents": split_docs} - def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]: - if split_by == "passage": + def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]: + if split_by == "page": + split_at = "\f" + elif split_by == "passage": split_at = "\n\n" elif split_by == "sentence": split_at = "." 
@@ -69,7 +74,7 @@ class DocumentSplitter: split_at = " " else: raise NotImplementedError( - "DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options." + "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options." ) units = text.split(split_at) # Add the delimiter back to all units except the last one diff --git a/releasenotes/notes/add-split-by-page-to-DocumentSplitter-63232c17d858d787.yaml b/releasenotes/notes/add-split-by-page-to-DocumentSplitter-63232c17d858d787.yaml new file mode 100644 index 000000000..3fccae32c --- /dev/null +++ b/releasenotes/notes/add-split-by-page-to-DocumentSplitter-63232c17d858d787.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Added a ``page`` option to ``split_by`` in DocumentSplitter, which splits the document at form feed characters (``\f``). diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index b99e3be2d..479f0d50c 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -23,7 +23,7 @@ class TestDocumentSplitter: assert res == {"documents": []} def test_unsupported_split_by(self): - with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."): + with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence', 'page' or 'passage'."): DocumentSplitter(split_by="unsupported") def test_unsupported_split_length(self): @@ -94,6 +94,20 @@ class TestDocumentSplitter: assert result["documents"][1].content == "And there is a third sentence.\n\n" assert result["documents"][2].content == " And another passage." + def test_split_by_page(self): + splitter = DocumentSplitter(split_by="page", split_length=1) + result = splitter.run( + documents=[ + Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." 
+ ) + ] + ) + assert len(result["documents"]) == 3 + assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\x0c" + assert result["documents"][1].content == " And there is a third sentence.\x0c" + assert result["documents"][2].content == " And another passage." + def test_split_by_word_with_overlap(self): splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2) result = splitter.run(