Mirror of https://github.com/deepset-ai/haystack.git
feat: added split by page to DocumentSplitter (#6753)
* feat-added-split-by-page-to-DocumentSplitter
* added test case and the suggested changes
* Update document_splitter.py
* Update haystack/components/preprocessors/document_splitter.py
* Update test_document_splitter.py

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>
parent 6a1514550e · commit a7ac4edd07
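For context, here is a minimal usage sketch of the new option, mirroring the test added in this commit; the import paths are an assumption based on Haystack 2.x conventions and are not part of the diff:

from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

# Split a document at form-feed ("\f") page breaks, one page per output Document.
splitter = DocumentSplitter(split_by="page", split_length=1)
result = splitter.run(
    documents=[Document(content="Page one.\fPage two.\fPage three.")]
)

# Each split keeps its trailing "\f" except the last one, per the diff below.
for doc in result["documents"]:
    print(repr(doc.content))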
haystack/components/preprocessors/document_splitter.py

@@ -14,18 +14,21 @@ class DocumentSplitter:
     """
 
     def __init__(
-        self, split_by: Literal["word", "sentence", "passage"] = "word", split_length: int = 200, split_overlap: int = 0
+        self,
+        split_by: Literal["word", "sentence", "page", "passage"] = "word",
+        split_length: int = 200,
+        split_overlap: int = 0,
     ):
         """
         :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
-            "sentence" for splitting by ".", or "passage" for splitting by "\\n\\n".
+            "sentence" for splitting by ".", "page" for splitting by "\f" or "passage" for splitting by "\\n\\n".
         :param split_length: The maximum number of units in each split.
         :param split_overlap: The number of units that each split should overlap.
         """
 
         self.split_by = split_by
-        if split_by not in ["word", "sentence", "passage"]:
-            raise ValueError("split_by must be one of 'word', 'sentence' or 'passage'.")
+        if split_by not in ["word", "sentence", "page", "passage"]:
+            raise ValueError("split_by must be one of 'word', 'sentence', 'page' or 'passage'.")
         if split_length <= 0:
             raise ValueError("split_length must be greater than 0.")
         self.split_length = split_length
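The updated validation above also changes the error message a caller sees; a small sketch, using the same assumed import path as the earlier example:

from haystack.components.preprocessors import DocumentSplitter

try:
    DocumentSplitter(split_by="unsupported")
except ValueError as err:
    # Expected: split_by must be one of 'word', 'sentence', 'page' or 'passage'.
    print(err)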
@@ -60,8 +63,10 @@ class DocumentSplitter:
             split_docs += [Document(content=txt, meta=metadata) for txt in text_splits]
         return {"documents": split_docs}
 
-    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage"]) -> List[str]:
-        if split_by == "passage":
+    def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]:
+        if split_by == "page":
+            split_at = "\f"
+        elif split_by == "passage":
             split_at = "\n\n"
         elif split_by == "sentence":
             split_at = "."
@@ -69,7 +74,7 @@ class DocumentSplitter:
             split_at = " "
         else:
             raise NotImplementedError(
-                "DocumentSplitter only supports 'passage', 'sentence' or 'word' split_by options."
+                "DocumentSplitter only supports 'word', 'sentence', 'page' or 'passage' split_by options."
             )
         units = text.split(split_at)
         # Add the delimiter back to all units except the last one
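To illustrate the splitting step changed above outside the class, here is a standalone sketch of splitting on the page delimiter and adding it back to every unit except the last; the re-attach expression is an assumption consistent with the comment in the hunk, not the exact implementation:

# Standalone sketch, not part of the diff.
text = "Page one.\fPage two.\fPage three."
split_at = "\f"
units = text.split(split_at)
units = [unit + split_at for unit in units[:-1]] + [units[-1]]
print(units)  # ['Page one.\x0c', 'Page two.\x0c', 'Page three.']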
New release note file:

@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Added split by page to DocumentSplitter, which will split the document at \f
test_document_splitter.py

@@ -23,7 +23,7 @@ class TestDocumentSplitter:
         assert res == {"documents": []}
 
     def test_unsupported_split_by(self):
-        with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
+        with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence', 'page' or 'passage'."):
             DocumentSplitter(split_by="unsupported")
 
     def test_unsupported_split_length(self):
@@ -94,6 +94,20 @@ class TestDocumentSplitter:
         assert result["documents"][1].content == "And there is a third sentence.\n\n"
         assert result["documents"][2].content == " And another passage."
 
+    def test_split_by_page(self):
+        splitter = DocumentSplitter(split_by="page", split_length=1)
+        result = splitter.run(
+            documents=[
+                Document(
+                    content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+                )
+            ]
+        )
+        assert len(result["documents"]) == 3
+        assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\x0c"
+        assert result["documents"][1].content == " And there is a third sentence.\x0c"
+        assert result["documents"][2].content == " And another passage."
+
     def test_split_by_word_with_overlap(self):
         splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         result = splitter.run(