mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-03 07:04:01 +00:00

* Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705. * Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x. Solve some minor bugs spotted by tests. * Update docstrings. * Add reno. * Update haystack/components/preprocessors/document_splitter.py Update docstring from suggestion Co-authored-by: David S. Batista <dsbatista@gmail.com> * solve suggestion to improve readability * fragment tests * Update haystack/components/preprocessors/document_splitter.py Co-authored-by: David S. Batista <dsbatista@gmail.com> * Update .gitignore * Update .gitignore * Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml * blackening --------- Co-authored-by: David S. Batista <dsbatista@gmail.com>
239 lines
11 KiB
Python
239 lines
11 KiB
Python
import pytest
|
|
|
|
from haystack import Document
|
|
from haystack.components.preprocessors import DocumentSplitter
|
|
|
|
|
|
class TestDocumentSplitter:
    """Tests for ``DocumentSplitter``.

    Covers input validation, splitting by word, sentence, passage and page (with and without
    overlap), propagation of metadata to the splits, and the ``page_number`` meta field.
    """

    @staticmethod
    def _assert_page_numbers(documents, expected_pages):
        """Compare the ``page_number`` meta value of each document with ``expected_pages``, pairwise.

        NOTE(review): ``zip`` stops at the shorter input, so a difference in length between
        ``documents`` and ``expected_pages`` is not detected here (same as the inline loops this
        helper replaces). Consider also asserting the number of splits once the expected counts
        have been confirmed against the splitter.
        """
        for doc, page in zip(documents, expected_pages):
            assert doc.meta["page_number"] == page

    def test_non_text_document(self):
        """Running the splitter on a ``Document`` created without content raises ``ValueError``."""
        # Build the component outside the ``raises`` block so only ``run`` can satisfy the assertion.
        splitter = DocumentSplitter()
        with pytest.raises(
            ValueError, match="DocumentSplitter only works with text documents but document.content for document ID"
        ):
            splitter.run(documents=[Document()])

    def test_single_doc(self):
        """Passing a single ``Document`` instead of a list raises ``TypeError``."""
        # Build the component outside the ``raises`` block so only ``run`` can satisfy the assertion.
        splitter = DocumentSplitter()
        with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
            splitter.run(documents=Document())

    def test_empty_list(self):
        """An empty input list produces an empty ``documents`` output."""
        splitter = DocumentSplitter()
        res = splitter.run(documents=[])
        assert res == {"documents": []}

    def test_unsupported_split_by(self):
        """An unknown ``split_by`` value is rejected at construction time."""
        with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence', 'page' or 'passage'."):
            DocumentSplitter(split_by="unsupported")

    def test_unsupported_split_length(self):
        """``split_length=0`` is rejected at construction time."""
        with pytest.raises(ValueError, match="split_length must be greater than 0."):
            DocumentSplitter(split_length=0)

    def test_unsupported_split_overlap(self):
        """A negative ``split_overlap`` is rejected at construction time."""
        with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0."):
            DocumentSplitter(split_overlap=-1)

    def test_split_by_word(self):
        """Splitting by word with ``split_length=10`` yields two splits for the sample text."""
        splitter = DocumentSplitter(split_by="word", split_length=10)
        result = splitter.run(
            documents=[
                Document(
                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
                )
            ]
        )
        assert len(result["documents"]) == 2
        assert result["documents"][0].content == "This is a text with some words. There is a "
        assert result["documents"][1].content == "second sentence. And there is a third sentence."

    def test_split_by_word_multiple_input_docs(self):
        """Each input document is split on its own; the splits are returned in input order."""
        splitter = DocumentSplitter(split_by="word", split_length=10)
        result = splitter.run(
            documents=[
                Document(
                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
                ),
                Document(
                    content="This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
                ),
            ]
        )
        assert len(result["documents"]) == 5
        assert result["documents"][0].content == "This is a text with some words. There is a "
        assert result["documents"][1].content == "second sentence. And there is a third sentence."
        assert result["documents"][2].content == "This is a different text with some words. There is "
        assert result["documents"][3].content == "a second sentence. And there is a third sentence. And "
        assert result["documents"][4].content == "there is a fourth sentence."

    def test_split_by_sentence(self):
        """Splitting by sentence with ``split_length=1`` yields one split per sentence."""
        splitter = DocumentSplitter(split_by="sentence", split_length=1)
        result = splitter.run(
            documents=[
                Document(
                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
                )
            ]
        )
        assert len(result["documents"]) == 3
        assert result["documents"][0].content == "This is a text with some words."
        assert result["documents"][1].content == " There is a second sentence."
        assert result["documents"][2].content == " And there is a third sentence."

    def test_split_by_passage(self):
        """Splitting by passage with ``split_length=1`` keeps the blank-line delimiter on each split."""
        splitter = DocumentSplitter(split_by="passage", split_length=1)
        result = splitter.run(
            documents=[
                Document(
                    content="This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
                )
            ]
        )
        assert len(result["documents"]) == 3
        assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\n\n"
        assert result["documents"][1].content == "And there is a third sentence.\n\n"
        assert result["documents"][2].content == " And another passage."

    def test_split_by_page(self):
        """Splitting by page with ``split_length=1`` keeps the form-feed delimiter on each split."""
        splitter = DocumentSplitter(split_by="page", split_length=1)
        result = splitter.run(
            documents=[
                Document(
                    content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
                )
            ]
        )
        assert len(result["documents"]) == 3
        assert result["documents"][0].content == "This is a text with some words. There is a second sentence.\x0c"
        assert result["documents"][1].content == " And there is a third sentence.\x0c"
        assert result["documents"][2].content == " And another passage."

    def test_split_by_word_with_overlap(self):
        """With ``split_overlap=2`` the second split starts with the last two words of the first."""
        splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
        result = splitter.run(
            documents=[
                Document(
                    content="This is a text with some words. There is a second sentence. And there is a third sentence."
                )
            ]
        )
        assert len(result["documents"]) == 2
        assert result["documents"][0].content == "This is a text with some words. There is a "
        assert result["documents"][1].content == "is a second sentence. And there is a third sentence."

    def test_source_id_stored_in_metadata(self):
        """Each split stores the id of its source document under ``meta["source_id"]``."""
        splitter = DocumentSplitter(split_by="word", split_length=10)
        doc1 = Document(content="This is a text with some words.")
        doc2 = Document(content="This is a different text with some words.")
        result = splitter.run(documents=[doc1, doc2])
        assert result["documents"][0].meta["source_id"] == doc1.id
        assert result["documents"][1].meta["source_id"] == doc2.id

    def test_copy_metadata(self):
        """Splits carry over every meta entry of their source document and get distinct ids."""
        splitter = DocumentSplitter(split_by="word", split_length=10)
        documents = [
            Document(content="Text.", meta={"name": "doc 0"}),
            Document(content="Text.", meta={"name": "doc 1"}),
        ]
        result = splitter.run(documents=documents)
        assert len(result["documents"]) == 2
        assert result["documents"][0].id != result["documents"][1].id
        for doc, split_doc in zip(documents, result["documents"]):
            # The source meta must be a subset of the split meta (the split may add keys).
            assert doc.meta.items() <= split_doc.meta.items()
            assert split_doc.content == "Text."

    def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
        """Check ``page_number`` of word splits (no overlap) against the expected values."""
        splitter = DocumentSplitter(split_by="word", split_length=2)
        doc1 = Document(content="This is some text.\f This text is on another page.")
        doc2 = Document(content="This content has two.\f\f page brakes.")
        result = splitter.run(documents=[doc1, doc2])

        expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
        self._assert_page_numbers(result["documents"], expected_pages)

    def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
        """Check ``page_number`` of sentence splits (no overlap) against the expected values."""
        splitter = DocumentSplitter(split_by="sentence", split_length=1)
        doc1 = Document(content="This is some text.\f This text is on another page.")
        doc2 = Document(content="This content has two.\f\f page brakes.")
        result = splitter.run(documents=[doc1, doc2])

        expected_pages = [1, 1, 1, 1]
        self._assert_page_numbers(result["documents"], expected_pages)

    def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
        """Check ``page_number`` of passage splits (no overlap) against the expected values."""
        splitter = DocumentSplitter(split_by="passage", split_length=1)
        doc1 = Document(
            content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
        )
        result = splitter.run(documents=[doc1])

        expected_pages = [1, 2, 2, 2]
        self._assert_page_numbers(result["documents"], expected_pages)

    def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
        """Check ``page_number`` of page splits (no overlap) for ``split_length`` 1 and 2."""
        # One page per split.
        splitter = DocumentSplitter(split_by="page", split_length=1)
        doc1 = Document(
            content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 2, 3]
        self._assert_page_numbers(result["documents"], expected_pages)

        # Two pages per split.
        splitter = DocumentSplitter(split_by="page", split_length=2)
        doc1 = Document(
            content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 3]

        self._assert_page_numbers(result["documents"], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_word_split(self):
        """Check ``page_number`` of overlapping word splits against the expected values."""
        splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
        doc1 = Document(content="This is some text. And\f this text is on another page.")
        doc2 = Document(content="This content has two.\f\f page brakes.")
        result = splitter.run(documents=[doc1, doc2])

        expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
        self._assert_page_numbers(result["documents"], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
        """Check ``page_number`` of overlapping sentence splits against the expected values."""
        splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
        doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
        doc2 = Document(content="This content has two.\f\f page brakes. More text.")
        result = splitter.run(documents=[doc1, doc2])

        expected_pages = [1, 1, 1, 2, 1, 1]
        self._assert_page_numbers(result["documents"], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_passage_split(self):
        """Check ``page_number`` of overlapping passage splits against the expected values."""
        splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
        doc1 = Document(
            content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
        )
        result = splitter.run(documents=[doc1])

        expected_pages = [1, 2, 2]
        self._assert_page_numbers(result["documents"], expected_pages)

    def test_add_page_number_to_metadata_with_overlap_page_split(self):
        """Check ``page_number`` of overlapping page splits against the expected values."""
        splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
        doc1 = Document(
            content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
        )
        result = splitter.run(documents=[doc1])
        expected_pages = [1, 2, 3]

        self._assert_page_numbers(result["documents"], expected_pages)
|