haystack/test/preview/components/preprocessors/test_text_document_splitter.py

144 lines
6.4 KiB
Python
Raw Normal View History

import pytest
from haystack.preview import Document
from haystack.preview.components.preprocessors import TextDocumentSplitter
class TestTextDocumentSplitter:
    """Unit tests for ``TextDocumentSplitter``: input validation and splitting by word, sentence and passage."""

    @pytest.mark.unit
    def test_non_text_document(self):
        """Running on a document created without text raises a ValueError."""
        # Build the component outside the raises-block so that only run() can satisfy the expectation.
        splitter = TextDocumentSplitter()
        with pytest.raises(
            ValueError, match="TextDocumentSplitter only works with text documents but document.text for document ID"
        ):
            splitter.run(documents=[Document()])

    @pytest.mark.unit
    def test_single_doc(self):
        """Passing a bare Document instead of a list raises a TypeError."""
        # Build the component outside the raises-block so that only run() can satisfy the expectation.
        splitter = TextDocumentSplitter()
        with pytest.raises(TypeError, match="TextDocumentSplitter expects a List of Documents as input."):
            splitter.run(documents=Document())

    @pytest.mark.unit
    def test_empty_list(self):
        """An empty input list yields an empty ``documents`` list."""
        splitter = TextDocumentSplitter()
        res = splitter.run(documents=[])
        assert res == {"documents": []}

    @pytest.mark.unit
    def test_unsupported_split_by(self):
        """An unknown ``split_by`` value is rejected at construction time."""
        with pytest.raises(ValueError, match="split_by must be one of 'word', 'sentence' or 'passage'."):
            TextDocumentSplitter(split_by="unsupported")

    @pytest.mark.unit
    def test_unsupported_split_length(self):
        """A ``split_length`` of 0 is rejected at construction time."""
        with pytest.raises(ValueError, match="split_length must be greater than 0."):
            TextDocumentSplitter(split_length=0)

    @pytest.mark.unit
    def test_unsupported_split_overlap(self):
        """A negative ``split_overlap`` is rejected at construction time."""
        with pytest.raises(ValueError, match="split_overlap must be greater than or equal to 0."):
            TextDocumentSplitter(split_overlap=-1)

    @pytest.mark.unit
    def test_split_by_word(self):
        """Word splitting with length 10 yields two chunks; the first keeps its trailing space."""
        splitter = TextDocumentSplitter(split_by="word", split_length=10)
        result = splitter.run(
            documents=[
                Document(
                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
                )
            ]
        )
        assert len(result["documents"]) == 2
        assert result["documents"][0].text == "This is a text with some words. There is a "
        assert result["documents"][1].text == "second sentence. And there is a third sentence."

    @pytest.mark.unit
    def test_split_by_word_multiple_input_docs(self):
        """Each input document is split on its own and the chunks are returned in input order."""
        splitter = TextDocumentSplitter(split_by="word", split_length=10)
        result = splitter.run(
            documents=[
                Document(
                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
                ),
                Document(
                    text="This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
                ),
            ]
        )
        assert len(result["documents"]) == 5
        # Chunks 0-1 come from the first document, chunks 2-4 from the second.
        assert result["documents"][0].text == "This is a text with some words. There is a "
        assert result["documents"][1].text == "second sentence. And there is a third sentence."
        assert result["documents"][2].text == "This is a different text with some words. There is "
        assert result["documents"][3].text == "a second sentence. And there is a third sentence. And "
        assert result["documents"][4].text == "there is a fourth sentence."

    @pytest.mark.unit
    def test_split_by_sentence(self):
        """Sentence splitting with length 1 yields one chunk per sentence; later chunks keep their leading space."""
        splitter = TextDocumentSplitter(split_by="sentence", split_length=1)
        result = splitter.run(
            documents=[
                Document(
                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
                )
            ]
        )
        assert len(result["documents"]) == 3
        assert result["documents"][0].text == "This is a text with some words."
        assert result["documents"][1].text == " There is a second sentence."
        assert result["documents"][2].text == " And there is a third sentence."

    @pytest.mark.unit
    def test_split_by_passage(self):
        """Passage splitting with length 1 yields one chunk per passage; the blank-line separator stays on the preceding chunk."""
        splitter = TextDocumentSplitter(split_by="passage", split_length=1)
        result = splitter.run(
            documents=[
                Document(
                    text="This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
                )
            ]
        )
        assert len(result["documents"]) == 3
        assert result["documents"][0].text == "This is a text with some words. There is a second sentence.\n\n"
        assert result["documents"][1].text == "And there is a third sentence.\n\n"
        assert result["documents"][2].text == " And another passage."

    @pytest.mark.unit
    def test_split_by_word_with_overlap(self):
        """With an overlap of 2, the second chunk starts with the last two words of the first."""
        splitter = TextDocumentSplitter(split_by="word", split_length=10, split_overlap=2)
        result = splitter.run(
            documents=[
                Document(
                    text="This is a text with some words. There is a second sentence. And there is a third sentence."
                )
            ]
        )
        assert len(result["documents"]) == 2
        assert result["documents"][0].text == "This is a text with some words. There is a "
        assert result["documents"][1].text == "is a second sentence. And there is a third sentence."

    @pytest.mark.unit
    def test_source_id_stored_in_metadata(self):
        """Each output document records the id of its source document under ``metadata["source_id"]``."""
        splitter = TextDocumentSplitter(split_by="word", split_length=10)
        doc1 = Document(text="This is a text with some words.")
        doc2 = Document(text="This is a different text with some words.")
        result = splitter.run(documents=[doc1, doc2])
        # Both inputs are shorter than ten words, so exactly one chunk per input is expected;
        # checking the count first gives a clear failure instead of an IndexError below.
        assert len(result["documents"]) == 2
        assert result["documents"][0].metadata["source_id"] == doc1.id
        assert result["documents"][1].metadata["source_id"] == doc2.id

    @pytest.mark.unit
    def test_copy_id_hash_keys_and_metadata(self):
        """Output documents carry over ``id_hash_keys`` and the source metadata, and receive distinct ids."""
        splitter = TextDocumentSplitter(split_by="word", split_length=10)
        documents = [
            Document(text="Text.", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
            Document(text="Text.", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
        ]
        result = splitter.run(documents=documents)
        assert len(result["documents"]) == 2
        # The two inputs share the same text and differ only in metadata, yet their chunks get different ids.
        assert result["documents"][0].id != result["documents"][1].id
        for doc, split_doc in zip(documents, result["documents"]):
            assert doc.id_hash_keys == split_doc.id_hash_keys
            # Source metadata must be a subset of the chunk's metadata (the chunk may add more keys).
            assert doc.metadata.items() <= split_doc.metadata.items()
            assert split_doc.text == "Text."