mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-19 15:01:40 +00:00

* Rework Document serialisation; Make Document backward compatible; Fix InMemoryDocumentStore filters; Fix InMemoryDocumentStore.bm25_retrieval; Add release notes; Fix pylint failures; Enhance Document kwargs handling and docstrings; Rename Document's text field to content; Fix e2e tests; Fix SimilarityRanker tests; Fix typo in release notes; Rename Document's metadata field to meta (#6183) * fix bugs * make linters happy * fix * more fix * match regex --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
140 lines
4.8 KiB
Python
140 lines
4.8 KiB
Python
import logging
|
||
|
||
import pytest
|
||
|
||
from haystack.preview import Document
|
||
from haystack.preview.components.preprocessors import DocumentCleaner
|
||
|
||
|
||
class TestDocumentCleaner:
    """Unit tests for the preview ``DocumentCleaner`` preprocessor component."""

    @pytest.mark.unit
    def test_init(self):
        """A cleaner built without arguments exposes the expected default settings."""
        cleaner = DocumentCleaner()
        # Identity checks (rather than ``== True``) also reject truthy non-bool values.
        assert cleaner.remove_empty_lines is True
        assert cleaner.remove_extra_whitespaces is True
        assert cleaner.remove_repeated_substrings is False
        assert cleaner.remove_substrings is None
        assert cleaner.remove_regex is None

    @pytest.mark.unit
    def test_non_text_document(self, caplog):
        """Running on a Document created without content is expected to log a warning."""
        cleaner = DocumentCleaner()
        # Only the call under test runs inside the capture context.
        with caplog.at_level(logging.WARNING):
            cleaner.run(documents=[Document()])
        assert "DocumentCleaner only cleans text documents but document.content for document ID" in caplog.text

    @pytest.mark.unit
    def test_single_document(self):
        """Passing a bare Document instead of a list is expected to raise TypeError."""
        cleaner = DocumentCleaner()
        # Construction stays outside the context so that only a TypeError raised by
        # ``run`` itself can satisfy the expectation.
        with pytest.raises(TypeError, match="DocumentCleaner expects a List of Documents as input."):
            cleaner.run(documents=Document())

    @pytest.mark.unit
    def test_empty_list(self):
        """An empty input list is expected to produce an empty ``documents`` output."""
        cleaner = DocumentCleaner()
        result = cleaner.run(documents=[])
        assert result == {"documents": []}

    @pytest.mark.unit
    def test_remove_empty_lines(self):
        """Clean a three-sentence text with ``remove_extra_whitespaces`` switched off."""
        cleaner = DocumentCleaner(remove_extra_whitespaces=False)
        # NOTE(review): the bare "" literals below are empty strings joined by implicit
        # concatenation, so as written the input contains no newline or empty line --
        # confirm whether "\n\n" separators were intended.
        document = Document(
            content="This is a text with some words. "
            ""
            "There is a second sentence. "
            ""
            "And there is a third sentence."
        )
        result = cleaner.run(documents=[document])
        cleaned = result["documents"]
        assert len(cleaned) == 1
        assert (
            cleaned[0].content
            == "This is a text with some words. There is a second sentence. And there is a third sentence."
        )

    @pytest.mark.unit
    def test_remove_whitespaces(self):
        """With ``remove_empty_lines`` off, leading and trailing whitespace is expected to be stripped."""
        cleaner = DocumentCleaner(remove_empty_lines=False)
        # NOTE(review): runs of repeated spaces may have been collapsed in the copy of
        # this fixture that was reviewed -- verify the literals against upstream.
        document = Document(
            content=" This is a text with some words. "
            ""
            "There is a second sentence. "
            ""
            "And there is a third sentence. "
        )
        result = cleaner.run(documents=[document])
        cleaned = result["documents"]
        assert len(cleaned) == 1
        assert cleaned[0].content == (
            "This is a text with some words. " "" "There is a second sentence. " "" "And there is a third sentence."
        )

    @pytest.mark.unit
    def test_remove_substrings(self):
        """The listed substrings are expected to be stripped from the content.

        The expected value keeps the lowercase "a" even though "A" is listed.
        """
        cleaner = DocumentCleaner(remove_substrings=["This", "A", "words", "🪲"])
        result = cleaner.run(documents=[Document(content="This is a text with some words.🪲")])
        cleaned = result["documents"]
        assert len(cleaned) == 1
        assert cleaned[0].content == " is a text with some ."

    @pytest.mark.unit
    def test_remove_regex(self):
        r"""Run the cleaner with the ``\s\s+`` pattern passed as ``remove_regex``."""
        cleaner = DocumentCleaner(remove_regex=r"\s\s+")
        # NOTE(review): input and expected text are identical as shown here; the input
        # presumably contained multi-space runs that were lost -- verify against upstream.
        result = cleaner.run(documents=[Document(content="This is a text with some words.")])
        cleaned = result["documents"]
        assert len(cleaned) == 1
        assert cleaned[0].content == "This is a text with some words."

    @pytest.mark.unit
    def test_remove_repeated_substrings(self):
        """Headers and footers repeated across pages are expected to be removed."""
        cleaner = DocumentCleaner(
            remove_empty_lines=False, remove_extra_whitespaces=False, remove_repeated_substrings=True
        )

        # NOTE(review): page-break characters (e.g. form feeds), per-line indentation and
        # repeated spaces may have been lost from these fixtures in the reviewed copy --
        # verify both literals against upstream before relying on them.
        text = "\n".join(
            [
                "First PageThis is a header.",
                "Page of",
                "2",
                "4",
                "Lorem ipsum dolor sit amet",
                "This is a footer number 1",
                "This is footer number 2This is a header.",
                "Page of",
                "3",
                "4",
                "Sid ut perspiciatis unde",
                "This is a footer number 1",
                "This is footer number 2This is a header.",
                "Page of",
                "4",
                "4",
                "Sed do eiusmod tempor.",
                "This is a footer number 1",
                "This is footer number 2",
            ]
        )

        expected_text = "\n".join(
            [
                "First Page 2",
                "4",
                "Lorem ipsum dolor sit amet 3",
                "4",
                "Sid ut perspiciatis unde 4",
                "4",
                "Sed do eiusmod tempor.",
            ]
        )
        result = cleaner.run(documents=[Document(content=text)])
        assert result["documents"][0].content == expected_text

    @pytest.mark.unit
    def test_copy_metadata(self):
        """Cleaned documents are expected to keep the ``meta`` of their source documents."""
        cleaner = DocumentCleaner()
        documents = [
            Document(content="Text. ", meta={"name": "doc 0"}),
            Document(content="Text. ", meta={"name": "doc 1"}),
        ]
        result = cleaner.run(documents=documents)
        cleaned = result["documents"]
        assert len(cleaned) == 2
        # The two outputs must remain distinguishable by id.
        assert cleaned[0].id != cleaned[1].id
        for doc, cleaned_doc in zip(documents, cleaned):
            assert doc.meta == cleaned_doc.meta
            assert cleaned_doc.content == "Text."