2024-05-09 15:40:36 +02:00
|
|
|
|
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
|
|
|
|
|
#
|
|
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
2025-05-26 17:22:51 +01:00
|
|
|
|
|
2023-10-13 12:39:55 +02:00
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
2023-11-24 14:48:43 +01:00
|
|
|
|
from haystack import Document
|
2024-11-25 13:08:59 +01:00
|
|
|
|
from haystack.dataclasses import ByteStream, SparseEmbedding
|
2023-11-24 14:48:43 +01:00
|
|
|
|
from haystack.components.preprocessors import DocumentCleaner
|
2023-10-13 12:39:55 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestDocumentCleaner:
|
|
|
|
|
def test_init(self):
|
|
|
|
|
cleaner = DocumentCleaner()
|
2023-11-03 11:33:20 +01:00
|
|
|
|
assert cleaner.remove_empty_lines is True
|
|
|
|
|
assert cleaner.remove_extra_whitespaces is True
|
|
|
|
|
assert cleaner.remove_repeated_substrings is False
|
2023-10-13 12:39:55 +02:00
|
|
|
|
assert cleaner.remove_substrings is None
|
|
|
|
|
assert cleaner.remove_regex is None
|
2024-05-16 19:18:48 +02:00
|
|
|
|
assert cleaner.keep_id is False
|
2023-10-13 12:39:55 +02:00
|
|
|
|
|
|
|
|
|
def test_non_text_document(self, caplog):
|
|
|
|
|
with caplog.at_level(logging.WARNING):
|
|
|
|
|
cleaner = DocumentCleaner()
|
|
|
|
|
cleaner.run(documents=[Document()])
|
2023-10-31 12:44:04 +01:00
|
|
|
|
assert "DocumentCleaner only cleans text documents but document.content for document ID" in caplog.text
|
2023-10-13 12:39:55 +02:00
|
|
|
|
|
|
|
|
|
def test_single_document(self):
|
|
|
|
|
with pytest.raises(TypeError, match="DocumentCleaner expects a List of Documents as input."):
|
|
|
|
|
cleaner = DocumentCleaner()
|
|
|
|
|
cleaner.run(documents=Document())
|
|
|
|
|
|
|
|
|
|
def test_empty_list(self):
|
|
|
|
|
cleaner = DocumentCleaner()
|
|
|
|
|
result = cleaner.run(documents=[])
|
|
|
|
|
assert result == {"documents": []}
|
|
|
|
|
|
|
|
|
|
def test_remove_empty_lines(self):
|
|
|
|
|
cleaner = DocumentCleaner(remove_extra_whitespaces=False)
|
|
|
|
|
result = cleaner.run(
|
|
|
|
|
documents=[
|
|
|
|
|
Document(
|
2024-08-07 14:50:14 +02:00
|
|
|
|
content="This is a text with some words. \f"
|
2023-10-13 12:39:55 +02:00
|
|
|
|
""
|
|
|
|
|
"There is a second sentence. "
|
|
|
|
|
""
|
|
|
|
|
"And there is a third sentence."
|
|
|
|
|
)
|
|
|
|
|
]
|
|
|
|
|
)
|
|
|
|
|
assert len(result["documents"]) == 1
|
|
|
|
|
assert (
|
2023-10-31 12:44:04 +01:00
|
|
|
|
result["documents"][0].content
|
2024-08-07 14:50:14 +02:00
|
|
|
|
== "This is a text with some words. \fThere is a second sentence. And there is a third sentence."
|
2023-10-13 12:39:55 +02:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def test_remove_whitespaces(self):
|
|
|
|
|
cleaner = DocumentCleaner(remove_empty_lines=False)
|
|
|
|
|
result = cleaner.run(
|
|
|
|
|
documents=[
|
|
|
|
|
Document(
|
2023-10-31 12:44:04 +01:00
|
|
|
|
content=" This is a text with some words. "
|
2023-10-13 12:39:55 +02:00
|
|
|
|
""
|
|
|
|
|
"There is a second sentence. "
|
|
|
|
|
""
|
2024-08-07 14:50:14 +02:00
|
|
|
|
"And there is a third sentence.\f "
|
2023-10-13 12:39:55 +02:00
|
|
|
|
)
|
|
|
|
|
]
|
|
|
|
|
)
|
|
|
|
|
assert len(result["documents"]) == 1
|
2023-10-31 12:44:04 +01:00
|
|
|
|
assert result["documents"][0].content == (
|
2025-01-09 17:25:55 +01:00
|
|
|
|
"This is a text with some words. There is a second sentence. And there is a third sentence.\f"
|
2023-10-13 12:39:55 +02:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def test_remove_substrings(self):
|
|
|
|
|
cleaner = DocumentCleaner(remove_substrings=["This", "A", "words", "🪲"])
|
2024-08-07 14:50:14 +02:00
|
|
|
|
result = cleaner.run(documents=[Document(content="This is a text with some words.\f🪲")])
|
2023-10-13 12:39:55 +02:00
|
|
|
|
assert len(result["documents"]) == 1
|
2024-08-07 14:50:14 +02:00
|
|
|
|
assert result["documents"][0].content == " is a text with some .\f"
|
2023-10-13 12:39:55 +02:00
|
|
|
|
|
|
|
|
|
def test_remove_regex(self):
|
|
|
|
|
cleaner = DocumentCleaner(remove_regex=r"\s\s+")
|
2024-08-07 14:50:14 +02:00
|
|
|
|
result = cleaner.run(documents=[Document(content="This is a text \f with some words.")])
|
2023-10-13 12:39:55 +02:00
|
|
|
|
assert len(result["documents"]) == 1
|
2024-08-07 14:50:14 +02:00
|
|
|
|
assert result["documents"][0].content == "This is a text\fwith some words."
|
2023-10-13 12:39:55 +02:00
|
|
|
|
|
|
|
|
|
def test_remove_repeated_substrings(self):
|
|
|
|
|
cleaner = DocumentCleaner(
|
|
|
|
|
remove_empty_lines=False, remove_extra_whitespaces=False, remove_repeated_substrings=True
|
|
|
|
|
)
|
|
|
|
|
|
2024-08-07 14:50:14 +02:00
|
|
|
|
text = """First Page\fThis is a header.
|
2023-10-13 12:39:55 +02:00
|
|
|
|
Page of
|
|
|
|
|
2
|
|
|
|
|
4
|
|
|
|
|
Lorem ipsum dolor sit amet
|
|
|
|
|
This is a footer number 1
|
|
|
|
|
This is footer number 2This is a header.
|
|
|
|
|
Page of
|
|
|
|
|
3
|
|
|
|
|
4
|
|
|
|
|
Sid ut perspiciatis unde
|
|
|
|
|
This is a footer number 1
|
|
|
|
|
This is footer number 2This is a header.
|
|
|
|
|
Page of
|
|
|
|
|
4
|
|
|
|
|
4
|
|
|
|
|
Sed do eiusmod tempor.
|
|
|
|
|
This is a footer number 1
|
|
|
|
|
This is footer number 2"""
|
|
|
|
|
|
2024-08-07 14:50:14 +02:00
|
|
|
|
expected_text = """First Page\f 2
|
2023-10-13 12:39:55 +02:00
|
|
|
|
4
|
|
|
|
|
Lorem ipsum dolor sit amet 3
|
|
|
|
|
4
|
|
|
|
|
Sid ut perspiciatis unde 4
|
|
|
|
|
4
|
|
|
|
|
Sed do eiusmod tempor."""
|
2023-10-31 12:44:04 +01:00
|
|
|
|
result = cleaner.run(documents=[Document(content=text)])
|
|
|
|
|
assert result["documents"][0].content == expected_text
|
2023-10-17 11:03:48 +02:00
|
|
|
|
|
2023-10-20 15:16:06 +02:00
|
|
|
|
def test_copy_metadata(self):
|
2023-10-17 11:03:48 +02:00
|
|
|
|
cleaner = DocumentCleaner()
|
|
|
|
|
documents = [
|
2023-10-31 12:44:04 +01:00
|
|
|
|
Document(content="Text. ", meta={"name": "doc 0"}),
|
|
|
|
|
Document(content="Text. ", meta={"name": "doc 1"}),
|
2023-10-17 11:03:48 +02:00
|
|
|
|
]
|
|
|
|
|
result = cleaner.run(documents=documents)
|
|
|
|
|
assert len(result["documents"]) == 2
|
|
|
|
|
assert result["documents"][0].id != result["documents"][1].id
|
|
|
|
|
for doc, cleaned_doc in zip(documents, result["documents"]):
|
2023-10-31 12:44:04 +01:00
|
|
|
|
assert doc.meta == cleaned_doc.meta
|
|
|
|
|
assert cleaned_doc.content == "Text."
|
2024-05-16 19:18:48 +02:00
|
|
|
|
|
|
|
|
|
def test_keep_id_does_not_alter_document_ids(self):
|
|
|
|
|
cleaner = DocumentCleaner(keep_id=True)
|
|
|
|
|
documents = [Document(content="Text. ", id="1"), Document(content="Text. ", id="2")]
|
|
|
|
|
result = cleaner.run(documents=documents)
|
|
|
|
|
assert len(result["documents"]) == 2
|
|
|
|
|
assert result["documents"][0].id == "1"
|
|
|
|
|
assert result["documents"][1].id == "2"
|
2024-08-05 21:00:39 +10:00
|
|
|
|
|
|
|
|
|
def test_unicode_normalization(self):
|
|
|
|
|
text = """\
|
|
|
|
|
アイウエオ
|
|
|
|
|
Comment ça va
|
|
|
|
|
مرحبا بالعالم
|
|
|
|
|
em Space"""
|
|
|
|
|
|
|
|
|
|
expected_text_NFC = """\
|
|
|
|
|
アイウエオ
|
|
|
|
|
Comment ça va
|
|
|
|
|
مرحبا بالعالم
|
|
|
|
|
em Space"""
|
|
|
|
|
|
|
|
|
|
expected_text_NFD = """\
|
|
|
|
|
アイウエオ
|
|
|
|
|
Comment ça va
|
|
|
|
|
مرحبا بالعالم
|
|
|
|
|
em Space"""
|
|
|
|
|
|
|
|
|
|
expected_text_NFKC = """\
|
|
|
|
|
アイウエオ
|
|
|
|
|
Comment ça va
|
|
|
|
|
مرحبا بالعالم
|
|
|
|
|
em Space"""
|
|
|
|
|
|
|
|
|
|
expected_text_NFKD = """\
|
|
|
|
|
アイウエオ
|
|
|
|
|
Comment ça va
|
|
|
|
|
مرحبا بالعالم
|
|
|
|
|
em Space"""
|
|
|
|
|
|
|
|
|
|
nfc_cleaner = DocumentCleaner(unicode_normalization="NFC", remove_extra_whitespaces=False)
|
|
|
|
|
nfd_cleaner = DocumentCleaner(unicode_normalization="NFD", remove_extra_whitespaces=False)
|
|
|
|
|
nfkc_cleaner = DocumentCleaner(unicode_normalization="NFKC", remove_extra_whitespaces=False)
|
|
|
|
|
nfkd_cleaner = DocumentCleaner(unicode_normalization="NFKD", remove_extra_whitespaces=False)
|
|
|
|
|
|
|
|
|
|
nfc_result = nfc_cleaner.run(documents=[Document(content=text)])
|
|
|
|
|
nfd_result = nfd_cleaner.run(documents=[Document(content=text)])
|
|
|
|
|
nfkc_result = nfkc_cleaner.run(documents=[Document(content=text)])
|
|
|
|
|
nfkd_result = nfkd_cleaner.run(documents=[Document(content=text)])
|
|
|
|
|
|
|
|
|
|
assert nfc_result["documents"][0].content == expected_text_NFC
|
|
|
|
|
assert nfd_result["documents"][0].content == expected_text_NFD
|
|
|
|
|
assert nfkc_result["documents"][0].content == expected_text_NFKC
|
|
|
|
|
assert nfkd_result["documents"][0].content == expected_text_NFKD
|
|
|
|
|
|
|
|
|
|
def test_ascii_only(self):
|
|
|
|
|
text = """\
|
|
|
|
|
アイウエオ
|
|
|
|
|
Comment ça va
|
|
|
|
|
Á
|
|
|
|
|
مرحبا بالعالم
|
|
|
|
|
em Space"""
|
|
|
|
|
|
|
|
|
|
expected_text = """\
|
|
|
|
|
\n\
|
|
|
|
|
Comment ca va
|
|
|
|
|
A
|
|
|
|
|
\n\
|
|
|
|
|
em Space"""
|
|
|
|
|
|
|
|
|
|
cleaner = DocumentCleaner(ascii_only=True, remove_extra_whitespaces=False, remove_empty_lines=False)
|
|
|
|
|
result = cleaner.run(documents=[Document(content=text)])
|
|
|
|
|
assert result["documents"][0].content == expected_text
|
2024-11-25 13:08:59 +01:00
|
|
|
|
|
|
|
|
|
def test_other_document_fields_are_not_lost(self):
|
|
|
|
|
cleaner = DocumentCleaner(keep_id=True)
|
|
|
|
|
document = Document(
|
2025-01-09 17:25:55 +01:00
|
|
|
|
content="This is a text with some words. \nThere is a second sentence. \nAnd there is a third sentence.\n",
|
2024-11-25 13:08:59 +01:00
|
|
|
|
blob=ByteStream.from_string("some_data"),
|
|
|
|
|
meta={"data": 1},
|
|
|
|
|
score=0.1,
|
|
|
|
|
embedding=[0.1, 0.2, 0.3],
|
|
|
|
|
sparse_embedding=SparseEmbedding([0, 2], [0.1, 0.3]),
|
|
|
|
|
)
|
|
|
|
|
res = cleaner.run(documents=[document])
|
|
|
|
|
|
|
|
|
|
assert len(res) == 1
|
|
|
|
|
assert len(res["documents"])
|
|
|
|
|
assert res["documents"][0].id == document.id
|
|
|
|
|
assert res["documents"][0].content == (
|
|
|
|
|
"This is a text with some words. There is a second sentence. And there is a third sentence."
|
|
|
|
|
)
|
|
|
|
|
assert res["documents"][0].blob == document.blob
|
|
|
|
|
assert res["documents"][0].meta == document.meta
|
|
|
|
|
assert res["documents"][0].score == document.score
|
|
|
|
|
assert res["documents"][0].embedding == document.embedding
|
|
|
|
|
assert res["documents"][0].sparse_embedding == document.sparse_embedding
|