mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-06 08:30:31 +00:00
71 lines
2.3 KiB
Python
71 lines
2.3 KiB
Python
![]() |
from haystack.components.preprocessors import TextCleaner
|
||
|
|
||
|
|
||
|
def test_init_default():
|
||
|
cleaner = TextCleaner()
|
||
|
assert cleaner._remove_regexps is None
|
||
|
assert not cleaner._convert_to_lowercase
|
||
|
assert not cleaner._remove_punctuation
|
||
|
assert not cleaner._remove_numbers
|
||
|
assert cleaner._regex is None
|
||
|
assert cleaner._translator is None
|
||
|
|
||
|
|
||
|
def test_run():
|
||
|
cleaner = TextCleaner()
|
||
|
texts = ["Some text", "Some other text", "Yet another text"]
|
||
|
result = cleaner.run(texts=texts)
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == texts
|
||
|
|
||
|
|
||
|
def test_run_with_empty_inputs():
|
||
|
cleaner = TextCleaner()
|
||
|
result = cleaner.run(texts=[])
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == []
|
||
|
|
||
|
|
||
|
def test_run_with_regex():
|
||
|
cleaner = TextCleaner(remove_regexps=[r"\d+"])
|
||
|
result = cleaner.run(texts=["Open123 Source", "HaystackAI"])
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == ["Open Source", "HaystackAI"]
|
||
|
|
||
|
|
||
|
def test_run_with_multiple_regexps():
|
||
|
cleaner = TextCleaner(remove_regexps=[r"\d+", r"[^\w\s]"])
|
||
|
result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == ["Open Source", "HaystackAI"]
|
||
|
|
||
|
|
||
|
def test_run_with_convert_to_lowercase():
|
||
|
cleaner = TextCleaner(convert_to_lowercase=True)
|
||
|
result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == ["open123! source", "haystack.ai"]
|
||
|
|
||
|
|
||
|
def test_run_with_remove_punctuation():
|
||
|
cleaner = TextCleaner(remove_punctuation=True)
|
||
|
result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == ["Open123 Source", "HaystackAI"]
|
||
|
|
||
|
|
||
|
def test_run_with_remove_numbers():
|
||
|
cleaner = TextCleaner(remove_numbers=True)
|
||
|
result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == ["Open! Source", "Haystack.AI"]
|
||
|
|
||
|
|
||
|
def test_run_with_multiple_parameters():
|
||
|
cleaner = TextCleaner(
|
||
|
remove_regexps=[r"\d+", r"[^\w\s]"], convert_to_lowercase=True, remove_punctuation=True, remove_numbers=True
|
||
|
)
|
||
|
result = cleaner.run(texts=["Open%123. !$Source", "Haystack.AI##"])
|
||
|
assert len(result) == 1
|
||
|
assert result["texts"] == ["open source", "haystackai"]
|