feat: Add TextCleaner component (#6997)

* Add TextCleaner component
* Update docstrings and simplify run logic
* Update docstrings
2026-01-06 12:07:04 +00:00 · 2024-02-15 16:10:38 +01:00 · 2024-02-15 16:10:38 +01:00 · c82f787b41
commit c82f787b41
parent 2a4e6a1de2
4 changed files with 147 additions and 3 deletions
--- a/haystack/components/preprocessors/__init__.py
+++ b/haystack/components/preprocessors/__init__.py
@@ -1,4 +1,5 @@
-from haystack.components.preprocessors.document_cleaner import DocumentCleaner
-from haystack.components.preprocessors.document_splitter import DocumentSplitter
+from .document_cleaner import DocumentCleaner
+from .document_splitter import DocumentSplitter
+from .text_cleaner import TextCleaner

-__all__ = ["DocumentSplitter", "DocumentCleaner"]
+__all__ = ["DocumentSplitter", "DocumentCleaner", "TextCleaner"]
--- a/haystack/components/preprocessors/text_cleaner.py
+++ b/haystack/components/preprocessors/text_cleaner.py
@@ -0,0 +1,67 @@
+import re
+import string
+from typing import Any, Dict, List, Optional
+
+from haystack import component
+
+
+@component
+class TextCleaner:
+    """
+    A preprocessor component that cleans a list of strings. In order, it removes substrings matching any of the
+    given regular expressions (case-insensitively), converts text to lowercase, and removes ASCII punctuation
+    and digits. Every step is optional and off by default. This is useful to cleanup text data before evaluation.
+    """
+
+    def __init__(
+        self,
+        remove_regexps: Optional[List[str]] = None,
+        convert_to_lowercase: bool = False,
+        remove_punctuation: bool = False,
+        remove_numbers: bool = False,
+    ):
+        """
+        Creates a new instance of TextCleaner.
+
+        :param remove_regexps: A list of regular expressions. If provided, it removes substrings matching
+            any of these regular expressions, ignoring case, from the text. Defaults to None.
+        :param convert_to_lowercase: If True, converts all characters to lowercase. Defaults to False.
+        :param remove_punctuation: If True, removes ASCII punctuation (`string.punctuation`). Defaults to False.
+        :param remove_numbers: If True, removes the ASCII digits 0-9 (`string.digits`). Defaults to False.
+        """
+        self._remove_regexps = remove_regexps
+        self._convert_to_lowercase = convert_to_lowercase
+        self._remove_punctuation = remove_punctuation
+        self._remove_numbers = remove_numbers
+
+        self._regex = None
+        if remove_regexps:  # merge all patterns into one alternation so run() makes a single sub() call per text
+            self._regex = re.compile("|".join(remove_regexps), flags=re.IGNORECASE)
+        to_remove = ""  # characters that the translation table will delete
+        if remove_punctuation:
+            to_remove = string.punctuation
+        if remove_numbers:
+            to_remove += string.digits
+        # A None translator tells run() there is nothing to delete, so the translate step is skipped
+        self._translator = str.maketrans("", "", to_remove) if to_remove else None
+
+    @component.output_types(texts=List[str])
+    def run(self, texts: List[str]) -> Dict[str, Any]:
+        r"""
+        Cleans each string: regular-expression removal first, then lowercasing, then punctuation/digit removal.
+
+        :param texts: List of strings to clean.
+        :returns: A dictionary with the following outputs:
+                * `texts` - The cleaned list of strings.
+        """
+
+        if self._regex:
+            texts = [self._regex.sub("", text) for text in texts]
+
+        if self._convert_to_lowercase:
+            texts = [text.lower() for text in texts]
+
+        if self._translator:
+            texts = [text.translate(self._translator) for text in texts]
+
+        return {"texts": texts}  # the same list object as the input when no cleaning step is enabled
--- a/releasenotes/notes/text-cleaner-eee0eecbdec21427.yaml
+++ b/releasenotes/notes/text-cleaner-eee0eecbdec21427.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    Add `TextCleaner` component to clean a list of strings. It can remove substrings matching a list of regular expressions,
+    convert text to lowercase, remove punctuation, and remove numbers.
+    This is mostly useful to clean generator predictions before evaluation.
--- a/test/components/preprocessors/test_text_cleaner.py
+++ b/test/components/preprocessors/test_text_cleaner.py
@@ -0,0 +1,70 @@
+from haystack.components.preprocessors import TextCleaner
+
+
+def test_init_default():  # default construction: every option off, no regex or translation table built
+    cleaner = TextCleaner()
+    assert cleaner._remove_regexps is None
+    assert not cleaner._convert_to_lowercase
+    assert not cleaner._remove_punctuation
+    assert not cleaner._remove_numbers
+    assert cleaner._regex is None
+    assert cleaner._translator is None
+
+
+def test_run():  # with no option enabled, the texts come back unchanged
+    cleaner = TextCleaner()
+    texts = ["Some text", "Some other text", "Yet another text"]
+    result = cleaner.run(texts=texts)
+    assert len(result) == 1  # "texts" is the only output key
+    assert result["texts"] == texts
+
+
+def test_run_with_empty_inputs():  # an empty input list yields an empty output list
+    cleaner = TextCleaner()
+    result = cleaner.run(texts=[])
+    assert len(result) == 1
+    assert result["texts"] == []
+
+
+def test_run_with_regex():  # a single pattern: the digit run in "Open123" is removed
+    cleaner = TextCleaner(remove_regexps=[r"\d+"])
+    result = cleaner.run(texts=["Open123 Source", "HaystackAI"])
+    assert len(result) == 1
+    assert result["texts"] == ["Open Source", "HaystackAI"]  # a text without matches is left as is
+
+
+def test_run_with_multiple_regexps():  # both patterns apply: digit runs and non-word, non-whitespace characters
+    cleaner = TextCleaner(remove_regexps=[r"\d+", r"[^\w\s]"])
+    result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
+    assert len(result) == 1
+    assert result["texts"] == ["Open Source", "HaystackAI"]
+
+
+def test_run_with_convert_to_lowercase():  # only casing changes; digits and punctuation are kept
+    cleaner = TextCleaner(convert_to_lowercase=True)
+    result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
+    assert len(result) == 1
+    assert result["texts"] == ["open123! source", "haystack.ai"]
+
+
+def test_run_with_remove_punctuation():  # "!" and "." are removed; digits and casing are kept
+    cleaner = TextCleaner(remove_punctuation=True)
+    result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
+    assert len(result) == 1
+    assert result["texts"] == ["Open123 Source", "HaystackAI"]
+
+
+def test_run_with_remove_numbers():  # digits are removed; punctuation and casing are kept
+    cleaner = TextCleaner(remove_numbers=True)
+    result = cleaner.run(texts=["Open123! Source", "Haystack.AI"])
+    assert len(result) == 1
+    assert result["texts"] == ["Open! Source", "Haystack.AI"]
+
+
+def test_run_with_multiple_parameters():  # every option enabled at once
+    cleaner = TextCleaner(
+        remove_regexps=[r"\d+", r"[^\w\s]"], convert_to_lowercase=True, remove_punctuation=True, remove_numbers=True
+    )
+    result = cleaner.run(texts=["Open%123. !$Source", "Haystack.AI##"])
+    assert len(result) == 1
+    assert result["texts"] == ["open source", "haystackai"]  # here the regexps already strip all digits and symbols