mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 12:07:04 +00:00
feat: Add TextCleaner component (#6997)
* Add TextCleaner component * Update docstrings and simplify run logic * Update docstrings
This commit is contained in:
parent
2a4e6a1de2
commit
c82f787b41
@ -1,4 +1,5 @@
|
||||
from haystack.components.preprocessors.document_cleaner import DocumentCleaner
|
||||
from haystack.components.preprocessors.document_splitter import DocumentSplitter
|
||||
from .document_cleaner import DocumentCleaner
|
||||
from .document_splitter import DocumentSplitter
|
||||
from .text_cleaner import TextCleaner
|
||||
|
||||
__all__ = ["DocumentSplitter", "DocumentCleaner"]
|
||||
__all__ = ["DocumentSplitter", "DocumentCleaner", "TextCleaner"]
|
||||
|
||||
67
haystack/components/preprocessors/text_cleaner.py
Normal file
67
haystack/components/preprocessors/text_cleaner.py
Normal file
@ -0,0 +1,67 @@
|
||||
import re
|
||||
import string
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from haystack import component
|
||||
|
||||
|
||||
@component
class TextCleaner:
    """
    Preprocessor component that cleans a list of plain strings.

    Depending on how it is configured, it strips substrings that match user-supplied regular expressions,
    lowercases the text, and drops punctuation and/or digit characters.
    This is useful to clean up text data before evaluation.
    """

    def __init__(
        self,
        remove_regexps: Optional[List[str]] = None,
        convert_to_lowercase: bool = False,
        remove_punctuation: bool = False,
        remove_numbers: bool = False,
    ):
        """
        Creates a new instance of TextCleaner.

        :param remove_regexps: Regular expressions whose matches are deleted from the text.
            The patterns are combined into a single alternation and matched case-insensitively.
            Defaults to None, which disables regex-based removal.
        :param convert_to_lowercase: If True, lowercases every string. Defaults to False.
        :param remove_punctuation: If True, deletes the characters in `string.punctuation`. Defaults to False.
        :param remove_numbers: If True, deletes the characters in `string.digits`. Defaults to False.
        """
        self._remove_regexps = remove_regexps
        self._convert_to_lowercase = convert_to_lowercase
        self._remove_punctuation = remove_punctuation
        self._remove_numbers = remove_numbers

        # Compile one combined pattern up front so `run` does not rebuild it on every call.
        self._regex = re.compile("|".join(remove_regexps), flags=re.IGNORECASE) if remove_regexps else None

        # Collect every character class that has to be deleted; punctuation first, then digits.
        character_sets = ((remove_punctuation, string.punctuation), (remove_numbers, string.digits))
        discarded_chars = "".join(chars for enabled, chars in character_sets if enabled)

        # A deletion-only translation table; None means there is nothing to strip.
        self._translator = str.maketrans("", "", discarded_chars) if discarded_chars else None

    @component.output_types(texts=List[str])
    def run(self, texts: List[str]) -> Dict[str, Any]:
        r"""
        Run the TextCleaner on the given list of strings.

        The enabled steps are applied in a fixed order: regex removal, lowercasing, then
        punctuation/digit stripping. When no step is enabled the input is returned unchanged.

        :param texts: List of strings to clean.
        :returns: A dictionary with the following outputs:
            * `texts` - The cleaned list of strings.
        """
        # Build the pipeline of enabled transformations, preserving the documented order.
        steps = []
        if self._regex:
            steps.append(lambda value: self._regex.sub("", value))
        if self._convert_to_lowercase:
            steps.append(lambda value: value.lower())
        if self._translator:
            steps.append(lambda value: value.translate(self._translator))

        # Each step is applied to the whole list before the next one starts.
        for step in steps:
            texts = [step(value) for value in texts]

        return {"texts": texts}
|
||||
6
releasenotes/notes/text-cleaner-eee0eecbdec21427.yaml
Normal file
6
releasenotes/notes/text-cleaner-eee0eecbdec21427.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add `TextCleaner` component to clean a list of strings. It can remove substrings matching a list of regular expressions,
|
||||
convert text to lowercase, remove punctuation, and remove numbers.
|
||||
This is mostly useful to clean generator predictions before evaluation.
|
||||
70
test/components/preprocessors/test_text_cleaner.py
Normal file
70
test/components/preprocessors/test_text_cleaner.py
Normal file
@ -0,0 +1,70 @@
|
||||
from haystack.components.preprocessors import TextCleaner
|
||||
|
||||
|
||||
def test_init_default():
    """A TextCleaner built without arguments has every cleaning step disabled."""
    text_cleaner = TextCleaner()

    # No regexps were given, so nothing is stored and nothing is compiled.
    assert text_cleaner._remove_regexps is None
    assert text_cleaner._regex is None

    # All boolean switches are off by default.
    assert not text_cleaner._convert_to_lowercase
    assert not text_cleaner._remove_punctuation
    assert not text_cleaner._remove_numbers

    # With no characters to strip, no translation table is built.
    assert text_cleaner._translator is None
|
||||
|
||||
|
||||
def test_run():
    """With default settings the texts come back unchanged under the single `texts` key."""
    texts = ["Some text", "Some other text", "Yet another text"]
    output = TextCleaner().run(texts=texts)
    assert len(output) == 1
    assert output["texts"] == texts
|
||||
|
||||
|
||||
def test_run_with_empty_inputs():
    """An empty input list yields an empty `texts` output."""
    output = TextCleaner().run(texts=[])
    assert len(output) == 1
    assert output["texts"] == []
|
||||
|
||||
|
||||
def test_run_with_regex():
    """A single regexp removes its matches (here: digit runs) from every text."""
    text_cleaner = TextCleaner(remove_regexps=[r"\d+"])
    expected = ["Open Source", "HaystackAI"]
    output = text_cleaner.run(texts=["Open123 Source", "HaystackAI"])
    assert len(output) == 1
    assert output["texts"] == expected
|
||||
|
||||
|
||||
def test_run_with_multiple_regexps():
    """Several regexps are all applied: digit runs and non-word, non-space characters are removed."""
    patterns = [r"\d+", r"[^\w\s]"]
    text_cleaner = TextCleaner(remove_regexps=patterns)
    expected = ["Open Source", "HaystackAI"]
    output = text_cleaner.run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == expected
|
||||
|
||||
|
||||
def test_run_with_convert_to_lowercase():
    """Lowercasing alone leaves digits and punctuation in place."""
    text_cleaner = TextCleaner(convert_to_lowercase=True)
    expected = ["open123! source", "haystack.ai"]
    output = text_cleaner.run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == expected
|
||||
|
||||
|
||||
def test_run_with_remove_punctuation():
    """Punctuation removal alone keeps letter case and digits intact."""
    text_cleaner = TextCleaner(remove_punctuation=True)
    expected = ["Open123 Source", "HaystackAI"]
    output = text_cleaner.run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == expected
|
||||
|
||||
|
||||
def test_run_with_remove_numbers():
    """Digit removal alone keeps letter case and punctuation intact."""
    text_cleaner = TextCleaner(remove_numbers=True)
    expected = ["Open! Source", "Haystack.AI"]
    output = text_cleaner.run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == expected
|
||||
|
||||
|
||||
def test_run_with_multiple_parameters():
    """All options enabled together: regex removal, lowercasing, punctuation and digit stripping."""
    options = {
        "remove_regexps": [r"\d+", r"[^\w\s]"],
        "convert_to_lowercase": True,
        "remove_punctuation": True,
        "remove_numbers": True,
    }
    text_cleaner = TextCleaner(**options)
    expected = ["open source", "haystackai"]
    output = text_cleaner.run(texts=["Open%123. !$Source", "Haystack.AI##"])
    assert len(output) == 1
    assert output["texts"] == expected
|
||||
Loading…
x
Reference in New Issue
Block a user