Remove substrings basic implementation (#2152)

* Remove substrings basic implementation
* Update Documentation & Code Style
* Remove substrings basic tests
* Simplify test
2026-01-08 13:06:29 +00:00 · 2022-03-08 15:49:56 +01:00 · 2022-03-08 15:49:56 +01:00 · ecec9b4e2c
commit ecec9b4e2c
parent 6c0094b5ad
5 changed files with 59 additions and 3 deletions
--- a/docs/_src/api/api/preprocessor.md
+++ b/docs/_src/api/api/preprocessor.md
@ -37,7 +37,7 @@ class PreProcessor(BasePreProcessor)
 #### process

 ```python
-def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
+def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
 ```

 Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
@ -47,7 +47,7 @@ Perform document cleaning and splitting. Can take a single document or a list of
 #### clean

 ```python
-def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict
+def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str]) -> dict
 ```

 Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@ -43,6 +43,7 @@ class PreProcessor(BasePreProcessor):
        clean_whitespace: bool = True,
        clean_header_footer: bool = False,
        clean_empty_lines: bool = True,
+        remove_substrings: List[str] = [],
        split_by: str = "word",
        split_length: int = 200,
        split_overlap: int = 0,
@ -56,6 +57,7 @@ class PreProcessor(BasePreProcessor):
                                     or similar.
        :param clean_whitespace: Strip whitespaces before or after each line in the text.
        :param clean_empty_lines: Remove more than two empty lines in the text.
+        :param remove_substrings: Remove specified substrings from the text.
        :param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
        :param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
                           "sentence", then each output document will have 10 sentences.
@ -76,6 +78,7 @@ class PreProcessor(BasePreProcessor):
            clean_whitespace=clean_whitespace,
            clean_header_footer=clean_header_footer,
            clean_empty_lines=clean_empty_lines,
+            remove_substrings=remove_substrings,
            split_by=split_by,
            split_length=split_length,
            split_overlap=split_overlap,
@ -90,6 +93,7 @@ class PreProcessor(BasePreProcessor):
        self.clean_whitespace = clean_whitespace
        self.clean_header_footer = clean_header_footer
        self.clean_empty_lines = clean_empty_lines
+        self.remove_substrings = remove_substrings
        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
@ -103,6 +107,7 @@ class PreProcessor(BasePreProcessor):
        clean_whitespace: Optional[bool] = None,
        clean_header_footer: Optional[bool] = None,
        clean_empty_lines: Optional[bool] = None,
+        remove_substrings: List[str] = [],
        split_by: Optional[str] = None,
        split_length: Optional[int] = None,
        split_overlap: Optional[int] = None,
@ -117,6 +122,7 @@ class PreProcessor(BasePreProcessor):
            "clean_whitespace": clean_whitespace,
            "clean_header_footer": clean_header_footer,
            "clean_empty_lines": clean_empty_lines,
+            "remove_substrings": remove_substrings,
            "split_by": split_by,
            "split_length": split_length,
            "split_overlap": split_overlap,
@ -141,6 +147,7 @@ class PreProcessor(BasePreProcessor):
        clean_whitespace: Optional[bool] = None,
        clean_header_footer: Optional[bool] = None,
        clean_empty_lines: Optional[bool] = None,
+        remove_substrings: List[str] = [],
        split_by: Optional[str] = None,
        split_length: Optional[int] = None,
        split_overlap: Optional[int] = None,
@ -153,6 +160,8 @@ class PreProcessor(BasePreProcessor):
            clean_header_footer = self.clean_header_footer
        if clean_empty_lines is None:
            clean_empty_lines = self.clean_empty_lines
+        if not remove_substrings:
+            remove_substrings = self.remove_substrings
        if split_by is None:
            split_by = self.split_by
        if split_length is None:
@ -167,6 +176,7 @@ class PreProcessor(BasePreProcessor):
            clean_whitespace=clean_whitespace,
            clean_header_footer=clean_header_footer,
            clean_empty_lines=clean_empty_lines,
+            remove_substrings=remove_substrings,
        )
        split_documents = self.split(
            document=cleaned_document,
@ -181,7 +191,14 @@ class PreProcessor(BasePreProcessor):
        nested_docs = [self._process_single(d, **kwargs) for d in tqdm(documents, unit="docs")]
        return [d for x in nested_docs for d in x]

-    def clean(self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict:
+    def clean(
+        self,
+        document: dict,
+        clean_whitespace: bool,
+        clean_header_footer: bool,
+        clean_empty_lines: bool,
+        remove_substrings: List[str],
+    ) -> dict:
        """
        Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
        and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
@ -204,6 +221,9 @@ class PreProcessor(BasePreProcessor):
        if clean_empty_lines:
            text = re.sub(r"\n\n+", "\n\n", text)

+        for substring in remove_substrings:
+            text = text.replace(substring, "")
+
        document["content"] = text
        return document

--- a/json-schemas/haystack-pipeline-1.1.0.schema.json
+++ b/json-schemas/haystack-pipeline-1.1.0.schema.json
@ -1381,6 +1381,14 @@
              "default": true,
              "type": "boolean"
            },
+            "remove_substrings": {
+              "title": "Remove Substrings",
+              "default": [],
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            },
            "split_by": {
              "title": "Split By",
              "default": "word",
--- a/json-schemas/haystack-pipeline-1.2.1rc0.schema.json
+++ b/json-schemas/haystack-pipeline-1.2.1rc0.schema.json
@ -1432,6 +1432,14 @@
              "default": true,
              "type": "boolean"
            },
+            "remove_substrings": {
+              "title": "Remove Substrings",
+              "default": [],
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            },
            "split_by": {
              "title": "Split By",
              "default": "word",
--- a/test/test_preprocessor.py
+++ b/test/test_preprocessor.py
@ -88,3 +88,23 @@ def test_clean_header_footer():

    assert "This is a header." not in documents[0]["content"]
    assert "footer" not in documents[0]["content"]
+
+
+def test_remove_substrings():
+    """Check that PreProcessor drops the configured substrings from its output and keeps other text."""
+    document = Document("This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.")
+    # check that the file contains the substrings we are about to remove
+    assert "This is a header." in document["content"]
+    assert "wiki" in document["content"]
+    assert "🪲" in document["content"]
+    assert "whitespace" in document["content"]
+    assert "✨" in document["content"]
+
+    preprocessor = PreProcessor(remove_substrings=["This is a header.", "wiki", "🪲"])
+    documents = preprocessor.process(document)
+    # assert on the documents returned by process(), not on the input object
+    assert "This is a header." not in documents[0]["content"]
+    assert "wiki" not in documents[0]["content"]
+    assert "🪲" not in documents[0]["content"]
+    assert "whitespace" in documents[0]["content"]
+    assert "✨" in documents[0]["content"]