diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index c92599f6a..bc9888d28 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -37,7 +37,7 @@ class PreProcessor(BasePreProcessor) #### process ```python -def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict] +def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict] ``` Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents. @@ -47,7 +47,7 @@ Perform document cleaning and splitting. Can take a single document or a list of #### clean ```python -def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict +def clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str]) -> dict ``` Perform document cleaning on a single document and return a single document. 
This method will deal with whitespaces, headers, footers diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py index 346227782..57229b64a 100644 --- a/haystack/nodes/preprocessor/preprocessor.py +++ b/haystack/nodes/preprocessor/preprocessor.py @@ -43,6 +43,7 @@ class PreProcessor(BasePreProcessor): clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, + remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, @@ -56,6 +57,7 @@ class PreProcessor(BasePreProcessor): or similar. :param clean_whitespace: Strip whitespaces before or after each line in the text. :param clean_empty_lines: Remove more than two empty lines in the text. + :param remove_substrings: Remove specified substrings from the text. :param split_by: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting. :param split_length: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by -> "sentence", then each output document will have 10 sentences. 
@@ -76,6 +78,7 @@ class PreProcessor(BasePreProcessor): clean_whitespace=clean_whitespace, clean_header_footer=clean_header_footer, clean_empty_lines=clean_empty_lines, + remove_substrings=remove_substrings, split_by=split_by, split_length=split_length, split_overlap=split_overlap, @@ -90,6 +93,7 @@ class PreProcessor(BasePreProcessor): self.clean_whitespace = clean_whitespace self.clean_header_footer = clean_header_footer self.clean_empty_lines = clean_empty_lines + self.remove_substrings = remove_substrings self.split_by = split_by self.split_length = split_length self.split_overlap = split_overlap @@ -103,6 +107,7 @@ class PreProcessor(BasePreProcessor): clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, + remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, @@ -117,6 +122,7 @@ class PreProcessor(BasePreProcessor): "clean_whitespace": clean_whitespace, "clean_header_footer": clean_header_footer, "clean_empty_lines": clean_empty_lines, + "remove_substrings": remove_substrings, "split_by": split_by, "split_length": split_length, "split_overlap": split_overlap, @@ -141,6 +147,7 @@ class PreProcessor(BasePreProcessor): clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, + remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, @@ -153,6 +160,8 @@ class PreProcessor(BasePreProcessor): clean_header_footer = self.clean_header_footer if clean_empty_lines is None: clean_empty_lines = self.clean_empty_lines + if not remove_substrings: + remove_substrings = self.remove_substrings if split_by is None: split_by = self.split_by if split_length is None: @@ -167,6 +176,7 @@ class PreProcessor(BasePreProcessor): clean_whitespace=clean_whitespace, 
clean_header_footer=clean_header_footer, clean_empty_lines=clean_empty_lines, + remove_substrings=remove_substrings, ) split_documents = self.split( document=cleaned_document, @@ -181,7 +191,14 @@ class PreProcessor(BasePreProcessor): nested_docs = [self._process_single(d, **kwargs) for d in tqdm(documents, unit="docs")] return [d for x in nested_docs for d in x] - def clean(self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict: + def clean( + self, + document: dict, + clean_whitespace: bool, + clean_header_footer: bool, + clean_empty_lines: bool, + remove_substrings: List[str], + ) -> dict: """ Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__(). @@ -204,6 +221,9 @@ class PreProcessor(BasePreProcessor): if clean_empty_lines: text = re.sub(r"\n\n+", "\n\n", text) + for substring in remove_substrings: + text = text.replace(substring, "") + document["content"] = text return document diff --git a/json-schemas/haystack-pipeline-1.1.0.schema.json b/json-schemas/haystack-pipeline-1.1.0.schema.json index 6352802fc..088561ddc 100644 --- a/json-schemas/haystack-pipeline-1.1.0.schema.json +++ b/json-schemas/haystack-pipeline-1.1.0.schema.json @@ -1381,6 +1381,14 @@ "default": true, "type": "boolean" }, + "remove_substrings": { + "title": "Remove Substrings", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, "split_by": { "title": "Split By", "default": "word", diff --git a/json-schemas/haystack-pipeline-1.2.1rc0.schema.json b/json-schemas/haystack-pipeline-1.2.1rc0.schema.json index 6312ed505..45353fcc7 100644 --- a/json-schemas/haystack-pipeline-1.2.1rc0.schema.json +++ b/json-schemas/haystack-pipeline-1.2.1rc0.schema.json @@ -1432,6 +1432,14 @@ "default": true, "type": "boolean" }, + "remove_substrings": { + 
"title": "Remove Substrings", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, "split_by": { "title": "Split By", "default": "word", diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py index a3f8c6c05..2e56a9c7e 100644 --- a/test/test_preprocessor.py +++ b/test/test_preprocessor.py @@ -88,3 +88,23 @@ def test_clean_header_footer(): assert "This is a header." not in documents[0]["content"] assert "footer" not in documents[0]["content"] + + +def test_remove_substrings(): + document = Document("This is a header. Some additional text. wiki. Some emoji ✨ 🪲 Weird whitespace\b\b\b.") + + # check that the input contains the substrings we are about to remove, plus text that must survive + assert "This is a header." in document["content"] + assert "wiki" in document["content"] + assert "🪲" in document["content"] + assert "whitespace" in document["content"] + assert "✨" in document["content"] + + preprocessor = PreProcessor(remove_substrings=["This is a header.", "wiki", "🪲"]) + documents = preprocessor.process(document) + # verify the output returned by process(), not the input document (which would only test in-place mutation) + assert "This is a header." not in documents[0]["content"] + assert "wiki" not in documents[0]["content"] + assert "🪲" not in documents[0]["content"] + assert "whitespace" in documents[0]["content"] + assert "✨" in documents[0]["content"]