diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index bc9888d28..f650f031c 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -15,7 +15,7 @@ class BasePreProcessor(BaseComponent) #### process ```python -def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict] +def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict] ``` Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. diff --git a/haystack/nodes/preprocessor/base.py b/haystack/nodes/preprocessor/base.py index ce4f1dcef..f2d73a37e 100644 --- a/haystack/nodes/preprocessor/base.py +++ b/haystack/nodes/preprocessor/base.py @@ -12,6 +12,7 @@ class BasePreProcessor(BaseComponent): clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, + remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, @@ -23,7 +24,12 @@ class BasePreProcessor(BaseComponent): raise NotImplementedError def clean( - self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool + self, + document: dict, + clean_whitespace: bool, + clean_header_footer: bool, + clean_empty_lines: bool, + remove_substrings: List[str], ) -> Dict[str, Any]: raise NotImplementedError diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py index 2e56a9c7e..f9d006b30 100644 --- a/test/test_preprocessor.py +++ b/test/test_preprocessor.py @@ -1,5 +1,6 @@ from pathlib import Path +from haystack import Document from haystack.nodes.file_converter.pdf import PDFToTextConverter from haystack.nodes.preprocessor.preprocessor import PreProcessor