Fix PreProcessor test (#2290)

* Adding Document import, missing from recent PR

* Fix mypy signature warning too

* Reduce diff to minimum

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Sara Zan 2022-03-09 13:46:47 +01:00 committed by GitHub
parent 004e7f33af
commit e85b948a4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 9 additions and 2 deletions

View File

@@ -15,7 +15,7 @@ class BasePreProcessor(BaseComponent)
#### process
```python
-def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
+def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
```
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.

View File

@@ -12,6 +12,7 @@ class BasePreProcessor(BaseComponent):
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
+remove_substrings: List[str] = [],
split_by: Optional[str] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
@@ -23,7 +24,12 @@ class BasePreProcessor(BaseComponent):
raise NotImplementedError
def clean(
-self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool
+self,
+document: dict,
+clean_whitespace: bool,
+clean_header_footer: bool,
+clean_empty_lines: bool,
+remove_substrings: List[str],
) -> Dict[str, Any]:
raise NotImplementedError

View File

@@ -1,5 +1,6 @@
from pathlib import Path
+from haystack import Document
from haystack.nodes.file_converter.pdf import PDFToTextConverter
from haystack.nodes.preprocessor.preprocessor import PreProcessor