mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-03 11:19:57 +00:00
Fix PreProcessor test (#2290)
* Adding Document import, missing from recent PR
* Fix mypy signature warning too
* Reduce diff to minimum
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
004e7f33af
commit
e85b948a4c
@@ -15,7 +15,7 @@ class BasePreProcessor(BaseComponent)
|
||||
#### process
|
||||
|
||||
```python
|
||||
def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
|
||||
def process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
|
||||
```
|
||||
|
||||
Perform document cleaning and splitting. Takes a single document or a list of documents as input and returns a list of documents.
|
||||
|
||||
@@ -12,6 +12,7 @@ class BasePreProcessor(BaseComponent):
|
||||
clean_whitespace: Optional[bool] = True,
|
||||
clean_header_footer: Optional[bool] = False,
|
||||
clean_empty_lines: Optional[bool] = True,
|
||||
remove_substrings: List[str] = [],
|
||||
split_by: Optional[str] = "word",
|
||||
split_length: Optional[int] = 1000,
|
||||
split_overlap: Optional[int] = None,
|
||||
@@ -23,7 +24,12 @@ class BasePreProcessor(BaseComponent):
|
||||
raise NotImplementedError
|
||||
|
||||
def clean(
|
||||
self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool
|
||||
self,
|
||||
document: dict,
|
||||
clean_whitespace: bool,
|
||||
clean_header_footer: bool,
|
||||
clean_empty_lines: bool,
|
||||
remove_substrings: List[str],
|
||||
) -> Dict[str, Any]:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from pathlib import Path
|
||||
|
||||
from haystack import Document
|
||||
from haystack.nodes.file_converter.pdf import PDFToTextConverter
|
||||
from haystack.nodes.preprocessor.preprocessor import PreProcessor
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user