Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-11-01 10:19:23 +00:00)
Make PreProcessor.process() work on lists of documents (#1163)
* Add process_batch method
* Rename methods
* Fix doc string, satisfy mypy
* Fix mypy CI
* Fix typo
* Update tutorial
* Fix argument name
* Change arg name
* Incorporate reviewer feedback
parent afee4f36ce
commit efc03f72db
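At the call site, the change means PreProcessor.process() keeps accepting a single document dict and now also accepts a list of them, returning a flat list of cleaned and split documents either way. A minimal usage sketch, assuming the dict-based document format and import path of this Haystack version (document contents and file names are illustrative):

    from haystack.preprocessor import PreProcessor

    preprocessor = PreProcessor(
        clean_whitespace=True,
        split_by="word",
        split_length=100,
        split_respect_sentence_boundary=True,
    )

    doc1 = {"text": "First long document ...", "meta": {"name": "doc1.txt"}}
    doc2 = {"text": "Second long document ...", "meta": {"name": "doc2.txt"}}

    docs = preprocessor.process(doc1)          # single dict: unchanged behaviour, returns a list of splits
    docs = preprocessor.process([doc1, doc2])  # new: a whole list is processed in one call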
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Union

 from haystack import BaseComponent

@@ -8,7 +8,7 @@ class BasePreProcessor(BaseComponent):

     def process(
         self,
-        document: dict,
+        documents: Union[dict, List[dict]],
         clean_whitespace: Optional[bool] = True,
         clean_header_footer: Optional[bool] = False,
         clean_empty_lines: Optional[bool] = True,
@@ -50,7 +50,7 @@ class BasePreProcessor(BaseComponent):
         **kwargs,
     ):
         documents = self.process(
-            document=document,
+            documents=document,
             clean_whitespace=clean_whitespace,
             clean_header_footer=clean_header_footer,
             clean_empty_lines=clean_empty_lines,
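Note that BasePreProcessor.process() also renames the parameter from document to documents, so callers that passed it by keyword have to follow the rename. A before/after sketch (the preprocessor instance and documents are hypothetical):

    # before this commit
    docs = preprocessor.process(document=doc1)

    # after this commit
    docs = preprocessor.process(documents=doc1)          # a single dict is still accepted
    docs = preprocessor.process(documents=[doc1, doc2])  # a list of dicts now works too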
@@ -3,7 +3,7 @@ import re
 from copy import deepcopy
 from functools import partial, reduce
 from itertools import chain
-from typing import List, Optional, Generator, Set
+from typing import List, Optional, Generator, Set, Union

 import nltk
 from more_itertools import windowed
@@ -92,7 +92,7 @@ class PreProcessor(BasePreProcessor):

     def process(
         self,
-        document: dict,
+        documents: Union[dict, List[dict]],
         clean_whitespace: Optional[bool] = None,
         clean_header_footer: Optional[bool] = None,
         clean_empty_lines: Optional[bool] = None,
@@ -101,9 +101,51 @@ class PreProcessor(BasePreProcessor):
         split_overlap: Optional[int] = None,
         split_respect_sentence_boundary: Optional[bool] = None,
     ) -> List[dict]:
         """
-        Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
+        Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
         """
+        kwargs = {
+            "clean_whitespace": clean_whitespace,
+            "clean_header_footer": clean_header_footer,
+            "clean_empty_lines": clean_empty_lines,
+            "split_by": split_by,
+            "split_length": split_length,
+            "split_overlap": split_overlap,
+            "split_respect_sentence_boundary": split_respect_sentence_boundary
+        }
+
+        ret = []
+
+        if type(documents) == dict:
+            ret = self._process_single(
+                document=documents,
+                **kwargs #type: ignore
+            )
+        elif type(documents) == list:
+            ret = self._process_batch(
+                documents=list(documents),
+                **kwargs
+            )
+
+        else:
+            raise Exception("documents provided to PreProcessor.prepreprocess() is not of type list nor Document")
+
+        return ret
+
+    def _process_single(
+        self,
+        document,
+        clean_whitespace: Optional[bool] = None,
+        clean_header_footer: Optional[bool] = None,
+        clean_empty_lines: Optional[bool] = None,
+        split_by: Optional[str] = None,
+        split_length: Optional[int] = None,
+        split_overlap: Optional[int] = None,
+        split_respect_sentence_boundary: Optional[bool] = None,
+    ) -> List[dict]:
+
         if clean_whitespace is None:
             clean_whitespace = self.clean_whitespace
         if clean_header_footer is None:
@@ -134,6 +176,14 @@ class PreProcessor(BasePreProcessor):
         )
         return split_documents

+    def _process_batch(
+        self,
+        documents: List[dict],
+        **kwargs
+    ) -> List[dict]:
+        nested_docs = [self.process(d, **kwargs) for d in documents]
+        return [d for x in nested_docs for d in x]
+
     def clean(
         self,
         document: dict,
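With the dispatch above, a list input is handled by _process_batch(), which calls process() once per dict and flattens the nested lists, so the batched output should match the concatenated single-document outputs. A small sketch of that equivalence (documents are made up, and exact equality assumes process() adds no per-call metadata):

    single_a = preprocessor.process({"text": "first document ..."})
    single_b = preprocessor.process({"text": "second document ..."})

    batched = preprocessor.process([
        {"text": "first document ..."},
        {"text": "second document ..."},
    ])

    assert batched == single_a + single_b  # same splits, same order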
@@ -455,8 +455,7 @@
     "    split_length=100,\n",
     "    split_respect_sentence_boundary=True\n",
     ")\n",
-    "nested_docs = [preprocessor.process(d) for d in all_docs]\n",
-    "docs = [d for x in nested_docs for d in x]\n",
+    "docs = preprocessor.process(all_docs)\n",
     "\n",
     "print(f\"n_files_input: {len(all_docs)}\\nn_docs_output: {len(docs)}\")"
    ],
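Decoded from the JSON-escaped notebook cell above, the tutorial change replaces the manual flatten of per-file results with a single call (all_docs is whatever list of document dicts the notebook builds earlier):

    # old: one process() call per file, then flatten the list of lists
    nested_docs = [preprocessor.process(d) for d in all_docs]
    docs = [d for x in nested_docs for d in x]

    # new: one call on the whole list does the same thing
    docs = preprocessor.process(all_docs)

    print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")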