mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-04 11:49:23 +00:00
Make PreProcessor.process() work on lists of documents (#1163)
* Add process_batch method * Rename methods * Fix docstring, satisfy mypy * Fix mypy CI * Fix typo * Update tutorial * Fix argument name * Change arg name * Incorporate reviewer feedback
This commit is contained in:
parent
afee4f36ce
commit
efc03f72db
@ -1,4 +1,4 @@
|
|||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional, Union
|
||||||
|
|
||||||
from haystack import BaseComponent
|
from haystack import BaseComponent
|
||||||
|
|
||||||
@ -8,7 +8,7 @@ class BasePreProcessor(BaseComponent):
|
|||||||
|
|
||||||
def process(
|
def process(
|
||||||
self,
|
self,
|
||||||
document: dict,
|
documents: Union[dict, List[dict]],
|
||||||
clean_whitespace: Optional[bool] = True,
|
clean_whitespace: Optional[bool] = True,
|
||||||
clean_header_footer: Optional[bool] = False,
|
clean_header_footer: Optional[bool] = False,
|
||||||
clean_empty_lines: Optional[bool] = True,
|
clean_empty_lines: Optional[bool] = True,
|
||||||
@ -50,7 +50,7 @@ class BasePreProcessor(BaseComponent):
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
documents = self.process(
|
documents = self.process(
|
||||||
document=document,
|
documents=document,
|
||||||
clean_whitespace=clean_whitespace,
|
clean_whitespace=clean_whitespace,
|
||||||
clean_header_footer=clean_header_footer,
|
clean_header_footer=clean_header_footer,
|
||||||
clean_empty_lines=clean_empty_lines,
|
clean_empty_lines=clean_empty_lines,
|
||||||
|
|||||||
@ -3,7 +3,7 @@ import re
|
|||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from functools import partial, reduce
|
from functools import partial, reduce
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from typing import List, Optional, Generator, Set
|
from typing import List, Optional, Generator, Set, Union
|
||||||
|
|
||||||
import nltk
|
import nltk
|
||||||
from more_itertools import windowed
|
from more_itertools import windowed
|
||||||
@ -92,7 +92,7 @@ class PreProcessor(BasePreProcessor):
|
|||||||
|
|
||||||
def process(
|
def process(
|
||||||
self,
|
self,
|
||||||
document: dict,
|
documents: Union[dict, List[dict]],
|
||||||
clean_whitespace: Optional[bool] = None,
|
clean_whitespace: Optional[bool] = None,
|
||||||
clean_header_footer: Optional[bool] = None,
|
clean_header_footer: Optional[bool] = None,
|
||||||
clean_empty_lines: Optional[bool] = None,
|
clean_empty_lines: Optional[bool] = None,
|
||||||
@ -101,9 +101,51 @@ class PreProcessor(BasePreProcessor):
|
|||||||
split_overlap: Optional[int] = None,
|
split_overlap: Optional[int] = None,
|
||||||
split_respect_sentence_boundary: Optional[bool] = None,
|
split_respect_sentence_boundary: Optional[bool] = None,
|
||||||
) -> List[dict]:
|
) -> List[dict]:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
|
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
kwargs = {
|
||||||
|
"clean_whitespace": clean_whitespace,
|
||||||
|
"clean_header_footer": clean_header_footer,
|
||||||
|
"clean_empty_lines": clean_empty_lines,
|
||||||
|
"split_by": split_by,
|
||||||
|
"split_length": split_length,
|
||||||
|
"split_overlap": split_overlap,
|
||||||
|
"split_respect_sentence_boundary": split_respect_sentence_boundary
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = []
|
||||||
|
|
||||||
|
if type(documents) == dict:
|
||||||
|
ret = self._process_single(
|
||||||
|
document=documents,
|
||||||
|
**kwargs #type: ignore
|
||||||
|
)
|
||||||
|
elif type(documents) == list:
|
||||||
|
ret = self._process_batch(
|
||||||
|
documents=list(documents),
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise Exception("documents provided to PreProcessor.prepreprocess() is not of type list nor Document")
|
||||||
|
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def _process_single(
|
||||||
|
self,
|
||||||
|
document,
|
||||||
|
clean_whitespace: Optional[bool] = None,
|
||||||
|
clean_header_footer: Optional[bool] = None,
|
||||||
|
clean_empty_lines: Optional[bool] = None,
|
||||||
|
split_by: Optional[str] = None,
|
||||||
|
split_length: Optional[int] = None,
|
||||||
|
split_overlap: Optional[int] = None,
|
||||||
|
split_respect_sentence_boundary: Optional[bool] = None,
|
||||||
|
) -> List[dict]:
|
||||||
|
|
||||||
if clean_whitespace is None:
|
if clean_whitespace is None:
|
||||||
clean_whitespace = self.clean_whitespace
|
clean_whitespace = self.clean_whitespace
|
||||||
if clean_header_footer is None:
|
if clean_header_footer is None:
|
||||||
@ -134,6 +176,14 @@ class PreProcessor(BasePreProcessor):
|
|||||||
)
|
)
|
||||||
return split_documents
|
return split_documents
|
||||||
|
|
||||||
|
def _process_batch(
|
||||||
|
self,
|
||||||
|
documents: List[dict],
|
||||||
|
**kwargs
|
||||||
|
) -> List[dict]:
|
||||||
|
nested_docs = [self.process(d, **kwargs) for d in documents]
|
||||||
|
return [d for x in nested_docs for d in x]
|
||||||
|
|
||||||
def clean(
|
def clean(
|
||||||
self,
|
self,
|
||||||
document: dict,
|
document: dict,
|
||||||
|
|||||||
@ -455,8 +455,7 @@
|
|||||||
" split_length=100,\n",
|
" split_length=100,\n",
|
||||||
" split_respect_sentence_boundary=True\n",
|
" split_respect_sentence_boundary=True\n",
|
||||||
")\n",
|
")\n",
|
||||||
"nested_docs = [preprocessor.process(d) for d in all_docs]\n",
|
"docs = preprocessor.process(all_docs)\n",
|
||||||
"docs = [d for x in nested_docs for d in x]\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"print(f\"n_files_input: {len(all_docs)}\\nn_docs_output: {len(docs)}\")"
|
"print(f\"n_files_input: {len(all_docs)}\\nn_docs_output: {len(docs)}\")"
|
||||||
],
|
],
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user