Make PreProcessor.process() work on lists of documents (#1163)

* Add process_batch method

* Rename methods

* Fix doc string, satisfy mypy

* Fix mypy CI

* Fix typo

* Update tutorial

* Fix argument name

* Change arg name

* Incorporate reviewer feedback
This commit is contained in:
Branden Chan 2021-06-23 18:13:51 +02:00 committed by GitHub
parent afee4f36ce
commit efc03f72db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 57 additions and 8 deletions

View File

@ -1,4 +1,4 @@
from typing import List, Dict, Any, Optional
from typing import List, Dict, Any, Optional, Union
from haystack import BaseComponent
@ -8,7 +8,7 @@ class BasePreProcessor(BaseComponent):
def process(
self,
document: dict,
documents: Union[dict, List[dict]],
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
@ -50,7 +50,7 @@ class BasePreProcessor(BaseComponent):
**kwargs,
):
documents = self.process(
document=document,
documents=document,
clean_whitespace=clean_whitespace,
clean_header_footer=clean_header_footer,
clean_empty_lines=clean_empty_lines,

View File

@ -3,7 +3,7 @@ import re
from copy import deepcopy
from functools import partial, reduce
from itertools import chain
from typing import List, Optional, Generator, Set
from typing import List, Optional, Generator, Set, Union
import nltk
from more_itertools import windowed
@ -92,7 +92,7 @@ class PreProcessor(BasePreProcessor):
def process(
self,
document: dict,
documents: Union[dict, List[dict]],
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
@ -101,9 +101,51 @@ class PreProcessor(BasePreProcessor):
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
) -> List[dict]:
"""
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
"""
kwargs = {
"clean_whitespace": clean_whitespace,
"clean_header_footer": clean_header_footer,
"clean_empty_lines": clean_empty_lines,
"split_by": split_by,
"split_length": split_length,
"split_overlap": split_overlap,
"split_respect_sentence_boundary": split_respect_sentence_boundary
}
ret = []
if type(documents) == dict:
ret = self._process_single(
document=documents,
**kwargs #type: ignore
)
elif type(documents) == list:
ret = self._process_batch(
documents=list(documents),
**kwargs
)
else:
raise Exception("documents provided to PreProcessor.prepreprocess() is not of type list nor Document")
return ret
def _process_single(
self,
document,
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
) -> List[dict]:
if clean_whitespace is None:
clean_whitespace = self.clean_whitespace
if clean_header_footer is None:
@ -134,6 +176,14 @@ class PreProcessor(BasePreProcessor):
)
return split_documents
def _process_batch(
    self,
    documents: List[dict],
    **kwargs
) -> List[dict]:
    """
    Run `self.process` on every entry of `documents` and merge the results into one flat list.

    :param documents: The documents to process, handled one at a time in the given order.
    :param kwargs: Keyword arguments forwarded unchanged to each `self.process` call.
    :return: A single list holding the items of every per-document result, in input order.
    """
    # Process every document first, so flattening only starts once all calls have succeeded.
    per_document_results = []
    for doc in documents:
        per_document_results.append(self.process(doc, **kwargs))
    # Collapse the list of per-document result lists into one flat list.
    return list(chain.from_iterable(per_document_results))
def clean(
self,
document: dict,

View File

@ -455,8 +455,7 @@
" split_length=100,\n",
" split_respect_sentence_boundary=True\n",
")\n",
"nested_docs = [preprocessor.process(d) for d in all_docs]\n",
"docs = [d for x in nested_docs for d in x]\n",
"docs = preprocessor.process(all_docs)\n",
"\n",
"print(f\"n_files_input: {len(all_docs)}\\nn_docs_output: {len(docs)}\")"
],