From efc03f72dbd59fd7ad2eedbf162cfc4f6c159ad6 Mon Sep 17 00:00:00 2001 From: Branden Chan <33759007+brandenchan@users.noreply.github.com> Date: Wed, 23 Jun 2021 18:13:51 +0200 Subject: [PATCH] Make PreProcessor.process() work on lists of documents (#1163) * Add process_batch method * Rename methods * Fix doc string, satisfy mypy * Fix mypy CI * Fix typp * Update tutorial * Fix argument name * Change arg name * Incorporate reviewer feedback --- haystack/preprocessor/base.py | 6 +-- haystack/preprocessor/preprocessor.py | 56 +++++++++++++++++++++++-- tutorials/Tutorial8_Preprocessing.ipynb | 3 +- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/haystack/preprocessor/base.py b/haystack/preprocessor/base.py index 6acaf14c2..6e77235db 100644 --- a/haystack/preprocessor/base.py +++ b/haystack/preprocessor/base.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any, Optional, Union from haystack import BaseComponent @@ -8,7 +8,7 @@ class BasePreProcessor(BaseComponent): def process( self, - document: dict, + documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, @@ -50,7 +50,7 @@ class BasePreProcessor(BaseComponent): **kwargs, ): documents = self.process( - document=document, + documents=document, clean_whitespace=clean_whitespace, clean_header_footer=clean_header_footer, clean_empty_lines=clean_empty_lines, diff --git a/haystack/preprocessor/preprocessor.py b/haystack/preprocessor/preprocessor.py index 7ec1433c7..95fb7aa52 100644 --- a/haystack/preprocessor/preprocessor.py +++ b/haystack/preprocessor/preprocessor.py @@ -3,7 +3,7 @@ import re from copy import deepcopy from functools import partial, reduce from itertools import chain -from typing import List, Optional, Generator, Set +from typing import List, Optional, Generator, Set, Union import nltk from more_itertools import windowed @@ -92,7 +92,7 @@ class PreProcessor(BasePreProcessor): def process( self, - document: dict, + documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, @@ -101,9 +101,51 @@ class PreProcessor(BasePreProcessor): split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, ) -> List[dict]: + """ - Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. + Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents. """ + + kwargs = { + "clean_whitespace": clean_whitespace, + "clean_header_footer": clean_header_footer, + "clean_empty_lines": clean_empty_lines, + "split_by": split_by, + "split_length": split_length, + "split_overlap": split_overlap, + "split_respect_sentence_boundary": split_respect_sentence_boundary + } + + ret = [] + + if type(documents) == dict: + ret = self._process_single( + document=documents, + **kwargs #type: ignore + ) + elif type(documents) == list: + ret = self._process_batch( + documents=list(documents), + **kwargs + ) + + else: + raise Exception("documents provided to PreProcessor.prepreprocess() is not of type list nor Document") + + return ret + + def _process_single( + self, + document, + clean_whitespace: Optional[bool] = None, + clean_header_footer: Optional[bool] = None, + clean_empty_lines: Optional[bool] = None, + split_by: Optional[str] = None, + split_length: Optional[int] = None, + split_overlap: Optional[int] = None, + split_respect_sentence_boundary: Optional[bool] = None, + ) -> List[dict]: + if clean_whitespace is None: clean_whitespace = self.clean_whitespace if clean_header_footer is None: @@ -134,6 +176,14 @@ class PreProcessor(BasePreProcessor): ) return split_documents + def _process_batch( + self, + documents: List[dict], + **kwargs + ) -> List[dict]: + nested_docs = [self.process(d, **kwargs) for d in documents] + return [d for x in nested_docs for d in x] + def clean( self, document: dict, diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb index b6417d623..44c195c21 100644 --- a/tutorials/Tutorial8_Preprocessing.ipynb +++ b/tutorials/Tutorial8_Preprocessing.ipynb @@ -455,8 +455,7 @@ " split_length=100,\n", " split_respect_sentence_boundary=True\n", ")\n", - "nested_docs = [preprocessor.process(d) for d in all_docs]\n", - "docs = [d for x in nested_docs for d in x]\n", + "docs = preprocessor.process(all_docs)\n", "\n", "print(f\"n_files_input: {len(all_docs)}\\nn_docs_output: {len(docs)}\")" ],