mirror of https://github.com/deepset-ai/haystack.git, synced 2025-11-11 15:23:41 +00:00
Add PreProcessor to simplify splitting and cleaning of docs (#473)
* Add PreProcessing
* Adjust PDF conversion tests
* Add tests for Preprocessing
* Add requirement
* Fix tests
* Ignore decoding errors for TextConverter
* Rename split_size to split_length
* Adjust tests

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
This commit is contained in:
parent 2e9f3c1512
commit 974b37eded
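To show how the pieces of this commit fit together, here is a minimal usage sketch: a file converter produces a single document dict, and the new PreProcessor cleans it and splits it into smaller documents. The input file name, meta values, and parameter choices below are illustrative only, not prescribed by this commit.

from pathlib import Path

from haystack.file_converter.txt import TextConverter
from haystack.preprocessor.preprocessor import PreProcessor

# 1) Convert a raw file into one document dict: {"text": ..., "meta": ...}
converter = TextConverter(remove_numeric_tables=False, valid_languages=["en"])
document = converter.convert(file_path=Path("sample.txt"), meta={"name": "sample.txt"})  # hypothetical file

# 2) Clean the text and split it into smaller documents for indexing
preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
documents = preprocessor.process(document)
print(len(documents), documents[0]["meta"])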
@@ -1,8 +1,6 @@
 from abc import abstractmethod
-from functools import partial, reduce
-from itertools import chain
 from pathlib import Path
-from typing import List, Optional, Tuple, Generator, Set, Dict, Any
+from typing import List, Optional, Dict, Any

 import langdetect

@@ -12,26 +10,13 @@ class BaseConverter:
     Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore.
     """

-    def __init__(
-        self,
-        remove_numeric_tables: Optional[bool] = None,
-        remove_header_footer: Optional[bool] = None,
-        remove_whitespace: Optional[bool] = None,
-        remove_empty_lines: Optional[bool] = None,
-        valid_languages: Optional[List[str]] = None,
-    ):
+    def __init__(self, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
             The tabular structures in documents might be noise for the reader model if it
             does not have table parsing capability for finding answers. However, tables
             may also have long strings that could possible candidate for searching answers.
             The rows containing strings are thus retained in this option.
-        :param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
-            for the longest common string. This heuristic uses exact matches and therefore
-            works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
-            or similar.
-        :param remove_whitespace: strip whitespaces before or after each line in the text.
-        :param remove_empty_lines: remove more than two empty lines in the text.
         :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
             (https://en.wikipedia.org/wiki/ISO_639-1) format.
             This option can be used to add test for encoding errors. If the extracted text is
@@ -39,9 +24,6 @@ class BaseConverter:
             in garbled text.
         """
         self.remove_numeric_tables = remove_numeric_tables
-        self.remove_header_footer = remove_header_footer
-        self.remove_whitespace = remove_whitespace
-        self.remove_empty_lines = remove_empty_lines
         self.valid_languages = valid_languages

     @abstractmethod
@@ -74,81 +56,4 @@ class BaseConverter:
         else:
             return False

-    def find_and_remove_header_footer(
-        self, pages: List[str], n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
-    ) -> Tuple[List[str], Optional[str], Optional[str]]:
-        """
-        Heuristic to find footers and headers across different pages by searching for the longest common string.
-        For headers we only search in the first n_chars characters (for footer: last n_chars).
-        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
-        but won't detect "Page 3 of 4" or similar.
-
-        :param pages: list of strings, one string per page
-        :param n_chars: number of first/last characters where the header/footer shall be searched in
-        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
-        :param n_last_pages_to_ignore: number of last pages to ignore
-        :return: (cleaned pages, found_header_str, found_footer_str)
-        """
-
-        # header
-        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_header = self._find_longest_common_ngram(start_of_pages)
-        if found_header:
-            pages = [page.replace(found_header, "") for page in pages]
-
-        # footer
-        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
-        found_footer = self._find_longest_common_ngram(end_of_pages)
-        if found_footer:
-            pages = [page.replace(found_footer, "") for page in pages]
-        return pages, found_header, found_footer
-
-    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
-        """
-        Return ngram (of tokens - currently splitted by whitespace)
-        :param seq: str, string from which the ngram shall be created
-        :param n: int, n of ngram
-        :return: str, ngram as string
-        """
-
-        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
-        # we add a space here and remove it after creation of the ngrams again (see below)
-        seq = seq.replace("\n", " \n")
-        seq = seq.replace("\t", " \t")
-
-        words = seq.split(" ")
-        ngrams = (
-            " ".join(words[i: i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
-        )
-
-        return ngrams
-
-    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
-        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
-        ngrams = map(partial(self._ngram, seq), lengths)
-        res = set(chain.from_iterable(ngrams))
-        return res
-
-    def _find_longest_common_ngram(self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3) -> Optional[str]:
-        """
-        Find the longest common ngram across different text sequences (e.g. start of pages).
-        Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
-
-        :param sequences: list[str], list of strings that shall be searched for common n_grams
-        :param max_ngram: int, maximum length of ngram to consider
-        :param min_ngram: minimum length of ngram to consider
-        :return: str, common string of all sections
-        """
-        sequences = [s for s in sequences if s]  # filter empty sequences
-        if not sequences:
-            return None
-        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
-        intersection = reduce(set.intersection, seqs_ngrams)
-
-        try:
-            longest = max(intersection, key=len)
-        except ValueError:
-            # no common sequence found
-            longest = ""
-        return longest if longest.strip() else None
-
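The header/footer heuristic deleted here is not lost; it moves into the new PreProcessor further down (haystack/preprocessor/preprocessor.py). As an illustration of the idea, the following standalone sketch finds the longest n-gram shared by the start of every page. It simplifies the original by normalising whitespace and hard-coding the n-gram range, so it is a toy, not the library code.

from functools import reduce
from itertools import chain

def ngrams(seq: str, n: int) -> set:
    words = seq.split()
    return {" ".join(words[i:i + n]) for i in range(len(words) - n + 1)}

def all_ngrams(seq: str, min_n: int = 3, max_n: int = 30) -> set:
    return set(chain.from_iterable(ngrams(seq, n) for n in range(min_n, max_n)))

def longest_common_ngram(sequences):
    sequences = [s for s in sequences if s]  # filter empty sequences
    if not sequences:
        return None
    common = reduce(set.intersection, (all_ngrams(s) for s in sequences))
    return max(common, key=len) if common else None

pages = [
    "ACME Corp Annual Report\nRevenue grew strongly in 2019.",
    "ACME Corp Annual Report\nCosts were kept flat compared to 2018.",
]
# Only the first 300 characters of each page are searched for a header
print(longest_common_ngram([p[:300] for p in pages]))  # -> ACME Corp Annual Report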
@@ -1,5 +1,4 @@
 import logging
-import re
 import subprocess
 from pathlib import Path
 from typing import List, Optional, Dict, Any
@@ -10,26 +9,13 @@ logger = logging.getLogger(__name__)


 class PDFToTextConverter(BaseConverter):
-    def __init__(
-        self,
-        remove_numeric_tables: Optional[bool] = False,
-        remove_whitespace: Optional[bool] = None,
-        remove_empty_lines: Optional[bool] = None,
-        remove_header_footer: Optional[bool] = None,
-        valid_languages: Optional[List[str]] = None,
-    ):
+    def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
             The tabular structures in documents might be noise for the reader model if it
             does not have table parsing capability for finding answers. However, tables
             may also have long strings that could possible candidate for searching answers.
             The rows containing strings are thus retained in this option.
-        :param remove_whitespace: strip whitespaces before or after each line in the text.
-        :param remove_empty_lines: remove more than two empty lines in the text.
-        :param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
-            for the longest common string. This heuristic uses exact matches and therefore
-            works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
-            or similar.
         :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
             (https://en.wikipedia.org/wiki/ISO_639-1) format.
             This option can be used to add test for encoding errors. If the extracted text is
@@ -52,13 +38,7 @@ class PDFToTextConverter(BaseConverter):
            """
        )

-        super().__init__(
-            remove_numeric_tables=remove_numeric_tables,
-            remove_whitespace=remove_whitespace,
-            remove_empty_lines=remove_empty_lines,
-            remove_header_footer=remove_header_footer,
-            valid_languages=valid_languages,
-        )
+        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

     def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:

@@ -87,17 +67,9 @@ class PDFToTextConverter(BaseConverter):
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

-                if self.remove_whitespace:
-                    line = line.strip()
-
                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

-            if self.remove_empty_lines:
-                page = re.sub(r"\n\n+", "\n\n", page)
-
            cleaned_pages.append(page)

        if self.valid_languages:
@@ -108,12 +80,6 @@ class PDFToTextConverter(BaseConverter):
                    f"been decoded in the correct text format."
                )

-        if self.remove_header_footer:
-            cleaned_pages, header, footer = self.find_and_remove_header_footer(
-                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
-            )
-            logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
-
        text = "\f".join(cleaned_pages)
        document = {"text": text, "meta": meta}
        return document
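The remove_numeric_tables option kept by this converter relies on the check in the hunk above: a line is dropped when more than 40% of its words contain a digit and the line does not end with a period. A standalone restatement of that check, with made-up sample lines:

def looks_like_numeric_table_row(line: str) -> bool:
    words = line.split()
    digits = [word for word in words if any(ch.isdigit() for ch in word)]
    # > 40% "numeric" words and no closing period -> probably a table row, not prose
    return bool(words) and len(digits) / len(words) > 0.4 and not line.strip().endswith(".")

print(looks_like_numeric_table_row("2017 2018 2019 revenue 1.2 3.4 5.6"))   # True  -> would be removed
print(looks_like_numeric_table_row("Revenue grew by 12 percent in 2019."))  # False -> kept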
@@ -1,8 +1,7 @@
 import logging
-import re
 from html.parser import HTMLParser
 from pathlib import Path
-from typing import List, Optional, Tuple, Dict, Any
+from typing import List, Optional, Dict, Any

 import requests
 from tika import parser as tikaparser
@@ -44,10 +43,7 @@ class TikaConverter(BaseConverter):
         self,
         tika_url: str = "http://localhost:9998/tika",
         remove_numeric_tables: Optional[bool] = False,
-        remove_whitespace: Optional[bool] = None,
-        remove_empty_lines: Optional[bool] = None,
-        remove_header_footer: Optional[bool] = None,
-        valid_languages: Optional[List[str]] = None,
+        valid_languages: Optional[List[str]] = None
     ):
         """
         :param tika_url: URL of the Tika server
@@ -56,12 +52,6 @@ class TikaConverter(BaseConverter):
             does not have table parsing capability for finding answers. However, tables
             may also have long strings that could possible candidate for searching answers.
             The rows containing strings are thus retained in this option.
-        :param remove_whitespace: strip whitespaces before or after each line in the text.
-        :param remove_empty_lines: remove more than two empty lines in the text.
-        :param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
-            for the longest common string. This heuristic uses exact matches and therefore
-            works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
-            or similar.
         :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
             (https://en.wikipedia.org/wiki/ISO_639-1) format.
             This option can be used to add test for encoding errors. If the extracted text is
@@ -73,13 +63,7 @@ class TikaConverter(BaseConverter):
            raise Exception(f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
                            f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'")
        self.tika_url = tika_url
-        super().__init__(
-            remove_numeric_tables=remove_numeric_tables,
-            remove_whitespace=remove_whitespace,
-            remove_empty_lines=remove_empty_lines,
-            remove_header_footer=remove_header_footer,
-            valid_languages=valid_languages,
-        )
+        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

     def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         """
@@ -106,16 +90,9 @@ class TikaConverter(BaseConverter):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

-                if self.remove_whitespace:
-                    line = line.strip()
-
                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

-            # always clean up empty lines:
-            page = re.sub(r"\n\n+", "\n\n", page)
-
            cleaned_pages.append(page)

        if self.valid_languages:
@@ -126,12 +103,6 @@ class TikaConverter(BaseConverter):
                    f"been decoded in the correct text format."
                )

-        if self.remove_header_footer:
-            cleaned_pages, header, footer = self.find_and_remove_header_footer(
-                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
-            )
-            logger.info(f"Removed header '{header}' and footer '{footer}' in {file_path}")
-
        text = "\f".join(cleaned_pages)
        document = {"text": text, "meta": {**parsed["metadata"], **(meta or {})}}
        return document
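For context, the TikaConverter above expects a running Tika server; its own error message suggests starting one with docker run -p 9998:9998 apache/tika:1.24.1. A minimal usage sketch; the module path haystack.file_converter.tika is inferred from the package layout visible in the test imports below, and the sample file is the one the tests use.

from pathlib import Path

from haystack.file_converter.tika import TikaConverter  # assumed module path

# Requires a local Tika server, e.g.: docker run -p 9998:9998 apache/tika:1.24.1
converter = TikaConverter(tika_url="http://localhost:9998/tika")
document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
print(document["text"][:200])
print(document["meta"])  # Tika metadata merged with the optional user-supplied meta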
@@ -1,5 +1,4 @@
 import logging
-import re
 from pathlib import Path
 from typing import List, Optional, Any, Dict

@@ -9,39 +8,22 @@ logger = logging.getLogger(__name__)


 class TextConverter(BaseConverter):
-    def __init__(
-        self,
-        remove_numeric_tables: Optional[bool] = False,
-        remove_whitespace: Optional[bool] = None,
-        remove_empty_lines: Optional[bool] = None,
-        remove_header_footer: Optional[bool] = None,
-        valid_languages: Optional[List[str]] = None,
-    ):
+    def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
             The tabular structures in documents might be noise for the reader model if it
             does not have table parsing capability for finding answers. However, tables
             may also have long strings that could possible candidate for searching answers.
             The rows containing strings are thus retained in this option.
-        :param remove_whitespace: strip whitespaces before or after each line in the text.
-        :param remove_empty_lines: remove more than two empty lines in the text.
-        :param remove_header_footer: use heuristic to remove footers and headers across different pages by searching
-            for the longest common string. This heuristic uses exact matches and therefore
-            works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
-            or similar.
         :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
             (https://en.wikipedia.org/wiki/ISO_639-1) format.
             This option can be used to add test for encoding errors. If the extracted text is
             not one of the valid languages, then it might likely be encoding error resulting
             in garbled text.
         """
-        super().__init__(
-            remove_numeric_tables=remove_numeric_tables,
-            remove_whitespace=remove_whitespace,
-            remove_empty_lines=remove_empty_lines,
-            remove_header_footer=remove_header_footer,
-            valid_languages=valid_languages,
-        )
+        super().__init__(remove_numeric_tables=remove_numeric_tables,
+                         valid_languages=valid_languages)

     def convert(self,
                 file_path: Path,
@@ -57,7 +39,7 @@ class TextConverter(BaseConverter):
         :return: Dict of format {"text": "The text from file", "meta": meta}}

         """
-        with open(file_path, encoding=encoding) as f:
+        with open(file_path, encoding=encoding, errors="ignore") as f:
             text = f.read()
         pages = text.split("\f")

@@ -75,16 +57,9 @@ class TextConverter(BaseConverter):
                        logger.debug(f"Removing line '{line}' from {file_path}")
                        continue

-                if self.remove_whitespace:
-                    line = line.strip()
-
                cleaned_lines.append(line)

            page = "\n".join(cleaned_lines)

-            if self.remove_empty_lines:
-                page = re.sub(r"\n\n+", "\n\n", page)
-
            cleaned_pages.append(page)

        if self.valid_languages:
@@ -95,12 +70,6 @@ class TextConverter(BaseConverter):
                    f"been decoded in the correct text format."
                )

-        if self.remove_header_footer:
-            cleaned_pages, header, footer = self.find_and_remove_header_footer(
-                cleaned_pages, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
-            )
-            logger.info(f"Removed header '{header}' and footer {footer} in {file_path}")
-
        text = "".join(pages)
        document = {"text": text, "meta": meta}
        return document
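One behavioural change in this converter, also called out in the commit message, is that the input file is now opened with errors="ignore", so undecodable bytes are skipped instead of raising UnicodeDecodeError. A small standalone illustration of that decoding mode (not Haystack-specific):

data = "Schr\u00f6dinger equation".encode("utf-8")  # contains a non-ASCII byte sequence

try:
    data.decode("ascii")  # strict decoding fails on the "ö"
except UnicodeDecodeError as err:
    print("strict decode failed:", err)

# errors="ignore" drops the offending bytes, which is what the converter now does while reading
print(data.decode("ascii", errors="ignore"))  # -> "Schrdinger equation"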
haystack/preprocessor/base.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+from typing import List, Dict, Any
+
+
+class BasePreProcessor:
+    def process(self, document: dict) -> List[dict]:
+        cleaned_document = self.clean(document)
+        split_documents = self.split(cleaned_document)
+        return split_documents
+
+    def clean(self, document: Dict[str, Any]) -> Dict[str, Any]:
+        raise NotImplementedError
+
+    def split(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
+        raise NotImplementedError
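This base class fixes the contract that the PreProcessor below implements: process() first calls clean() on a single document dict and then split() to fan it out into a list of document dicts. A hypothetical minimal subclass, only to show the expected shapes (not part of this commit):

from typing import Any, Dict, List

from haystack.preprocessor.base import BasePreProcessor


class LowercasePreProcessor(BasePreProcessor):
    """Illustrative only: lower-cases the text and splits on blank lines."""

    def clean(self, document: Dict[str, Any]) -> Dict[str, Any]:
        document["text"] = document["text"].lower()
        return document

    def split(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
        return [{"text": part, "meta": {}} for part in document["text"].split("\n\n")]


docs = LowercasePreProcessor().process({"text": "First passage.\n\nSecond Passage."})
print(docs)  # -> two lower-cased document dicts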
haystack/preprocessor/preprocessor.py (new file, 222 lines)
@@ -0,0 +1,222 @@
+import logging
+import re
+from copy import deepcopy
+from functools import partial, reduce
+from itertools import chain
+from typing import List, Optional, Generator, Set
+
+import nltk
+from more_itertools import windowed
+
+from haystack.preprocessor.base import BasePreProcessor
+
+logger = logging.getLogger(__name__)
+
+
+class PreProcessor(BasePreProcessor):
+    def __init__(
+        self,
+        clean_whitespace: Optional[bool] = True,
+        clean_header_footer: Optional[bool] = False,
+        clean_empty_lines: Optional[bool] = True,
+        split_by: Optional[str] = "passage",
+        split_length: Optional[int] = 10,
+        split_stride: Optional[int] = None,
+        split_respect_sentence_boundary: Optional[bool] = False,
+    ):
+        """
+        :param clean_header_footer: use heuristic to remove footers and headers across different pages by searching
+            for the longest common string. This heuristic uses exact matches and therefore
+            works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
+            or similar.
+        :param clean_whitespace: strip whitespaces before or after each line in the text.
+        :param clean_empty_lines: remove more than two empty lines in the text.
+        :param split_by: split the document by "word", "sentence", or "passage". Set to None to disable splitting.
+        :param split_length: n number of splits to merge as a single document. For instance, if n -> 10 & split_by ->
+            "sentence", then each output document will have 10 sentences.
+        :param split_stride: length of striding window over the splits. For example, if split_by -> `word`,
+            split_length -> 5 & split_stride -> 2, then the splits would be like:
+            [w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12].
+            Set the value to None to disable striding behaviour.
+        :param split_respect_sentence_boundary: whether to split in partial sentences when if split_by -> `word`. If set
+            to True, the individual split would always have complete sentence &
+            the number of words being less than or equal to the split_length.
+        """
+        nltk.download("punkt")
+        self.clean_whitespace = clean_whitespace
+        self.clean_header_footer = clean_header_footer
+        self.clean_empty_lines = clean_empty_lines
+        self.split_by = split_by
+        self.split_length = split_length
+        self.split_stride = split_stride
+        self.split_respect_sentence_boundary = split_respect_sentence_boundary
+
+    def clean(self, document: dict) -> dict:
+        text = document["text"]
+        if self.clean_header_footer:
+            text = self._find_and_remove_header_footer(
+                text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
+            )
+
+        if self.clean_whitespace:
+            lines = text.splitlines()
+
+            cleaned_lines = []
+            for line in lines:
+                line = line.strip()
+                cleaned_lines.append(line)
+            text = "\n".join(cleaned_lines)
+
+        if self.clean_empty_lines:
+            text = re.sub(r"\n\n+", "\n\n", text)
+
+        document["text"] = text
+        return document
+
+    def split(self, document: dict) -> List[dict]:
+        if not self.split_by:
+            return [document]
+
+        if not self.split_length:
+            raise Exception("split_length needs be set when using split_by.")
+
+        text = document["text"]
+
+        if self.split_respect_sentence_boundary:  # split by words ensuring no sub sentence splits
+            if self.split_by == "word":
+                sentences = nltk.tokenize.sent_tokenize(text)
+                word_count = 0
+                text_splits = []
+                current_slice = ""
+                for sen in sentences:
+                    current_word_count = len(sen.split(" "))
+                    if current_word_count > self.split_length:
+                        logger.warning(f"A sentence found with word count higher than the split length.")
+                    if word_count + current_word_count > self.split_length:
+                        text_splits.append(current_slice)
+                        current_slice = ""
+                        word_count = 0
+                    current_slice += sen
+                    word_count += len(sen.split(" "))
+                if current_slice:
+                    text_splits.append(current_slice)
+
+            else:
+                raise NotImplementedError(
+                    "'split_respect_sentence_boundary' parameter is only compatible with " "split_by='word'."
+                )
+        else:
+            # create individual "elements" of passage, sentence, or word
+            if self.split_by == "passage":
+                elements = text.split("\n\n")
+            elif self.split_by == "sentence":
+                elements = nltk.tokenize.sent_tokenize(text)
+            elif self.split_by == "word":
+                elements = text.split(" ")
+            else:
+                raise NotImplementedError("PreProcessor only supports 'passage' or 'sentence' split_by options.")
+
+            # concatenate individual elements based on split_length & split_stride
+            if self.split_stride:
+                segments = windowed(elements, n=self.split_length, step=self.split_length - self.split_stride)
+            else:
+                segments = windowed(elements, n=self.split_length, step=self.split_length)
+            text_splits = []
+            for seg in segments:
+                txt = " ".join([t for t in seg if t])
+                text_splits.append(txt)
+
+        # create new document dicts for each text split
+        documents = []
+        for i, txt in enumerate(text_splits):
+            doc = deepcopy(document)
+            doc["text"] = txt
+            if "meta" not in doc.keys():
+                doc["meta"] = {}
+            doc["meta"]["_split_id"] = i
+            documents.append(doc)
+
+        return documents
+
+    def _find_and_remove_header_footer(
+        self, text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
+    ) -> str:
+        """
+        Heuristic to find footers and headers across different pages by searching for the longest common string.
+        For headers we only search in the first n_chars characters (for footer: last n_chars).
+        Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
+        but won't detect "Page 3 of 4" or similar.
+
+        :param n_chars: number of first/last characters where the header/footer shall be searched in
+        :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
+        :param n_last_pages_to_ignore: number of last pages to ignore
+        :return: (cleaned pages, found_header_str, found_footer_str)
+        """
+
+        pages = text.split("\f")
+
+        # header
+        start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_header = self._find_longest_common_ngram(start_of_pages)
+        if found_header:
+            pages = [page.replace(found_header, "") for page in pages]
+
+        # footer
+        end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:-n_last_pages_to_ignore]]
+        found_footer = self._find_longest_common_ngram(end_of_pages)
+        if found_footer:
+            pages = [page.replace(found_footer, "") for page in pages]
+        logger.debug(f"Removed header '{found_header}' and footer {found_footer} in document")
+        text = "\f".join(pages)
+        return text
+
+    def _ngram(self, seq: str, n: int) -> Generator[str, None, None]:
+        """
+        Return ngram (of tokens - currently split by whitespace)
+        :param seq: str, string from which the ngram shall be created
+        :param n: int, n of ngram
+        :return: str, ngram as string
+        """
+
+        # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
+        # we add a space here and remove it after creation of the ngrams again (see below)
+        seq = seq.replace("\n", " \n")
+        seq = seq.replace("\t", " \t")
+
+        words = seq.split(" ")
+        ngrams = (
+            " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
+        )
+
+        return ngrams
+
+    def _allngram(self, seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
+        lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
+        ngrams = map(partial(self._ngram, seq), lengths)
+        res = set(chain.from_iterable(ngrams))
+        return res
+
+    def _find_longest_common_ngram(
+        self, sequences: List[str], max_ngram: int = 30, min_ngram: int = 3
+    ) -> Optional[str]:
+        """
+        Find the longest common ngram across different text sequences (e.g. start of pages).
+        Considering all ngrams between the specified range. Helpful for finding footers, headers etc.
+
+        :param sequences: list[str], list of strings that shall be searched for common n_grams
+        :param max_ngram: int, maximum length of ngram to consider
+        :param min_ngram: minimum length of ngram to consider
+        :return: str, common string of all sections
+        """
+        sequences = [s for s in sequences if s]  # filter empty sequences
+        if not sequences:
+            return None
+        seqs_ngrams = map(partial(self._allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
+        intersection = reduce(set.intersection, seqs_ngrams)
+
+        try:
+            longest = max(intersection, key=len)
+        except ValueError:
+            # no common sequence found
+            longest = ""
+        return longest if longest.strip() else None
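The striding logic in split() above is just more_itertools.windowed with step = split_length - split_stride, so consecutive splits overlap by split_stride elements; the None fill values windowed may add at the end are filtered out when joining. A toy sketch of the window arithmetic:

from more_itertools import windowed

words = ["w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8"]
split_length, split_stride = 5, 2

# step = split_length - split_stride, so each window starts 3 words after the previous one
for window in windowed(words, n=split_length, step=split_length - split_stride):
    print(" ".join(w for w in window if w))  # drop None fill values, as split() does
# w1 w2 w3 w4 w5
# w4 w5 w6 w7 w8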
@@ -153,7 +153,7 @@ def tika_convert_files_to_dicts(

    :return: None
    """
-    converter = TikaConverter(remove_header_footer=True)
+    converter = TikaConverter()
    paths = [p for p in Path(dir_path).glob("**/*")]
    allowed_suffixes = [".pdf", ".txt"]
    file_paths: List[Path] = []
@@ -20,3 +20,5 @@ faiss-cpu; sys_platform != 'win32' and sys_platform != 'cygwin'
 tika
 uvloop; sys_platform != 'win32' and sys_platform != 'cygwin'
 httptools
+nltk
+more_itertools
@@ -40,21 +40,3 @@ def test_language_validation(Converter, xpdf_fixture, caplog):
    assert "The language for samples/pdf/sample_pdf_1.pdf is not one of ['de']." in caplog.text


-@pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_header_footer_removal(Converter, xpdf_fixture):
-    converter = Converter(remove_header_footer=True)
-    converter_no_removal = Converter(remove_header_footer=False)
-
-    document1 = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
-    document2 = converter_no_removal.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))  # file contains no header/footer
-    pages1 = document1["text"].split("\f")
-    pages2 = document2["text"].split("\f")
-    for p1, p2 in zip(pages1, pages2):
-        assert p2 == p2
-
-    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header and footer
-    pages = document["text"].split("\f")
-    assert len(pages) == 4
-    for page in pages:
-        assert "This is a header." not in page
-        assert "footer" not in page
test/test_preprocessor.py (new file, 66 lines)
@@ -0,0 +1,66 @@
+from pathlib import Path
+
+from haystack.file_converter.pdf import PDFToTextConverter
+from haystack.preprocessor.preprocessor import PreProcessor
+
+TEXT = """
+This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
+paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.
+
+This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in
+paragraph_2. This is a sample sentence in paragraph_2. This is a sample sentence in paragraph_2.
+
+This is a sample sentence in paragraph_3. This is a sample sentence in paragraph_3. This is a sample sentence in
+paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test with using an abbreviation like Dr.
+in the sentence.
+"""
+
+
+def test_preprocess_sentence_split():
+    document = {"text": TEXT}
+    preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="sentence")
+    documents = preprocessor.process(document)
+    assert len(documents) == 15
+
+    preprocessor = PreProcessor(
+        split_length=10, split_stride=0, split_by="sentence"
+    )
+    documents = preprocessor.process(document)
+    assert len(documents) == 2
+
+
+def test_preprocess_word_split():
+    document = {"text": TEXT}
+    preprocessor = PreProcessor(split_length=10, split_stride=0, split_by="word", split_respect_sentence_boundary=False)
+    documents = preprocessor.process(document)
+    assert len(documents) == 11
+
+    preprocessor = PreProcessor(split_length=10, split_stride=0, split_by="word", split_respect_sentence_boundary=True)
+    documents = preprocessor.process(document)
+    for doc in documents:
+        assert len(doc["text"].split(" ")) <= 10 or doc["text"].startswith("This is to trick")
+    assert len(documents) == 15
+
+
+def test_preprocess_passage_split():
+    document = {"text": TEXT}
+    preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="passage")
+    documents = preprocessor.process(document)
+    assert len(documents) == 3
+
+    preprocessor = PreProcessor(split_length=2, split_stride=0, split_by="passage")
+    documents = preprocessor.process(document)
+    assert len(documents) == 2
+
+
+def test_clean_header_footer():
+    converter = PDFToTextConverter()
+    document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf"))  # file contains header/footer
+
+    preprocessor = PreProcessor(clean_header_footer=True, split_by=None)
+    documents = preprocessor.process(document)
+
+    assert len(documents) == 1
+
+    assert "This is a header." not in documents[0]["text"]
+    assert "footer" not in documents[0]["text"]