Add support for indexing pipelines (#816)

Tanay Soni 2021-02-16 16:24:28 +01:00 committed by GitHub
parent 7030c94325
commit 07907f9eac
15 changed files with 433 additions and 110 deletions

View File

@ -5,7 +5,7 @@
## BaseConverter Objects
```python
class BaseConverter()
class BaseConverter(BaseComponent)
```
Base class for implementing file converters that transform input documents into text format for ingestion into a DocumentStore.
@ -14,7 +14,7 @@ Base class for implementing file converts to transform input documents to text f
#### \_\_init\_\_
```python
| __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None)
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -35,7 +35,7 @@ in garbled text.
```python
| @abstractmethod
| convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
```
Convert a file to a dictionary containing the text and any associated meta data.
@ -47,6 +47,16 @@ supplied meta data like author, url, external IDs can be supplied as a dictionar
- `file_path`: path of the file to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
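When these arguments are left as `None`, the converter falls back to the values given at construction time. A minimal sketch of that behaviour, using `TextConverter` as a concrete subclass (the import path and file path are assumptions for illustration):

```python
from pathlib import Path

from haystack.file_converter.txt import TextConverter

# Instance-level defaults apply to every call ...
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc = converter.convert(file_path=Path("data/report.txt"))

# ... unless they are overridden for a single call.
doc = converter.convert(file_path=Path("data/report.txt"), remove_numeric_tables=False)
```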
<a name="base.BaseConverter.validate_language"></a>
#### validate\_language
@ -71,7 +81,7 @@ class TextConverter(BaseConverter)
#### \_\_init\_\_
```python
| __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -91,16 +101,25 @@ in garbled text.
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
```
Reads text from a txt file and executes optional preprocessing steps.
**Arguments**:
- `file_path`: Path of the file to convert
- `meta`: Optional meta data that should be associated with the document (e.g. name)
- `encoding`: Encoding of the file
- `file_path`: path of the file to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
**Returns**:
@ -120,7 +139,7 @@ class DocxToTextConverter(BaseConverter)
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
```
Extract text from a .docx file.
@ -130,6 +149,17 @@ For compliance with other converters we nevertheless opted for keeping the metho
**Arguments**:
- `file_path`: Path to the .docx file you want to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
<a name="tika"></a>
# Module tika
@ -145,7 +175,7 @@ class TikaConverter(BaseConverter)
#### \_\_init\_\_
```python
| __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
| __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -166,12 +196,23 @@ in garbled text.
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
```
**Arguments**:
- `file_path`: Path of file to be converted.
- `file_path`: path of the file to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
**Returns**:
@ -191,7 +232,7 @@ class PDFToTextConverter(BaseConverter)
#### \_\_init\_\_
```python
| __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -211,7 +252,7 @@ in garbled text.
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
```
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@ -221,6 +262,16 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade
- `file_path`: Path to the .pdf file you want to convert
- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
- `encoding`: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
of pdftotext. While this works well on many PDFs, you might need to switch to "UTF-8" or
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
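A hedged example of switching the encoding for a single conversion (the import path and file path are assumptions for illustration):

```python
from pathlib import Path

from haystack.file_converter.pdf import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=False, valid_languages=["de"])
# Pass -enc UTF-8 to pdftotext for documents with umlauts, Cyrillic characters etc.
doc = converter.convert(file_path=Path("data/bericht.pdf"), encoding="UTF-8")
```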

View File

@ -5,14 +5,14 @@
## BasePreProcessor Objects
```python
class BasePreProcessor()
class BasePreProcessor(BaseComponent)
```
<a name="base.BasePreProcessor.process"></a>
#### process
```python
| process(document: dict) -> List[dict]
| process(document: dict, clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
```
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
@ -31,7 +31,7 @@ class PreProcessor(BasePreProcessor)
#### \_\_init\_\_
```python
| __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True)
| __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True)
```
**Arguments**:
@ -50,16 +50,25 @@ Setting this to a positive number essentially enables the sliding window approac
For example, if split_by -> `word`,
split_length -> 5 & split_overlap -> 2, then the splits would be like:
[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
Set the value to None to ensure there is no overlap among the documents after splitting.
Set the value to 0 to ensure there is no overlap among the documents after splitting.
- `split_respect_sentence_boundary`: Whether to avoid splitting the text mid-sentence when split_by -> `word`. If set
to True, each split will always contain complete sentences &
the number of words will be <= split_length.
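A short sketch of the sliding-window splitting described above (a minimal example, assuming PreProcessor is importable from `haystack.preprocessor.preprocessor`; the document content is illustrative):

```python
from haystack.preprocessor.preprocessor import PreProcessor

preprocessor = PreProcessor(
    split_by="word",
    split_length=5,
    split_overlap=2,
    split_respect_sentence_boundary=False,  # plain word windows, as in the example above
)
docs = preprocessor.process({"text": "w1 w2 w3 w4 w5 w6 w7 w8 w9 w10 w11", "meta": {}})
print([d["text"] for d in docs])
# -> ['w1 w2 w3 w4 w5', 'w4 w5 w6 w7 w8', 'w7 w8 w9 w10 w11']
```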
<a name="preprocessor.PreProcessor.process"></a>
#### process
```python
| process(document: dict, clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
```
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
<a name="preprocessor.PreProcessor.clean"></a>
#### clean
```python
| clean(document: dict) -> dict
| clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict
```
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
@ -69,7 +78,7 @@ and empty lines. Its exact functionality is defined by the parameters passed int
#### split
```python
| split(document: dict) -> List[dict]
| split(document: dict, split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool) -> List[dict]
```
Perform document splitting on a single document. This method can split on different units, at different lengths,

View File

@ -206,5 +206,6 @@ class BaseDocumentStore(BaseComponent):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
pass
def run(self, **kwargs):
raise NotImplementedError
def run(self, documents: List[dict], index: Optional[str] = None, **kwargs):
self.write_documents(documents=documents, index=index)
return kwargs, "output_1"

View File

@ -4,13 +4,17 @@ from typing import List, Optional, Dict, Any
import langdetect
from haystack import BaseComponent
class BaseConverter:
class BaseConverter(BaseComponent):
"""
Base class for implementing file converters that transform input documents into text format for ingestion into a DocumentStore.
"""
def __init__(self, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None):
outgoing_edges = 1
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -27,7 +31,13 @@ class BaseConverter:
self.valid_languages = valid_languages
@abstractmethod
def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]],
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
@ -36,6 +46,16 @@ class BaseConverter:
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
"""
pass
@ -56,4 +76,20 @@ class BaseConverter:
else:
return False
def run(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
**kwargs
):
document = self.convert(
file_path=file_path,
meta=meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
)
result = {"document": document, **kwargs}
return result, "output_1"
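The new run() wrapper is what turns a converter into a pipeline node: it calls convert() and forwards the resulting document, together with any extra kwargs, along the "output_1" edge. A hedged sketch of calling it directly (the concrete subclass and file path are illustrative):

```python
from pathlib import Path

from haystack.file_converter.txt import TextConverter

converter = TextConverter(valid_languages=["en"])
output, edge = converter.run(file_path=Path("data/sample.txt"), meta={"name": "sample.txt"})

print(edge)                        # "output_1"
print(output["document"]["text"])  # converted text; meta sits under output["document"]["meta"]
```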

View File

@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Dict, Optional, Any
from typing import Dict, Optional, Any, List
import docx
@ -10,14 +10,39 @@ logger = logging.getLogger(__name__)
class DocxToTextConverter(BaseConverter):
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Extract text from a .docx file.
Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
For consistency with the other converters we nevertheless opted to keep the method's name.
:param file_path: Path to the .docx file you want to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
"""
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
if remove_numeric_tables is True:
raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
if valid_languages:
raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")
file = docx.Document(file_path) # Creating word reader object.
paragraphs = [para.text for para in file.paragraphs]

View File

@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
class PDFToTextConverter(BaseConverter):
def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -40,13 +40,30 @@ class PDFToTextConverter(BaseConverter):
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: str = "Latin1",
) -> Dict[str, Any]:
"""
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
:param file_path: Path to the .pdf file you want to convert
:param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
:param encoding: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
of pdftotext. While this works well on many PDFs, you might need to switch to "UTF-8" or
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
@ -56,6 +73,10 @@ class PDFToTextConverter(BaseConverter):
"""
pages = self._read_pdf(file_path, layout=False, encoding=encoding)
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
cleaned_pages = []
for page in pages:
@ -76,7 +97,7 @@ class PDFToTextConverter(BaseConverter):
digits = [word for word in words if any(i.isdigit() for i in word)]
# remove lines having > 40% of words as digits AND not ending with a period(.)
if self.remove_numeric_tables:
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
@ -85,7 +106,7 @@ class PDFToTextConverter(BaseConverter):
page = "\n".join(cleaned_lines)
cleaned_pages.append(page)
if self.valid_languages:
if valid_languages:
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text):
logger.warning(

View File

@ -42,7 +42,7 @@ class TikaConverter(BaseConverter):
def __init__(
self,
tika_url: str = "http://localhost:9998/tika",
remove_numeric_tables: Optional[bool] = False,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None
):
"""
@ -65,12 +65,34 @@ class TikaConverter(BaseConverter):
self.tika_url = tika_url
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
:param file_path: Path of file to be converted.
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
:return: a list of pages and the extracted meta data of the file.
"""
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
parser = TikaXHTMLParser()
parser.feed(parsed["content"])
@ -85,7 +107,7 @@ class TikaConverter(BaseConverter):
digits = [word for word in words if any(i.isdigit() for i in word)]
# remove lines having > 40% of words as digits AND not ending with a period(.)
if self.remove_numeric_tables:
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
@ -95,11 +117,11 @@ class TikaConverter(BaseConverter):
page = "\n".join(cleaned_lines)
cleaned_pages.append(page)
if self.valid_languages:
if valid_languages:
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text):
logger.warning(
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
)

View File

@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
class TextConverter(BaseConverter):
def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -22,23 +22,40 @@ class TextConverter(BaseConverter):
in garbled text.
"""
super().__init__(remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages)
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
def convert(self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
encoding: str = "utf-8") -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: str = "utf-8",
) -> Dict[str, Any]:
"""
Reads text from a txt file and executes optional preprocessing steps.
:param file_path: Path of the file to convert
:param meta: Optional meta data that should be associated with the document (e.g. name)
:param encoding: Encoding of the file
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
:return: Dict of format {"text": "The text from file", "meta": meta}}
"""
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
with open(file_path, encoding=encoding, errors="ignore") as f:
text = f.read()
pages = text.split("\f")
@ -52,7 +69,7 @@ class TextConverter(BaseConverter):
digits = [word for word in words if any(i.isdigit() for i in word)]
# remove lines having > 40% of words as digits AND not ending with a period(.)
if self.remove_numeric_tables:
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
@ -62,7 +79,7 @@ class TextConverter(BaseConverter):
page = "\n".join(cleaned_lines)
cleaned_pages.append(page)
if self.valid_languages:
if valid_languages:
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text):
logger.warning(

View File

@ -26,10 +26,18 @@ class Pipeline(ABC):
Reader from multiple Retrievers, or re-ranking of candidate documents.
"""
def __init__(self):
def __init__(self, pipeline_type: str = "Query"):
self.graph = DiGraph()
self.root_node_id = "Query"
self.graph.add_node("Query", component=QueryNode())
if pipeline_type == "Query":
self.root_node_id = "Query"
self.graph.add_node("Query", component=RootNode())
elif pipeline_type == "Indexing":
self.root_node_id = "File"
self.graph.add_node("File", component=RootNode())
else:
raise Exception(f"pipeline_type '{pipeline_type}' is not valid. Supported types are 'Query' & 'Indexing'.")
self.pipeline_type = pipeline_type
self.components: dict = {}
def add_node(self, component, name: str, inputs: List[str]):
@ -49,6 +57,10 @@ class Pipeline(ABC):
"""
self.graph.add_node(name, component=component, inputs=inputs)
if len(self.graph.nodes) == 2: # first node added; connect with Root
self.graph.add_edge(self.root_node_id, name, label="output_1")
return
for i in inputs:
if "." in i:
[input_node_name, input_edge_name] = i.split(".")
@ -89,7 +101,7 @@ class Pipeline(ABC):
def run(self, **kwargs):
has_next_node = True
current_node_id = self.root_node_id
input_dict = kwargs
input_dict = {"pipeline_type": self.pipeline_type, **kwargs}
output_dict = None
while has_next_node:
@ -207,14 +219,13 @@ class Pipeline(ABC):
name = definition.pop("name")
definitions[name] = definition
pipeline = cls()
pipeline = cls(pipeline_type=pipeline_config["type"])
components: dict = {} # instances of component objects.
for node_config in pipeline_config["nodes"]:
name = node_config["name"]
component = cls._load_or_get_component(name=name, definitions=definitions, components=components)
if "DocumentStore" not in definitions[name]["type"]: # DocumentStore is not an explicit node in a Pipeline
pipeline.add_node(component=component, name=node_config["name"], inputs=node_config["inputs"])
pipeline.add_node(component=component, name=node_config["name"], inputs=node_config.get("inputs", []))
return pipeline
@ -499,7 +510,7 @@ class TranslationWrapperPipeline(BaseStandardPipeline):
return output
class QueryNode:
class RootNode:
outgoing_edges = 1
def run(self, **kwargs):

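With these changes an indexing pipeline can be assembled programmatically as well as loaded from YAML. A minimal sketch mirroring the test YAML further down; component choices, import paths and the sample file are illustrative assumptions, and a running Elasticsearch instance is assumed:

```python
from pathlib import Path

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.pipeline import Pipeline
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = ElasticsearchDocumentStore(index="documents")
retriever = ElasticsearchRetriever(document_store=document_store)

# "Indexing" pipelines start from the "File" root node instead of "Query".
pipeline = Pipeline(pipeline_type="Indexing")
pipeline.add_node(component=PDFToTextConverter(), name="PDFConverter", inputs=["File"])
pipeline.add_node(component=PreProcessor(), name="PreProcessor", inputs=["PDFConverter"])
pipeline.add_node(component=retriever, name="Retriever", inputs=["PreProcessor"])
pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])

# The converter reads the file, the preprocessor cleans and splits it, and the
# document store node writes the resulting documents via write_documents().
pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
```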
View File

@ -1,17 +1,44 @@
from typing import List, Dict, Any
from typing import List, Dict, Any, Optional
from haystack import BaseComponent
class BasePreProcessor:
def process(self, document: dict) -> List[dict]:
class BasePreProcessor(BaseComponent):
outgoing_edges = 1
def process(
self,
document: dict,
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
split_by: Optional[str] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
) -> List[dict]:
"""
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
"""
cleaned_document = self.clean(document)
split_documents = self.split(cleaned_document)
return split_documents
def clean(self, document: Dict[str, Any]) -> Dict[str, Any]:
raise NotImplementedError
def split(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
def clean(
self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool,
) -> Dict[str, Any]:
raise NotImplementedError
def split(
self,
document: dict,
split_by: str,
split_length: int,
split_overlap: int,
split_respect_sentence_boundary: bool,
) -> List[Dict[str, Any]]:
raise NotImplementedError
def run(self, document: dict, **kwargs):
documents = self.process(document)
result = {"documents": documents, **kwargs}
return result, "output_1"

View File

@ -16,13 +16,13 @@ logger = logging.getLogger(__name__)
class PreProcessor(BasePreProcessor):
def __init__(
self,
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
split_by: Optional[str] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
clean_whitespace: bool = True,
clean_header_footer: bool = False,
clean_empty_lines: bool = True,
split_by: str = "word",
split_length: int = 1000,
split_overlap: int = 0,
split_respect_sentence_boundary: bool = True,
):
"""
:param clean_header_footer: Use heuristic to remove footers and headers across different pages by searching
@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor):
For example, if split_by -> `word`,
split_length -> 5 & split_overlap -> 2, then the splits would be like:
[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
Set the value to None to ensure there is no overlap among the documents after splitting.
Set the value to 0 to ensure there is no overlap among the documents after splitting.
:param split_respect_sentence_boundary: Whether to avoid splitting the text mid-sentence when split_by -> `word`. If set
to True, each split will always contain complete sentences &
the number of words will be <= split_length.
@ -53,18 +53,68 @@ class PreProcessor(BasePreProcessor):
self.split_overlap = split_overlap
self.split_respect_sentence_boundary = split_respect_sentence_boundary
def clean(self, document: dict) -> dict:
def process(
self,
document: dict,
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
) -> List[dict]:
"""
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
"""
if clean_whitespace is None:
clean_whitespace = self.clean_whitespace
if clean_header_footer is None:
clean_header_footer = self.clean_header_footer
if clean_empty_lines is None:
clean_empty_lines = self.clean_empty_lines
if split_by is None:
split_by = self.split_by
if split_length is None:
split_length = self.split_length
if split_overlap is None:
split_overlap = self.split_overlap
if split_respect_sentence_boundary is None:
split_respect_sentence_boundary = self.split_respect_sentence_boundary
cleaned_document = self.clean(
document=document,
clean_whitespace=clean_whitespace,
clean_header_footer=clean_header_footer,
clean_empty_lines=clean_empty_lines,
)
split_documents = self.split(
document=cleaned_document,
split_by=split_by,
split_length=split_length,
split_overlap=split_overlap,
split_respect_sentence_boundary=split_respect_sentence_boundary,
)
return split_documents
def clean(
self,
document: dict,
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
) -> dict:
"""
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
"""
text = document["text"]
if self.clean_header_footer:
if clean_header_footer:
text = self._find_and_remove_header_footer(
text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
if self.clean_whitespace:
if clean_whitespace:
lines = text.splitlines()
cleaned_lines = []
@ -73,30 +123,37 @@ class PreProcessor(BasePreProcessor):
cleaned_lines.append(line)
text = "\n".join(cleaned_lines)
if self.clean_empty_lines:
if clean_empty_lines:
text = re.sub(r"\n\n+", "\n\n", text)
document["text"] = text
return document
def split(self, document: dict) -> List[dict]:
def split(
self,
document: dict,
split_by: str,
split_length: int,
split_overlap: int,
split_respect_sentence_boundary: bool,
) -> List[dict]:
"""Perform document splitting on a single document. This method can split on different units, at different lengths,
with different strides. It can also respect sentence boundaries. Its exact functionality is defined by
the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents. """
if not self.split_by:
if not split_by:
return [document]
if not self.split_length:
if not split_length:
raise Exception("split_length needs be set when using split_by.")
if self.split_respect_sentence_boundary and self.split_by not in("word","sentence"):
if split_respect_sentence_boundary and split_by not in("word","sentence"):
raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with"
" split_by='word' or split_by='sentence'.")
text = document["text"]
if self.split_respect_sentence_boundary and self.split_by == "word":
if split_respect_sentence_boundary and split_by == "word":
# split by words ensuring no sub sentence splits
sentences = nltk.tokenize.sent_tokenize(text)
word_count = 0
@ -104,17 +161,17 @@ class PreProcessor(BasePreProcessor):
current_slice: List[str] = []
for sen in sentences:
current_word_count = len(sen.split(" "))
if current_word_count > self.split_length:
if current_word_count > split_length:
logger.warning(f"A sentence found with word count higher than the split length.")
if word_count + current_word_count > self.split_length:
if word_count + current_word_count > split_length:
list_splits.append(current_slice)
#Enable split_stride with split_by='word' while respecting sentence boundaries.
if self.split_overlap:
# Enable split_stride with split_by='word' while respecting sentence boundaries.
if split_overlap:
overlap = []
w_count = 0
for s in current_slice[::-1]:
sen_len = len(s.split(" "))
if w_count < self.split_overlap:
if w_count < split_overlap:
overlap.append(s)
w_count += sen_len
else:
@ -136,20 +193,20 @@ class PreProcessor(BasePreProcessor):
text_splits.append(txt)
else:
# create individual "elements" of passage, sentence, or word
if self.split_by == "passage":
if split_by == "passage":
elements = text.split("\n\n")
elif self.split_by == "sentence":
elif split_by == "sentence":
elements = nltk.tokenize.sent_tokenize(text)
elif self.split_by == "word":
elif split_by == "word":
elements = text.split(" ")
else:
raise NotImplementedError("PreProcessor only supports 'passage', 'sentence' or 'word' split_by options.")
# concatenate individual elements based on split_length & split_stride
if self.split_overlap:
segments = windowed(elements, n=self.split_length, step=self.split_length - self.split_overlap)
if split_overlap:
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
else:
segments = windowed(elements, n=self.split_length, step=self.split_length)
segments = windowed(elements, n=split_length, step=split_length)
text_splits = []
for seg in segments:
txt = " ".join([t for t in seg if t])

View File

@ -4,7 +4,7 @@ import logging
from time import perf_counter
from functools import wraps
from tqdm import tqdm
from copy import deepcopy
from haystack import Document, BaseComponent
from haystack.document_store.base import BaseDocumentStore
@ -168,12 +168,21 @@ class BaseRetriever(BaseComponent):
else:
return metrics
def run(
self,
query: str,
filters: Optional[dict] = None,
top_k_retriever: Optional[int] = None,
**kwargs,
def run(self, pipeline_type: str, **kwargs):
if pipeline_type == "Query":
output, stream = self.run_query(**kwargs)
elif pipeline_type == "Indexing":
output, stream = self.run_indexing(**kwargs)
else:
raise Exception(f"Invalid pipeline_type '{pipeline_type}'.")
return output, stream
def run_query(
self,
query: str,
filters: Optional[dict] = None,
top_k_retriever: Optional[int] = None,
**kwargs,
):
if top_k_retriever:
documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever)
@ -188,3 +197,14 @@ class BaseRetriever(BaseComponent):
}
return output, "output_1"
def run_indexing(self, documents: List[dict], **kwargs):
if self.__class__.__name__ in ["DensePassageRetriever", "EmbeddingRetriever"]:
documents = deepcopy(documents)
document_objects = [Document.from_dict(doc) for doc in documents]
embeddings = self.embed_passages(document_objects) # type: ignore
for doc, emb in zip(documents, embeddings):
doc["embedding"] = emb
output = {**kwargs, "documents": documents}
return output, "output_1"
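In the indexing branch, dense retrievers enrich documents with embeddings before they reach the document store. A hedged sketch of that behaviour outside a pipeline (the FAISS store and the retriever's default models are illustrative assumptions):

```python
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.retriever.dense import DensePassageRetriever

document_store = FAISSDocumentStore()
retriever = DensePassageRetriever(document_store=document_store)

docs = [{"text": "Berlin is the capital of Germany.", "meta": {"name": "berlin.txt"}}]

# run_indexing() deep-copies the dicts, calls embed_passages(), and attaches an
# "embedding" to each document before emitting them on the "output_1" edge.
output, edge = retriever.run(pipeline_type="Indexing", documents=docs)
document_store.write_documents(output["documents"])
```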

View File

@ -14,12 +14,34 @@ components:
- name: TestDocumentStore
type: ElasticsearchDocumentStore
params:
index: haystack_test
index: haystack_test_pipeline
- name: TestPDFConverter
type: PDFToTextConverter
params:
remove_numeric_tables: false
- name: TestPreprocessor
type: PreProcessor
params:
clean_whitespace: true
pipelines:
- name: test_query_pipeline
type: Query
nodes:
- name: TestESRetriever
inputs: [Query]
- name: TestReader
inputs: [TestESRetriever]
inputs: [TestESRetriever]
- name: test_indexing_pipeline
type: Indexing
nodes:
- name: TestPDFConverter
inputs: [File]
- name: TestPreprocessor
inputs: [TestPDFConverter]
- name: TestESRetriever
inputs: [TestPreprocessor]
- name: TestDocumentStore
inputs: [TestESRetriever]

View File

@ -11,12 +11,16 @@ from haystack.retriever.sparse import ElasticsearchRetriever
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_load_yaml(document_store_with_docs):
# test correct load of indexing pipeline from yaml
pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"),
pipeline_name="test_indexing_pipeline")
pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"), top_k_retriever=10, top_k_reader=3)
# # test correct load from yaml
pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml", pipeline_name="my_query"))
prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3)
assert prediction["query"] == "Who lives in Berlin?"
assert prediction["answers"][0]["answer"] == "Carla"
# test correct load of query pipeline from yaml
pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"), pipeline_name="test_query_pipeline")
prediction = pipeline.run(query="Who made the PDF specification?", top_k_retriever=10, top_k_reader=3)
assert prediction["query"] == "Who made the PDF specification?"
assert prediction["answers"][0]["answer"] == "Adobe Systems"
# test invalid pipeline name
with pytest.raises(Exception):

View File

@ -93,7 +93,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture):
"multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
"filter": [{"terms": {"year": ${years}}}]}}}""",
)
results = retriever.run(query="test", filters={"years": ["2020", "2021"]})[0]["documents"]
results = retriever.retrieve(query="test", filters={"years": ["2020", "2021"]})
assert len(results) == 4
# test custom "term" query
@ -108,7 +108,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture):
"multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
"filter": [{"term": {"year": ${years}}}]}}}""",
)
results = retriever.run(query="test", filters={"years": "2021"})[0]["documents"]
results = retriever.retrieve(query="test", filters={"years": "2021"})
assert len(results) == 3