mirror of https://github.com/deepset-ai/haystack.git
synced 2025-11-03 19:29:32 +00:00

Add support for indexing pipelines (#816)

This commit is contained in:
parent 7030c94325
commit 07907f9eac
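The change set below touches the converters, the preprocessor, the document stores, `Pipeline`, and the retrievers so that each can act as a node in a new "Indexing" pipeline: `Pipeline` gains a `pipeline_type` argument, and each component gains a `run()` method that consumes its input from the previous node and forwards the remaining kwargs. A minimal sketch of what this enables (component choices, import paths, and file names here are illustrative assumptions, not part of the diff):

```python
from pathlib import Path

from haystack.pipeline import Pipeline
from haystack.file_converter.txt import TextConverter
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# "Indexing" makes the root node "File" instead of "Query" (see the Pipeline changes below).
pipeline = Pipeline(pipeline_type="Indexing")

# The first node added is auto-connected to the root node.
pipeline.add_node(component=TextConverter(), name="Converter", inputs=["File"])
pipeline.add_node(component=PreProcessor(), name="PreProcessor", inputs=["Converter"])
pipeline.add_node(component=ElasticsearchDocumentStore(), name="DocumentStore", inputs=["PreProcessor"])

# Converter.run() emits {"document": ...}; PreProcessor.run() emits {"documents": [...]};
# the DocumentStore node then persists the documents via write_documents().
pipeline.run(file_path=Path("my_doc.txt"), meta={"name": "my_doc.txt"})
```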
@@ -5,7 +5,7 @@
 ## BaseConverter Objects

 ```python
-class BaseConverter()
+class BaseConverter(BaseComponent)
 ```

 Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore.

@@ -14,7 +14,7 @@ Base class for implementing file converts to transform input documents to text f
 #### \_\_init\_\_

 ```python
- | __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None)
+ | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
 ```

 **Arguments**:

@@ -35,7 +35,7 @@ in garbled text.
 ```python
  | @abstractmethod
- | convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
 ```

 Convert a file to a dictionary containing the text and any associated meta data.

@@ -47,6 +47,16 @@ supplied meta data like author, url, external IDs can be supplied as a dictionar
 - `file_path`: path of the file to convert
 - `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could possible candidate for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be encoding error resulting
+in garbled text.

 <a name="base.BaseConverter.validate_language"></a>
 #### validate\_language

@@ -71,7 +81,7 @@ class TextConverter(BaseConverter)
 #### \_\_init\_\_

 ```python
- | __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
+ | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
 ```

 **Arguments**:

@@ -91,16 +101,25 @@ in garbled text.
 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
 ```

 Reads text from a txt file and executes optional preprocessing steps.

 **Arguments**:

-- `file_path`: Path of the file to convert
-- `meta`: Optional meta data that should be associated with the the document (e.g. name)
-- `encoding`: Encoding of the file
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could possible candidate for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be encoding error resulting
+in garbled text.

 **Returns**:

@@ -120,7 +139,7 @@ class DocxToTextConverter(BaseConverter)
 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
 ```

 Extract text from a .docx file.

@@ -130,6 +149,17 @@ For compliance with other converters we nevertheless opted for keeping the metho
 **Arguments**:

 - `file_path`: Path to the .docx file you want to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could possible candidate for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be encoding error resulting
+in garbled text.

 <a name="tika"></a>
 # Module tika

@@ -145,7 +175,7 @@ class TikaConverter(BaseConverter)
 #### \_\_init\_\_

 ```python
- | __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
+ | __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
 ```

 **Arguments**:

@@ -166,12 +196,23 @@ in garbled text.
 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
 ```

 **Arguments**:

-- `file_path`: Path of file to be converted.
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could possible candidate for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be encoding error resulting
+in garbled text.

 **Returns**:

@@ -191,7 +232,7 @@ class PDFToTextConverter(BaseConverter)
 #### \_\_init\_\_

 ```python
- | __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
+ | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
 ```

 **Arguments**:

@@ -211,7 +252,7 @@ in garbled text.
 #### convert

 ```python
- | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
+ | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
 ```

 Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)

@@ -221,6 +262,16 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade
 - `file_path`: Path to the .pdf file you want to convert
 - `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
 Can be any custom keys and values.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could possible candidate for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be encoding error resulting
+in garbled text.
 - `encoding`: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
 of pdftotext. While this works well on many PDFs, it might be needed to switch to "UTF-8" or
 others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
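Across all converter docs above, the change is uniform: `remove_numeric_tables` and `valid_languages` become per-call arguments on `convert()` that default to `None` and fall back to the values set in `__init__()`. A sketch of the resulting usage (file names are assumptions):

```python
from pathlib import Path

from haystack.file_converter.pdf import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

# Uses the constructor defaults:
doc = converter.convert(file_path=Path("report.pdf"), meta=None)

# Overrides them for this call only:
doc = converter.convert(
    file_path=Path("tables.pdf"),
    meta=None,
    remove_numeric_tables=False,
    valid_languages=["de"],
)
```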
@@ -5,14 +5,14 @@
 ## BasePreProcessor Objects

 ```python
-class BasePreProcessor()
+class BasePreProcessor(BaseComponent)
 ```

 <a name="base.BasePreProcessor.process"></a>
 #### process

 ```python
- | process(document: dict) -> List[dict]
+ | process(document: dict, clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
 ```

 Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.

@@ -31,7 +31,7 @@ class PreProcessor(BasePreProcessor)
 #### \_\_init\_\_

 ```python
- | __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True)
+ | __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True)
 ```

 **Arguments**:

@@ -50,16 +50,25 @@ Setting this to a positive number essentially enables the sliding window approac
 For example, if split_by -> `word`,
 split_length -> 5 & split_overlap -> 2, then the splits would be like:
 [w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12].
-Set the value to None to ensure there is no overlap among the documents after splitting.
+Set the value to 0 to ensure there is no overlap among the documents after splitting.
 - `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set
 to True, the individual split will always have complete sentences &
 the number of words will be <= split_length.

+<a name="preprocessor.PreProcessor.process"></a>
+#### process
+
+```python
+ | process(document: dict, clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
+```
+
+Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
+
 <a name="preprocessor.PreProcessor.clean"></a>
 #### clean

 ```python
- | clean(document: dict) -> dict
+ | clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict
 ```

 Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers

@@ -69,7 +78,7 @@ and empty lines. Its exact functionality is defined by the parameters passed int
 #### split

 ```python
- | split(document: dict) -> List[dict]
+ | split(document: dict, split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool) -> List[dict]
 ```

 Perform document splitting on a single document. This method can split on different units, at different lengths,
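The same override pattern applies to `PreProcessor`: `process()` now accepts the full set of cleaning and splitting knobs, with `None` meaning "use the constructor value", and `split_overlap=0` (rather than `None`) now expresses "no overlap". For example (the document content is an assumption):

```python
from haystack.preprocessor.preprocessor import PreProcessor

processor = PreProcessor(split_by="word", split_length=1000, split_overlap=0)
doc = {"text": "Some long document text ..."}

docs = processor.process(doc)                                        # constructor defaults
small = processor.process(doc, split_length=100, split_overlap=20)   # per-call override
```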
@@ -206,5 +206,6 @@ class BaseDocumentStore(BaseComponent):
     def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
         pass

-    def run(self, **kwargs):
-        raise NotImplementedError
+    def run(self, documents: List[dict], index: Optional[str] = None, **kwargs):
+        self.write_documents(documents=documents, index=index)
+        return kwargs, "output_1"
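`BaseDocumentStore.run()` is no longer a stub: as the terminal node of an indexing pipeline it persists the incoming documents and passes the remaining kwargs along. Roughly what the pipeline does when it reaches the store (store construction assumed):

```python
documents = [{"text": "some text", "meta": {"name": "doc1"}}]

output, edge = document_store.run(documents=documents, index="document")
# write_documents() has been called; `output` is the leftover kwargs and
# `edge` is "output_1", the edge the pipeline follows to any next node.
```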
@@ -4,13 +4,17 @@ from typing import List, Optional, Dict, Any

 import langdetect

+from haystack import BaseComponent
+

-class BaseConverter:
+class BaseConverter(BaseComponent):
     """
     Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore.
     """

-    def __init__(self, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None):
+    outgoing_edges = 1
+
+    def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):

@@ -27,7 +31,13 @@ class BaseConverter:
         self.valid_languages = valid_languages

     @abstractmethod
-    def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]],
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+    ) -> Dict[str, Any]:
         """
         Convert a file to a dictionary containing the text and any associated meta data.

@@ -36,6 +46,16 @@ class BaseConverter:
         :param file_path: path of the file to convert
         :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could possible candidate for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it might likely be encoding error resulting
+                                in garbled text.
         """
         pass

@@ -56,4 +76,20 @@ class BaseConverter:
         else:
             return False

+    def run(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        **kwargs
+    ):
+        document = self.convert(
+            file_path=file_path,
+            meta=meta,
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+        )
+
+        result = {"document": document, **kwargs}
+        return result, "output_1"
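Since `run()` now lives once on `BaseConverter`, a subclass only has to implement `convert()`; the inherited `run()` handles the node plumbing. A hypothetical subclass to illustrate the contract (this class is not part of the diff):

```python
from pathlib import Path
from typing import Any, Dict, List, Optional

class PlainTextConverter(BaseConverter):  # hypothetical example class
    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]],
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        # Minimal body: read the file as-is and attach the supplied meta data.
        return {"text": Path(file_path).read_text(), "meta": meta}

# In a pipeline, the inherited run() calls convert() and returns
# ({"document": {...}, **kwargs}, "output_1") to the next node.
```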
@@ -1,6 +1,6 @@
 import logging
 from pathlib import Path
-from typing import Dict, Optional, Any
+from typing import Dict, Optional, Any, List

 import docx

@@ -10,14 +10,39 @@ logger = logging.getLogger(__name__)


 class DocxToTextConverter(BaseConverter):
-    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+    ) -> Dict[str, Any]:
         """
         Extract text from a .docx file.
         Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
         For compliance with other converters we nevertheless opted for keeping the methods name.

         :param file_path: Path to the .docx file you want to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could possible candidate for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it might likely be encoding error resulting
+                                in garbled text.
         """
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages
+        if remove_numeric_tables is True:
+            raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
+        if valid_languages is True:
+            raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")

         file = docx.Document(file_path)  # Creating word reader object.
         paragraphs = [para.text for para in file.paragraphs]
@@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)


 class PDFToTextConverter(BaseConverter):
-    def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
+    def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                       The tabular structures in documents might be noise for the reader model if it

@@ -40,13 +40,30 @@ class PDFToTextConverter(BaseConverter):

         super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

-    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any]:
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: str = "Latin1",
+    ) -> Dict[str, Any]:
         """
         Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)

         :param file_path: Path to the .pdf file you want to convert
         :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                      Can be any custom keys and values.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could possible candidate for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it might likely be encoding error resulting
+                                in garbled text.
         :param encoding: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
                          of pdftotext. While this works well on many PDFs, it might be needed to switch to "UTF-8" or
                          others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).

@@ -56,6 +73,10 @@ class PDFToTextConverter(BaseConverter):
         """

         pages = self._read_pdf(file_path, layout=False, encoding=encoding)
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages

         cleaned_pages = []
         for page in pages:

@@ -76,7 +97,7 @@ class PDFToTextConverter(BaseConverter):
                 digits = [word for word in words if any(i.isdigit() for i in word)]

                 # remove lines having > 40% of words as digits AND not ending with a period(.)
-                if self.remove_numeric_tables:
+                if remove_numeric_tables:
                     if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                         logger.debug(f"Removing line '{line}' from {file_path}")
                         continue

@@ -85,7 +106,7 @@ class PDFToTextConverter(BaseConverter):
             page = "\n".join(cleaned_lines)
             cleaned_pages.append(page)

-        if self.valid_languages:
+        if valid_languages:
             document_text = "".join(cleaned_pages)
             if not self.validate_language(document_text):
                 logger.warning(
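The numeric-table heuristic that `convert()` now applies per call is unchanged in substance: a line is dropped when more than 40% of its words contain a digit and the line does not end with a period. A standalone illustration of that test (the sample line is invented):

```python
line = "2019 318 12,642 4.5%"
words = line.split(" ")
digits = [word for word in words if any(ch.isdigit() for ch in word)]

# 4 of 4 words contain digits and there is no trailing period, so with
# remove_numeric_tables=True this line would be removed as a table row.
is_numeric_row = bool(words) and len(digits) / len(words) > 0.4 and not line.strip().endswith(".")
print(is_numeric_row)  # True
```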
@@ -42,7 +42,7 @@ class TikaConverter(BaseConverter):
     def __init__(
         self,
         tika_url: str = "http://localhost:9998/tika",
-        remove_numeric_tables: Optional[bool] = False,
+        remove_numeric_tables: bool = False,
         valid_languages: Optional[List[str]] = None
     ):
         """

@@ -65,12 +65,34 @@ class TikaConverter(BaseConverter):
         self.tika_url = tika_url
         super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

-    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+    ) -> Dict[str, Any]:
         """
-        :param file_path: Path of file to be converted.
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could possible candidate for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it might likely be encoding error resulting
+                                in garbled text.

         :return: a list of pages and the extracted meta data of the file.
         """
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages

         parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
         parser = TikaXHTMLParser()
         parser.feed(parsed["content"])

@@ -85,7 +107,7 @@ class TikaConverter(BaseConverter):
                 digits = [word for word in words if any(i.isdigit() for i in word)]

                 # remove lines having > 40% of words as digits AND not ending with a period(.)
-                if self.remove_numeric_tables:
+                if remove_numeric_tables:
                     if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                         logger.debug(f"Removing line '{line}' from {file_path}")
                         continue

@@ -95,11 +117,11 @@ class TikaConverter(BaseConverter):
             page = "\n".join(cleaned_lines)
             cleaned_pages.append(page)

-        if self.valid_languages:
+        if valid_languages:
             document_text = "".join(cleaned_pages)
             if not self.validate_language(document_text):
                 logger.warning(
-                    f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
+                    f"The language for {file_path} is not one of {valid_languages}. The file may not have "
                     f"been decoded in the correct text format."
                 )
@@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)


 class TextConverter(BaseConverter):
-    def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
+    def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                       The tabular structures in documents might be noise for the reader model if it

@@ -22,23 +22,40 @@ class TextConverter(BaseConverter):
                                 in garbled text.
         """

-        super().__init__(remove_numeric_tables=remove_numeric_tables,
-                         valid_languages=valid_languages)
+        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

-    def convert(self,
-                file_path: Path,
-                meta: Optional[Dict[str, str]] = None,
-                encoding: str = "utf-8") -> Dict[str, Any]:
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: str = "utf-8",
+    ) -> Dict[str, Any]:
         """
         Reads text from a txt file and executes optional preprocessing steps.

-        :param file_path: Path of the file to convert
-        :param meta: Optional meta data that should be associated with the the document (e.g. name)
-        :param encoding: Encoding of the file
+        :param file_path: path of the file to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+                                      The tabular structures in documents might be noise for the reader model if it
+                                      does not have table parsing capability for finding answers. However, tables
+                                      may also have long strings that could possible candidate for searching answers.
+                                      The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
+                                This option can be used to add test for encoding errors. If the extracted text is
+                                not one of the valid languages, then it might likely be encoding error resulting
+                                in garbled text.

         :return: Dict of format {"text": "The text from file", "meta": meta}}

         """
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages

         with open(file_path, encoding=encoding, errors="ignore") as f:
             text = f.read()
             pages = text.split("\f")

@@ -52,7 +69,7 @@ class TextConverter(BaseConverter):
                 digits = [word for word in words if any(i.isdigit() for i in word)]

                 # remove lines having > 40% of words as digits AND not ending with a period(.)
-                if self.remove_numeric_tables:
+                if remove_numeric_tables:
                     if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                         logger.debug(f"Removing line '{line}' from {file_path}")
                         continue

@@ -62,7 +79,7 @@ class TextConverter(BaseConverter):
             page = "\n".join(cleaned_lines)
             cleaned_pages.append(page)

-        if self.valid_languages:
+        if valid_languages:
             document_text = "".join(cleaned_pages)
             if not self.validate_language(document_text):
                 logger.warning(
@@ -26,10 +26,18 @@ class Pipeline(ABC):
     Reader from multiple Retrievers, or re-ranking of candidate documents.
     """

-    def __init__(self):
+    def __init__(self, pipeline_type: str = "Query"):
         self.graph = DiGraph()
-        self.root_node_id = "Query"
-        self.graph.add_node("Query", component=QueryNode())
+        if pipeline_type == "Query":
+            self.root_node_id = "Query"
+            self.graph.add_node("Query", component=RootNode())
+        elif pipeline_type == "Indexing":
+            self.root_node_id = "File"
+            self.graph.add_node("File", component=RootNode())
+        else:
+            raise Exception(f"pipeline_type '{pipeline_type}' is not valid. Supported types are 'Query' & 'Indexing'.")
+
+        self.pipeline_type = pipeline_type
         self.components: dict = {}

     def add_node(self, component, name: str, inputs: List[str]):

@@ -49,6 +57,10 @@ class Pipeline(ABC):
         """
         self.graph.add_node(name, component=component, inputs=inputs)

+        if len(self.graph.nodes) == 2:  # first node added; connect with Root
+            self.graph.add_edge(self.root_node_id, name, label="output_1")
+            return
+
         for i in inputs:
             if "." in i:
                 [input_node_name, input_edge_name] = i.split(".")

@@ -89,7 +101,7 @@ class Pipeline(ABC):
     def run(self, **kwargs):
         has_next_node = True
         current_node_id = self.root_node_id
-        input_dict = kwargs
+        input_dict = {"pipeline_type": self.pipeline_type, **kwargs}
         output_dict = None

         while has_next_node:

@@ -207,14 +219,13 @@ class Pipeline(ABC):
             name = definition.pop("name")
             definitions[name] = definition

-        pipeline = cls()
+        pipeline = cls(pipeline_type=pipeline_config["type"])

         components: dict = {}  # instances of component objects.
         for node_config in pipeline_config["nodes"]:
             name = node_config["name"]
             component = cls._load_or_get_component(name=name, definitions=definitions, components=components)
-            if "DocumentStore" not in definitions[name]["type"]:  # DocumentStore is not an explicit node in a Pipeline
-                pipeline.add_node(component=component, name=node_config["name"], inputs=node_config["inputs"])
+            pipeline.add_node(component=component, name=node_config["name"], inputs=node_config.get("inputs", []))

         return pipeline

@@ -499,7 +510,7 @@ class TranslationWrapperPipeline(BaseStandardPipeline):
         return output


-class QueryNode:
+class RootNode:
     outgoing_edges = 1

     def run(self, **kwargs):
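These `Pipeline` changes are the heart of the PR: the root node is selected by `pipeline_type`, the first node added is wired to the root automatically, document stores may now appear as regular nodes, and `run()` injects `pipeline_type` into the kwargs so that components (notably the retrievers, below) can branch on it. A small sketch (the `converter` instance and file name are assumptions):

```python
from pathlib import Path

query_pipeline = Pipeline()                              # root node id: "Query"
indexing_pipeline = Pipeline(pipeline_type="Indexing")   # root node id: "File"

# The first node auto-connects to the root; naming it in `inputs` is enough:
indexing_pipeline.add_node(component=converter, name="Converter", inputs=["File"])

# run() prepends {"pipeline_type": "Indexing"} before dispatching to the nodes.
indexing_pipeline.run(file_path=Path("my_doc.txt"))
```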
@@ -1,17 +1,44 @@
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional

+from haystack import BaseComponent
+

-class BasePreProcessor:
-    def process(self, document: dict) -> List[dict]:
+class BasePreProcessor(BaseComponent):
+    outgoing_edges = 1
+
+    def process(
+        self,
+        document: dict,
+        clean_whitespace: Optional[bool] = True,
+        clean_header_footer: Optional[bool] = False,
+        clean_empty_lines: Optional[bool] = True,
+        split_by: Optional[str] = "word",
+        split_length: Optional[int] = 1000,
+        split_overlap: Optional[int] = None,
+        split_respect_sentence_boundary: Optional[bool] = True,
+    ) -> List[dict]:
         """
         Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
         """
         cleaned_document = self.clean(document)
         split_documents = self.split(cleaned_document)
         return split_documents

-    def clean(self, document: Dict[str, Any]) -> Dict[str, Any]:
+    def clean(
+        self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool,
+    ) -> Dict[str, Any]:
         raise NotImplementedError

-    def split(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
+    def split(
+        self,
+        document: dict,
+        split_by: str,
+        split_length: int,
+        split_overlap: int,
+        split_respect_sentence_boundary: bool,
+    ) -> List[Dict[str, Any]]:
         raise NotImplementedError

+    def run(self, document: dict, **kwargs):
+        documents = self.process(document)
+
+        result = {"documents": documents, **kwargs}
+        return result, "output_1"
@ -16,13 +16,13 @@ logger = logging.getLogger(__name__)
 | 
			
		||||
class PreProcessor(BasePreProcessor):
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        clean_whitespace: Optional[bool] = True,
 | 
			
		||||
        clean_header_footer: Optional[bool] = False,
 | 
			
		||||
        clean_empty_lines: Optional[bool] = True,
 | 
			
		||||
        split_by: Optional[str] = "word",
 | 
			
		||||
        split_length: Optional[int] = 1000,
 | 
			
		||||
        split_overlap: Optional[int] = None,
 | 
			
		||||
        split_respect_sentence_boundary: Optional[bool] = True,
 | 
			
		||||
        clean_whitespace: bool = True,
 | 
			
		||||
        clean_header_footer: bool = False,
 | 
			
		||||
        clean_empty_lines: bool = True,
 | 
			
		||||
        split_by: str = "word",
 | 
			
		||||
        split_length: int = 1000,
 | 
			
		||||
        split_overlap: int = 0,
 | 
			
		||||
        split_respect_sentence_boundary: bool = True,
 | 
			
		||||
    ):
 | 
			
		||||
        """
 | 
			
		||||
        :param clean_header_footer: Use heuristic to remove footers and headers across different pages by searching
 | 
			
		||||
@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor):
 | 
			
		||||
                              For example, if split_by -> `word`,
 | 
			
		||||
                              split_length -> 5 & split_overlap -> 2, then the splits would be like:
 | 
			
		||||
                              [w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12].
 | 
			
		||||
                              Set the value to None to ensure there is no overlap among the documents after splitting.
 | 
			
		||||
                              Set the value to 0 to ensure there is no overlap among the documents after splitting.
 | 
			
		||||
        :param split_respect_sentence_boundary: Whether to split in partial sentences if split_by -> `word`. If set
 | 
			
		||||
                                                to True, the individual split will always have complete sentences &
 | 
			
		||||
                                                the number of words will be <= split_length.
 | 
			
		||||
@ -53,18 +53,68 @@ class PreProcessor(BasePreProcessor):
 | 
			
		||||
        self.split_overlap = split_overlap
 | 
			
		||||
        self.split_respect_sentence_boundary = split_respect_sentence_boundary
 | 
			
		||||
 | 
			
		||||
    def clean(self, document: dict) -> dict:
 | 
			
		||||
    def process(
 | 
			
		||||
        self,
 | 
			
		||||
        document: dict,
 | 
			
		||||
        clean_whitespace: Optional[bool] = None,
 | 
			
		||||
        clean_header_footer: Optional[bool] = None,
 | 
			
		||||
        clean_empty_lines: Optional[bool] = None,
 | 
			
		||||
        split_by: Optional[str] = None,
 | 
			
		||||
        split_length: Optional[int] = None,
 | 
			
		||||
        split_overlap: Optional[int] = None,
 | 
			
		||||
        split_respect_sentence_boundary: Optional[bool] = None,
 | 
			
		||||
    ) -> List[dict]:
 | 
			
		||||
        """
 | 
			
		||||
        Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
 | 
			
		||||
        """
 | 
			
		||||
        if clean_whitespace is None:
 | 
			
		||||
            clean_whitespace = self.clean_whitespace
 | 
			
		||||
        if clean_header_footer is None:
 | 
			
		||||
            clean_header_footer = self.clean_header_footer
 | 
			
		||||
        if clean_empty_lines is None:
 | 
			
		||||
            clean_empty_lines = self.clean_empty_lines
 | 
			
		||||
        if split_by is None:
 | 
			
		||||
            split_by = self.split_by
 | 
			
		||||
        if split_length is None:
 | 
			
		||||
            split_length = self.split_length
 | 
			
		||||
        if split_overlap is None:
 | 
			
		||||
            split_overlap = self.split_overlap
 | 
			
		||||
        if split_respect_sentence_boundary is None:
 | 
			
		||||
            split_respect_sentence_boundary = self.split_respect_sentence_boundary
 | 
			
		||||
 | 
			
		||||
        cleaned_document = self.clean(
 | 
			
		||||
            document=document,
 | 
			
		||||
            clean_whitespace=clean_whitespace,
 | 
			
		||||
            clean_header_footer=clean_header_footer,
 | 
			
		||||
            clean_empty_lines=clean_empty_lines,
 | 
			
		||||
        )
 | 
			
		||||
        split_documents = self.split(
 | 
			
		||||
            document=cleaned_document,
 | 
			
		||||
            split_by=split_by,
 | 
			
		||||
            split_length=split_length,
 | 
			
		||||
            split_overlap=split_overlap,
 | 
			
		||||
            split_respect_sentence_boundary=split_respect_sentence_boundary,
 | 
			
		||||
        )
 | 
			
		||||
        return split_documents
 | 
			
		||||
 | 
			
		||||
    def clean(
 | 
			
		||||
        self,
 | 
			
		||||
        document: dict,
 | 
			
		||||
        clean_whitespace: bool,
 | 
			
		||||
        clean_header_footer: bool,
 | 
			
		||||
        clean_empty_lines: bool,
 | 
			
		||||
    ) -> dict:
 | 
			
		||||
        """
 | 
			
		||||
        Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
 | 
			
		||||
        and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
 | 
			
		||||
        """
 | 
			
		||||
        text = document["text"]
 | 
			
		||||
        if self.clean_header_footer:
 | 
			
		||||
        if clean_header_footer:
 | 
			
		||||
            text = self._find_and_remove_header_footer(
 | 
			
		||||
                text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        if self.clean_whitespace:
 | 
			
		||||
        if clean_whitespace:
 | 
			
		||||
            lines = text.splitlines()
 | 
			
		||||
 | 
			
		||||
            cleaned_lines = []
 | 
			
		||||
@ -73,30 +123,37 @@ class PreProcessor(BasePreProcessor):
 | 
			
		||||
                cleaned_lines.append(line)
 | 
			
		||||
            text = "\n".join(cleaned_lines)
 | 
			
		||||
 | 
			
		||||
        if self.clean_empty_lines:
 | 
			
		||||
        if clean_empty_lines:
 | 
			
		||||
            text = re.sub(r"\n\n+", "\n\n", text)
 | 
			
		||||
 | 
			
		||||
        document["text"] = text
 | 
			
		||||
        return document
 | 
			
		||||
 | 
			
		||||
    def split(self, document: dict) -> List[dict]:
 | 
			
		||||
    def split(
 | 
			
		||||
        self,
 | 
			
		||||
        document: dict,
 | 
			
		||||
        split_by: str,
 | 
			
		||||
        split_length: int,
 | 
			
		||||
        split_overlap: int,
 | 
			
		||||
        split_respect_sentence_boundary: bool,
 | 
			
		||||
    ) -> List[dict]:
 | 
			
		||||
        """Perform document splitting on a single document. This method can split on different units, at different lengths,
 | 
			
		||||
        with different strides. It can also respect sentence boundaries. Its exact functionality is defined by
 | 
			
		||||
        the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents. """
 | 
			
		||||
 | 
			
		||||
        if not self.split_by:
 | 
			
		||||
        if not split_by:
 | 
			
		||||
            return [document]
 | 
			
		||||
 | 
			
		||||
        if not self.split_length:
 | 
			
		||||
        if not split_length:
 | 
			
		||||
            raise Exception("split_length needs be set when using split_by.")
 | 
			
		||||
 | 
			
		||||
        if self.split_respect_sentence_boundary and self.split_by not in("word","sentence"):
 | 
			
		||||
        if split_respect_sentence_boundary and split_by not in("word","sentence"):
 | 
			
		||||
            raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with"
 | 
			
		||||
                                      " split_by='word' or split_by='sentence'.")
 | 
			
		||||
 | 
			
		||||
        text = document["text"]
 | 
			
		||||
 | 
			
		||||
        if self.split_respect_sentence_boundary and self.split_by == "word":
 | 
			
		||||
        if split_respect_sentence_boundary and split_by == "word":
 | 
			
		||||
            # split by words ensuring no sub sentence splits
 | 
			
		||||
            sentences = nltk.tokenize.sent_tokenize(text)
 | 
			
		||||
            word_count = 0
 | 
			
		||||
@ -104,17 +161,17 @@ class PreProcessor(BasePreProcessor):
 | 
			
		||||
            current_slice: List[str] = []
 | 
			
		||||
            for sen in sentences:
 | 
			
		||||
                current_word_count = len(sen.split(" "))
 | 
			
		||||
                if current_word_count > self.split_length:
 | 
			
		||||
                if current_word_count > split_length:
 | 
			
		||||
                    logger.warning(f"A sentence found with word count higher than the split length.")
 | 
			
		||||
                if word_count + current_word_count > self.split_length:
 | 
			
		||||
                if word_count + current_word_count > split_length:
 | 
			
		||||
                    list_splits.append(current_slice)
 | 
			
		||||
                    #Enable split_stride with split_by='word' while respecting sentence boundaries.
 | 
			
		||||
                    if self.split_overlap:
 | 
			
		||||
                    # Enable split_stride with split_by='word' while respecting sentence boundaries.
 | 
			
		||||
                    if split_overlap:
 | 
			
		||||
                        overlap = []
 | 
			
		||||
                        w_count = 0
 | 
			
		||||
                        for s in current_slice[::-1]:
 | 
			
		||||
                            sen_len = len(s.split(" "))
 | 
			
		||||
                            if w_count < self.split_overlap:
 | 
			
		||||
                            if w_count < split_overlap:
 | 
			
		||||
                                overlap.append(s)
 | 
			
		||||
                                w_count += sen_len
 | 
			
		||||
                            else:
 | 
			
		||||
@ -136,20 +193,20 @@ class PreProcessor(BasePreProcessor):
                    text_splits.append(txt)
        else:
            # create individual "elements" of passage, sentence, or word
            if self.split_by == "passage":
            if split_by == "passage":
                elements = text.split("\n\n")
            elif self.split_by == "sentence":
            elif split_by == "sentence":
                elements = nltk.tokenize.sent_tokenize(text)
            elif self.split_by == "word":
            elif split_by == "word":
                elements = text.split(" ")
            else:
                raise NotImplementedError("PreProcessor only supports 'passage', 'sentence' or 'word' split_by options.")

            # concatenate individual elements based on split_length & split_stride
            if self.split_overlap:
                segments = windowed(elements, n=self.split_length, step=self.split_length - self.split_overlap)
            if split_overlap:
                segments = windowed(elements, n=split_length, step=split_length - split_overlap)
            else:
                segments = windowed(elements, n=self.split_length, step=self.split_length)
                segments = windowed(elements, n=split_length, step=split_length)
            text_splits = []
            for seg in segments:
                txt = " ".join([t for t in seg if t])
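`windowed` here comes from `more_itertools`; it pads the final window with `None`, which is why the join above filters falsy entries. A small standalone sketch of the overlap logic (the example values are illustrative only, not taken from the diff):

```python
# Standalone sketch of the windowed()-based splitting shown above.
from more_itertools import windowed

elements = "one two three four five six".split(" ")
split_length, split_overlap = 3, 1

# step = split_length - split_overlap makes consecutive windows share elements
segments = windowed(elements, n=split_length, step=split_length - split_overlap)

# The final window is padded with None, hence the falsy filter when joining
text_splits = [" ".join(t for t in seg if t) for seg in segments]
print(text_splits)  # ['one two three', 'three four five', 'five six']
```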
@ -4,7 +4,7 @@ import logging
 | 
			
		||||
from time import perf_counter
 | 
			
		||||
from functools import wraps
 | 
			
		||||
from tqdm import tqdm
 | 
			
		||||
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
from haystack import Document, BaseComponent
 | 
			
		||||
from haystack.document_store.base import BaseDocumentStore
 | 
			
		||||
 | 
			
		||||
@ -168,12 +168,21 @@ class BaseRetriever(BaseComponent):
        else:
            return metrics

    def run(
            self,
            query: str,
            filters: Optional[dict] = None,
            top_k_retriever: Optional[int] = None,
            **kwargs,
    def run(self, pipeline_type: str, **kwargs):
        if pipeline_type == "Query":
            output, stream = self.run_query(**kwargs)
        elif pipeline_type == "Indexing":
            output, stream = self.run_indexing(**kwargs)
        else:
            raise Exception(f"Invalid pipeline_type '{pipeline_type}'.")
        return output, stream

    def run_query(
        self,
        query: str,
        filters: Optional[dict] = None,
        top_k_retriever: Optional[int] = None,
        **kwargs,
    ):
        if top_k_retriever:
            documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever)
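The new `run()` is a thin dispatcher: the pipeline passes its `pipeline_type` through, and the retriever routes the call to `run_query()` or `run_indexing()`. A minimal standalone re-creation of that routing (not haystack code; the stub bodies stand in for the real methods shown in this diff):

```python
# Minimal standalone re-creation of the dispatch above; run() routes on
# pipeline_type exactly as in the diff, the stub bodies are placeholders.
class DispatchSketch:
    def run(self, pipeline_type: str, **kwargs):
        if pipeline_type == "Query":
            return self.run_query(**kwargs)
        elif pipeline_type == "Indexing":
            return self.run_indexing(**kwargs)
        raise Exception(f"Invalid pipeline_type '{pipeline_type}'.")

    def run_query(self, query: str, **kwargs):
        # stands in for the real run_query() shown in this diff
        return {"query": query, **kwargs}, "output_1"

    def run_indexing(self, documents, **kwargs):
        # stands in for run_indexing(), added in the next hunk
        return {"documents": documents, **kwargs}, "output_1"

output, stream = DispatchSketch().run(pipeline_type="Query", query="Who lives in Berlin?")
assert stream == "output_1"
```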
@ -188,3 +197,14 @@ class BaseRetriever(BaseComponent):
        }

        return output, "output_1"

    def run_indexing(self, documents: List[dict], **kwargs):
        if self.__class__.__name__ in ["DensePassageRetriever", "EmbeddingRetriever"]:
            documents = deepcopy(documents)
            document_objects = [Document.from_dict(doc) for doc in documents]
            embeddings = self.embed_passages(document_objects)  # type: ignore
            for doc, emb in zip(documents, embeddings):
                doc["embedding"] = emb

        output = {**kwargs, "documents": documents}
        return output, "output_1"
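`run_indexing()` only adds embeddings for dense retrievers (`DensePassageRetriever`, `EmbeddingRetriever`); sparse retrievers pass the documents through unchanged. A standalone sketch of that enrichment step, with a placeholder embedder standing in for `embed_passages()`, which this diff calls but does not define:

```python
# Standalone sketch of the enrichment done in run_indexing() above.
# fake_embed_passages is a placeholder for the real embed_passages();
# the vectors it returns are illustrative, not real embeddings.
from copy import deepcopy
from typing import List

def fake_embed_passages(texts: List[str]) -> List[List[float]]:
    return [[float(len(t)), 0.0] for t in texts]

documents = [
    {"text": "Berlin is the capital of Germany."},
    {"text": "The PDF specification was made by Adobe Systems."},
]

docs = deepcopy(documents)  # as above: avoid mutating the caller's dicts
embeddings = fake_embed_passages([d["text"] for d in docs])
for doc, emb in zip(docs, embeddings):
    doc["embedding"] = emb  # the document store can index the vector with the text

output = {"documents": docs}  # forwarded to the next node on "output_1"
```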
@ -14,12 +14,34 @@ components:
  - name: TestDocumentStore
    type: ElasticsearchDocumentStore
    params:
      index: haystack_test
      index: haystack_test_pipeline
  - name: TestPDFConverter
    type: PDFToTextConverter
    params:
      remove_numeric_tables: false
  - name: TestPreprocessor
    type: PreProcessor
    params:
      clean_whitespace: true


pipelines:
  - name: test_query_pipeline
    type: Query
    nodes:
      - name: TestESRetriever
        inputs: [Query]
      - name: TestReader
        inputs: [TestESRetriever]

  - name: test_indexing_pipeline
    type: Indexing
    nodes:
      - name: TestPDFConverter
        inputs: [File]
      - name: TestPreprocessor
        inputs: [TestPDFConverter]
      - name: TestESRetriever
        inputs: [TestPreprocessor]
      - name: TestDocumentStore
        inputs: [TestESRetriever]
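For comparison, the same indexing graph can be assembled programmatically. A sketch assuming the `Pipeline.add_node(component=..., name=..., inputs=[...])` API and the import paths of this era of the codebase (only `haystack.retriever.sparse` is confirmed by this diff; the others are assumptions):

```python
# Sketch: the YAML indexing pipeline above, built in code. Import paths and
# the add_node() signature are assumptions, not confirmed by this diff.
from pathlib import Path

from haystack.pipeline import Pipeline  # import path assumed
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore  # path assumed
from haystack.file_converter.pdf import PDFToTextConverter  # path assumed
from haystack.preprocessor.preprocessor import PreProcessor  # path assumed
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = ElasticsearchDocumentStore(index="haystack_test_pipeline")
retriever = ElasticsearchRetriever(document_store=document_store)

pipeline = Pipeline()
pipeline.add_node(component=PDFToTextConverter(remove_numeric_tables=False),
                  name="TestPDFConverter", inputs=["File"])
pipeline.add_node(component=PreProcessor(clean_whitespace=True),
                  name="TestPreprocessor", inputs=["TestPDFConverter"])
pipeline.add_node(component=retriever, name="TestESRetriever", inputs=["TestPreprocessor"])
pipeline.add_node(component=document_store, name="TestDocumentStore", inputs=["TestESRetriever"])

# As in the test below, an indexing pipeline is fed files rather than queries:
pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
```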
@ -11,12 +11,16 @@ from haystack.retriever.sparse import ElasticsearchRetriever
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_load_yaml(document_store_with_docs):
    # test correct load of indexing pipeline from yaml
    pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"),
                                       pipeline_name="test_indexing_pipeline")
    pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"), top_k_retriever=10, top_k_reader=3)

    # # test correct load from yaml
    pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml", pipeline_name="my_query"))
    prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3)
    assert prediction["query"] == "Who lives in Berlin?"
    assert prediction["answers"][0]["answer"] == "Carla"
    # test correct load of query pipeline from yaml
    pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"), pipeline_name="test_query_pipeline")
    prediction = pipeline.run(query="Who made the PDF specification?", top_k_retriever=10, top_k_reader=3)
    assert prediction["query"] == "Who made the PDF specification?"
    assert prediction["answers"][0]["answer"] == "Adobe Systems"

    # test invalid pipeline name
    with pytest.raises(Exception):
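The hunk is cut off mid-statement, so the body of the `pytest.raises` block is not shown. A hypothetical completion, only to illustrate the pattern being tested (the real test body may differ; `pytest`, `Pipeline`, and `Path` are already imported in this test module):

```python
# Hypothetical illustration only; the actual body of the truncated
# `with pytest.raises(Exception):` block is not part of this diff.
with pytest.raises(Exception):
    Pipeline.load_from_yaml(
        Path("samples/pipeline/test_pipeline.yaml"), pipeline_name="invalid_pipeline_name"
    )
```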
@ -93,7 +93,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture):
                            "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                            "filter": [{"terms": {"year": ${years}}}]}}}""",
    )
    results = retriever.run(query="test", filters={"years": ["2020", "2021"]})[0]["documents"]
    results = retriever.retrieve(query="test", filters={"years": ["2020", "2021"]})
    assert len(results) == 4

    # test custom "term" query
@ -108,7 +108,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture):
                                "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
                                "filter": [{"term": {"year": ${years}}}]}}}""",
    )
    results = retriever.run(query="test", filters={"years": "2021"})[0]["documents"]
    results = retriever.retrieve(query="test", filters={"years": "2021"})
    assert len(results) == 3
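These two hunks reflect the split introduced above: a retriever used directly is queried via `retrieve()`, while `run()` is now reserved for pipeline dispatch. A sketch of the direct call with a `custom_query` template, whose `${query}`/`${years}` placeholders are filled from the query string and the `filters` dict as in the test (index name is illustrative; the document-store import path is assumed):

```python
# Sketch of direct retriever usage with a custom_query template, mirroring
# the test above. The index name is illustrative only.
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore  # path assumed
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = ElasticsearchDocumentStore(index="haystack_test_custom")
retriever = ElasticsearchRetriever(
    document_store=document_store,
    custom_query="""
        {"size": 10,
         "query": {"bool": {
             "should": [{"multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
             "filter": [{"terms": {"year": ${years}}}]}}}""",
)

# Direct usage goes through retrieve(); run() is for pipeline dispatch only.
results = retriever.retrieve(query="test", filters={"years": ["2020", "2021"]})
```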