Add support for indexing pipelines (#816)

Tanay Soni 2021-02-16 16:24:28 +01:00 committed by GitHub
parent 7030c94325
commit 07907f9eac
15 changed files with 433 additions and 110 deletions

View File

@ -5,7 +5,7 @@
## BaseConverter Objects
```python
class BaseConverter()
class BaseConverter(BaseComponent)
```
Base class for implementing file converters that transform input documents into text format for ingestion into a DocumentStore.
@ -14,7 +14,7 @@ Base class for implementing file converts to transform input documents to text f
#### \_\_init\_\_
```python
| __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None)
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -35,7 +35,7 @@ in garbled text.
```python
| @abstractmethod
| convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
```
Convert a file to a dictionary containing the text and any associated meta data.
@ -47,6 +47,16 @@ supplied meta data like author, url, external IDs can be supplied as a dictionar
- `file_path`: path of the file to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
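When these arguments are left as `None`, the converter falls back to the values given at construction time. A minimal sketch of that behaviour, using `TextConverter` as a concrete subclass (the import path and file path are assumptions for illustration):

```python
from pathlib import Path

from haystack.file_converter.txt import TextConverter

# Instance-level defaults apply to every call ...
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
doc = converter.convert(file_path=Path("data/report.txt"))

# ... unless they are overridden for a single call.
doc = converter.convert(file_path=Path("data/report.txt"), remove_numeric_tables=False)
```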
<a name="base.BaseConverter.validate_language"></a>
#### validate\_language
@ -71,7 +81,7 @@ class TextConverter(BaseConverter)
#### \_\_init\_\_
```python
| __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -91,16 +101,25 @@ in garbled text.
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "utf-8") -> Dict[str, Any]
```
Reads text from a txt file and executes optional preprocessing steps.
**Arguments**:
- `file_path`: Path of the file to convert
- `meta`: Optional meta data that should be associated with the document (e.g. name)
- `encoding`: Encoding of the file
- `file_path`: path of the file to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
**Returns**:
@ -120,7 +139,7 @@ class DocxToTextConverter(BaseConverter)
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
```
Extract text from a .docx file.
@ -130,6 +149,17 @@ For compliance with other converters we nevertheless opted for keeping the metho
**Arguments**:
- `file_path`: Path to the .docx file you want to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
<a name="tika"></a>
# Module tika
@ -145,7 +175,7 @@ class TikaConverter(BaseConverter)
#### \_\_init\_\_
```python
| __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
| __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -166,12 +196,23 @@ in garbled text.
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any]
```
**Arguments**:
- `file_path`: Path of file to be converted.
- `file_path`: path of the file to convert
- `meta`: dictionary of metadata key-value pairs to attach to the returned document.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
**Returns**:
@ -191,7 +232,7 @@ class PDFToTextConverter(BaseConverter)
#### \_\_init\_\_
```python
| __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None)
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)
```
**Arguments**:
@ -211,7 +252,7 @@ in garbled text.
#### convert
```python
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "Latin1") -> Dict[str, Any]
```
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@ -221,6 +262,16 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade
- `file_path`: Path to the .pdf file you want to convert
- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
- `valid_languages`: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
- `encoding`: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
of pdftotext. While this works well on many PDFs, you might need to switch to "UTF-8" or
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
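A hedged example of switching the encoding for a single conversion (the import path and file path are assumptions for illustration):

```python
from pathlib import Path

from haystack.file_converter.pdf import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=False, valid_languages=["de"])
# Pass -enc UTF-8 to pdftotext for documents with umlauts, Cyrillic characters etc.
doc = converter.convert(file_path=Path("data/bericht.pdf"), encoding="UTF-8")
```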

View File

@ -5,14 +5,14 @@
## BasePreProcessor Objects
```python
class BasePreProcessor()
class BasePreProcessor(BaseComponent)
```
<a name="base.BasePreProcessor.process"></a>
#### process
```python
| process(document: dict) -> List[dict]
| process(document: dict, clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
```
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
@ -31,7 +31,7 @@ class PreProcessor(BasePreProcessor)
#### \_\_init\_\_
```python
| __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True)
| __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True)
```
**Arguments**:
@ -50,16 +50,25 @@ Setting this to a positive number essentially enables the sliding window approac
For example, if split_by -> `word`,
split_length -> 5 & split_overlap -> 2, then the splits would be like:
[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
Set the value to None to ensure there is no overlap among the documents after splitting.
Set the value to 0 to ensure there is no overlap among the documents after splitting.
- `split_respect_sentence_boundary`: Whether to avoid splitting the text mid-sentence when split_by -> `word`. If set
to True, each split will always contain complete sentences &
the number of words will be <= split_length.
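A short sketch of the sliding-window splitting described above (a minimal example, assuming PreProcessor is importable from `haystack.preprocessor.preprocessor`; the document content is illustrative):

```python
from haystack.preprocessor.preprocessor import PreProcessor

preprocessor = PreProcessor(
    split_by="word",
    split_length=5,
    split_overlap=2,
    split_respect_sentence_boundary=False,  # plain word windows, as in the example above
)
docs = preprocessor.process({"text": "w1 w2 w3 w4 w5 w6 w7 w8 w9 w10 w11", "meta": {}})
print([d["text"] for d in docs])
# -> ['w1 w2 w3 w4 w5', 'w4 w5 w6 w7 w8', 'w7 w8 w9 w10 w11']
```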
<a name="preprocessor.PreProcessor.process"></a>
#### process
```python
| process(document: dict, clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
```
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
<a name="preprocessor.PreProcessor.clean"></a>
#### clean
```python
| clean(document: dict) -> dict
| clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict
```
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
@ -69,7 +78,7 @@ and empty lines. Its exact functionality is defined by the parameters passed int
#### split
```python
| split(document: dict) -> List[dict]
| split(document: dict, split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool) -> List[dict]
```
Perform document splitting on a single document. This method can split on different units, at different lengths,

View File

@ -206,5 +206,6 @@ class BaseDocumentStore(BaseComponent):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
pass
def run(self, **kwargs):
raise NotImplementedError
def run(self, documents: List[dict], index: Optional[str] = None, **kwargs):
self.write_documents(documents=documents, index=index)
return kwargs, "output_1"

View File

@ -4,13 +4,17 @@ from typing import List, Optional, Dict, Any
import langdetect
from haystack import BaseComponent
class BaseConverter:
class BaseConverter(BaseComponent):
"""
Base class for implementing file converters that transform input documents into text format for ingestion into a DocumentStore.
"""
def __init__(self, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None):
outgoing_edges = 1
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -27,7 +31,13 @@ class BaseConverter:
self.valid_languages = valid_languages
@abstractmethod
def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]],
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Convert a file to a dictionary containing the text and any associated meta data.
@ -36,6 +46,16 @@ class BaseConverter:
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
"""
pass
@ -56,4 +76,20 @@ class BaseConverter:
else:
return False
def run(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
**kwargs
):
document = self.convert(
file_path=file_path,
meta=meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
)
result = {"document": document, **kwargs}
return result, "output_1"
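The new run() wrapper is what turns a converter into a pipeline node: it calls convert() and forwards the resulting document, together with any extra kwargs, along the "output_1" edge. A hedged sketch of calling it directly (the concrete subclass and file path are illustrative):

```python
from pathlib import Path

from haystack.file_converter.txt import TextConverter

converter = TextConverter(valid_languages=["en"])
output, edge = converter.run(file_path=Path("data/sample.txt"), meta={"name": "sample.txt"})

print(edge)                        # "output_1"
print(output["document"]["text"])  # converted text; meta sits under output["document"]["meta"]
```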

View File

@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Dict, Optional, Any
from typing import Dict, Optional, Any, List
import docx
@ -10,14 +10,39 @@ logger = logging.getLogger(__name__)
class DocxToTextConverter(BaseConverter):
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Extract text from a .docx file.
Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
For consistency with the other converters we nevertheless opted to keep the method's name.
:param file_path: Path to the .docx file you want to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
"""
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
if remove_numeric_tables is True:
raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
if valid_languages:
raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")
file = docx.Document(file_path) # Creating word reader object.
paragraphs = [para.text for para in file.paragraphs]

View File

@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
class PDFToTextConverter(BaseConverter):
def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -40,13 +40,30 @@ class PDFToTextConverter(BaseConverter):
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: str = "Latin1",
) -> Dict[str, Any]:
"""
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
:param file_path: Path to the .pdf file you want to convert
:param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
Can be any custom keys and values.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
:param encoding: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding
of pdftotext. While this works well on many PDFs, you might need to switch to "UTF-8" or
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
@ -56,6 +73,10 @@ class PDFToTextConverter(BaseConverter):
"""
pages = self._read_pdf(file_path, layout=False, encoding=encoding)
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
cleaned_pages = []
for page in pages:
@ -76,7 +97,7 @@ class PDFToTextConverter(BaseConverter):
digits = [word for word in words if any(i.isdigit() for i in word)]
# remove lines having > 40% of words as digits AND not ending with a period(.)
if self.remove_numeric_tables:
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
@ -85,7 +106,7 @@ class PDFToTextConverter(BaseConverter):
page = "\n".join(cleaned_lines)
cleaned_pages.append(page)
if self.valid_languages:
if valid_languages:
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text):
logger.warning(

View File

@ -42,7 +42,7 @@ class TikaConverter(BaseConverter):
def __init__(
self,
tika_url: str = "http://localhost:9998/tika",
remove_numeric_tables: Optional[bool] = False,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None
):
"""
@ -65,12 +65,34 @@ class TikaConverter(BaseConverter):
self.tika_url = tika_url
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
:param file_path: Path of file to be converted.
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
:return: a list of pages and the extracted meta data of the file.
"""
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
parser = TikaXHTMLParser()
parser.feed(parsed["content"])
@ -85,7 +107,7 @@ class TikaConverter(BaseConverter):
digits = [word for word in words if any(i.isdigit() for i in word)]
# remove lines having > 40% of words as digits AND not ending with a period(.)
if self.remove_numeric_tables:
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
@ -95,11 +117,11 @@ class TikaConverter(BaseConverter):
page = "\n".join(cleaned_lines)
cleaned_pages.append(page)
if self.valid_languages:
if valid_languages:
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text):
logger.warning(
f"The language for {file_path} is not one of {self.valid_languages}. The file may not have "
f"The language for {file_path} is not one of {valid_languages}. The file may not have "
f"been decoded in the correct text format."
)

View File

@ -8,7 +8,7 @@ logger = logging.getLogger(__name__)
class TextConverter(BaseConverter):
def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
"""
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
@ -22,23 +22,40 @@ class TextConverter(BaseConverter):
in garbled text.
"""
super().__init__(remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages)
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
def convert(self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
encoding: str = "utf-8") -> Dict[str, Any]:
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: str = "utf-8",
) -> Dict[str, Any]:
"""
Reads text from a txt file and executes optional preprocessing steps.
:param file_path: Path of the file to convert
:param meta: Optional meta data that should be associated with the document (e.g. name)
:param encoding: Encoding of the file
:param file_path: path of the file to convert
:param meta: dictionary of metadata key-value pairs to attach to the returned document.
:param remove_numeric_tables: This option uses heuristics to remove numeric rows from tables.
Tabular structures in documents can be noise for a reader model that lacks
table-parsing capability for finding answers. However, tables may also contain
long strings that could be candidate answers, so rows containing strings are
retained when this option is enabled.
:param valid_languages: validate the extracted text against a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used as a check for encoding errors: if the extracted text is
not in one of the valid languages, an encoding error has likely produced
garbled text.
:return: Dict of format {"text": "The text from file", "meta": meta}}
"""
if remove_numeric_tables is None:
remove_numeric_tables = self.remove_numeric_tables
if valid_languages is None:
valid_languages = self.valid_languages
with open(file_path, encoding=encoding, errors="ignore") as f:
text = f.read()
pages = text.split("\f")
@ -52,7 +69,7 @@ class TextConverter(BaseConverter):
digits = [word for word in words if any(i.isdigit() for i in word)]
# remove lines having > 40% of words as digits AND not ending with a period(.)
if self.remove_numeric_tables:
if remove_numeric_tables:
if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
logger.debug(f"Removing line '{line}' from {file_path}")
continue
@ -62,7 +79,7 @@ class TextConverter(BaseConverter):
page = "\n".join(cleaned_lines)
cleaned_pages.append(page)
if self.valid_languages:
if valid_languages:
document_text = "".join(cleaned_pages)
if not self.validate_language(document_text):
logger.warning(

View File

@ -26,10 +26,18 @@ class Pipeline(ABC):
Reader from multiple Retrievers, or re-ranking of candidate documents.
"""
def __init__(self):
def __init__(self, pipeline_type: str = "Query"):
self.graph = DiGraph()
self.root_node_id = "Query"
self.graph.add_node("Query", component=QueryNode())
if pipeline_type == "Query":
self.root_node_id = "Query"
self.graph.add_node("Query", component=RootNode())
elif pipeline_type == "Indexing":
self.root_node_id = "File"
self.graph.add_node("File", component=RootNode())
else:
raise Exception(f"pipeline_type '{pipeline_type}' is not valid. Supported types are 'Query' & 'Indexing'.")
self.pipeline_type = pipeline_type
self.components: dict = {}
def add_node(self, component, name: str, inputs: List[str]):
@ -49,6 +57,10 @@ class Pipeline(ABC):
"""
self.graph.add_node(name, component=component, inputs=inputs)
if len(self.graph.nodes) == 2: # first node added; connect with Root
self.graph.add_edge(self.root_node_id, name, label="output_1")
return
for i in inputs:
if "." in i:
[input_node_name, input_edge_name] = i.split(".")
@ -89,7 +101,7 @@ class Pipeline(ABC):
def run(self, **kwargs):
has_next_node = True
current_node_id = self.root_node_id
input_dict = kwargs
input_dict = {"pipeline_type": self.pipeline_type, **kwargs}
output_dict = None
while has_next_node:
@ -207,14 +219,13 @@ class Pipeline(ABC):
name = definition.pop("name")
definitions[name] = definition
pipeline = cls()
pipeline = cls(pipeline_type=pipeline_config["type"])
components: dict = {} # instances of component objects.
for node_config in pipeline_config["nodes"]:
name = node_config["name"]
component = cls._load_or_get_component(name=name, definitions=definitions, components=components)
if "DocumentStore" not in definitions[name]["type"]: # DocumentStore is not an explicit node in a Pipeline
pipeline.add_node(component=component, name=node_config["name"], inputs=node_config["inputs"])
pipeline.add_node(component=component, name=node_config["name"], inputs=node_config.get("inputs", []))
return pipeline
@ -499,7 +510,7 @@ class TranslationWrapperPipeline(BaseStandardPipeline):
return output
class QueryNode:
class RootNode:
outgoing_edges = 1
def run(self, **kwargs):

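With these changes an indexing pipeline can be assembled programmatically as well as loaded from YAML. A minimal sketch mirroring the test YAML further down; component choices, import paths and the sample file are illustrative assumptions, and a running Elasticsearch instance is assumed:

```python
from pathlib import Path

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.pipeline import Pipeline
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = ElasticsearchDocumentStore(index="documents")
retriever = ElasticsearchRetriever(document_store=document_store)

# "Indexing" pipelines start from the "File" root node instead of "Query".
pipeline = Pipeline(pipeline_type="Indexing")
pipeline.add_node(component=PDFToTextConverter(), name="PDFConverter", inputs=["File"])
pipeline.add_node(component=PreProcessor(), name="PreProcessor", inputs=["PDFConverter"])
pipeline.add_node(component=retriever, name="Retriever", inputs=["PreProcessor"])
pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])

# The converter reads the file, the preprocessor cleans and splits it, and the
# document store node writes the resulting documents via write_documents().
pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
```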
View File

@ -1,17 +1,44 @@
from typing import List, Dict, Any
from typing import List, Dict, Any, Optional
from haystack import BaseComponent
class BasePreProcessor:
def process(self, document: dict) -> List[dict]:
class BasePreProcessor(BaseComponent):
outgoing_edges = 1
def process(
self,
document: dict,
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
split_by: Optional[str] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
) -> List[dict]:
"""
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
"""
cleaned_document = self.clean(document)
split_documents = self.split(cleaned_document)
return split_documents
def clean(self, document: Dict[str, Any]) -> Dict[str, Any]:
raise NotImplementedError
def split(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
def clean(
self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool,
) -> Dict[str, Any]:
raise NotImplementedError
def split(
self,
document: dict,
split_by: str,
split_length: int,
split_overlap: int,
split_respect_sentence_boundary: bool,
) -> List[Dict[str, Any]]:
raise NotImplementedError
def run(self, document: dict, **kwargs):
documents = self.process(document)
result = {"documents": documents, **kwargs}
return result, "output_1"

View File

@ -16,13 +16,13 @@ logger = logging.getLogger(__name__)
class PreProcessor(BasePreProcessor):
def __init__(
self,
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
split_by: Optional[str] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
clean_whitespace: bool = True,
clean_header_footer: bool = False,
clean_empty_lines: bool = True,
split_by: str = "word",
split_length: int = 1000,
split_overlap: int = 0,
split_respect_sentence_boundary: bool = True,
):
"""
:param clean_header_footer: Use heuristic to remove footers and headers across different pages by searching
@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor):
For example, if split_by -> `word`,
split_length -> 5 & split_overlap -> 2, then the splits would be like:
[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
Set the value to None to ensure there is no overlap among the documents after splitting.
Set the value to 0 to ensure there is no overlap among the documents after splitting.
:param split_respect_sentence_boundary: Whether to avoid splitting the text mid-sentence when split_by -> `word`. If set
to True, each split will always contain complete sentences &
the number of words will be <= split_length.
@ -53,18 +53,68 @@ class PreProcessor(BasePreProcessor):
self.split_overlap = split_overlap
self.split_respect_sentence_boundary = split_respect_sentence_boundary
def clean(self, document: dict) -> dict:
def process(
self,
document: dict,
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
) -> List[dict]:
"""
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
"""
if clean_whitespace is None:
clean_whitespace = self.clean_whitespace
if clean_header_footer is None:
clean_header_footer = self.clean_header_footer
if clean_empty_lines is None:
clean_empty_lines = self.clean_empty_lines
if split_by is None:
split_by = self.split_by
if split_length is None:
split_length = self.split_length
if split_overlap is None:
split_overlap = self.split_overlap
if split_respect_sentence_boundary is None:
split_respect_sentence_boundary = self.split_respect_sentence_boundary
cleaned_document = self.clean(
document=document,
clean_whitespace=clean_whitespace,
clean_header_footer=clean_header_footer,
clean_empty_lines=clean_empty_lines,
)
split_documents = self.split(
document=cleaned_document,
split_by=split_by,
split_length=split_length,
split_overlap=split_overlap,
split_respect_sentence_boundary=split_respect_sentence_boundary,
)
return split_documents
def clean(
self,
document: dict,
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
) -> dict:
"""
Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
"""
text = document["text"]
if self.clean_header_footer:
if clean_header_footer:
text = self._find_and_remove_header_footer(
text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
if self.clean_whitespace:
if clean_whitespace:
lines = text.splitlines()
cleaned_lines = []
@ -73,30 +123,37 @@ class PreProcessor(BasePreProcessor):
cleaned_lines.append(line)
text = "\n".join(cleaned_lines)
if self.clean_empty_lines:
if clean_empty_lines:
text = re.sub(r"\n\n+", "\n\n", text)
document["text"] = text
return document
def split(self, document: dict) -> List[dict]:
def split(
self,
document: dict,
split_by: str,
split_length: int,
split_overlap: int,
split_respect_sentence_boundary: bool,
) -> List[dict]:
"""Perform document splitting on a single document. This method can split on different units, at different lengths,
with different strides. It can also respect sentence boundaries. Its exact functionality is defined by
the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents. """
if not self.split_by:
if not split_by:
return [document]
if not self.split_length:
if not split_length:
raise Exception("split_length needs be set when using split_by.")
if self.split_respect_sentence_boundary and self.split_by not in("word","sentence"):
if split_respect_sentence_boundary and split_by not in("word","sentence"):
raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with"
" split_by='word' or split_by='sentence'.")
text = document["text"]
if self.split_respect_sentence_boundary and self.split_by == "word":
if split_respect_sentence_boundary and split_by == "word":
# split by words ensuring no sub sentence splits
sentences = nltk.tokenize.sent_tokenize(text)
word_count = 0
@ -104,17 +161,17 @@ class PreProcessor(BasePreProcessor):
current_slice: List[str] = []
for sen in sentences:
current_word_count = len(sen.split(" "))
if current_word_count > self.split_length:
if current_word_count > split_length:
logger.warning(f"A sentence found with word count higher than the split length.")
if word_count + current_word_count > self.split_length:
if word_count + current_word_count > split_length:
list_splits.append(current_slice)
#Enable split_stride with split_by='word' while respecting sentence boundaries.
if self.split_overlap:
# Enable split_stride with split_by='word' while respecting sentence boundaries.
if split_overlap:
overlap = []
w_count = 0
for s in current_slice[::-1]:
sen_len = len(s.split(" "))
if w_count < self.split_overlap:
if w_count < split_overlap:
overlap.append(s)
w_count += sen_len
else:
@ -136,20 +193,20 @@ class PreProcessor(BasePreProcessor):
text_splits.append(txt)
else:
# create individual "elements" of passage, sentence, or word
if self.split_by == "passage":
if split_by == "passage":
elements = text.split("\n\n")
elif self.split_by == "sentence":
elif split_by == "sentence":
elements = nltk.tokenize.sent_tokenize(text)
elif self.split_by == "word":
elif split_by == "word":
elements = text.split(" ")
else:
raise NotImplementedError("PreProcessor only supports 'passage', 'sentence' or 'word' split_by options.")
# concatenate individual elements based on split_length & split_stride
if self.split_overlap:
segments = windowed(elements, n=self.split_length, step=self.split_length - self.split_overlap)
if split_overlap:
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
else:
segments = windowed(elements, n=self.split_length, step=self.split_length)
segments = windowed(elements, n=split_length, step=split_length)
text_splits = []
for seg in segments:
txt = " ".join([t for t in seg if t])

View File

@ -4,7 +4,7 @@ import logging
from time import perf_counter
from functools import wraps
from tqdm import tqdm
from copy import deepcopy
from haystack import Document, BaseComponent
from haystack.document_store.base import BaseDocumentStore
@ -168,12 +168,21 @@ class BaseRetriever(BaseComponent):
else:
return metrics
def run(
self,
query: str,
filters: Optional[dict] = None,
top_k_retriever: Optional[int] = None,
**kwargs,
def run(self, pipeline_type: str, **kwargs):
if pipeline_type == "Query":
output, stream = self.run_query(**kwargs)
elif pipeline_type == "Indexing":
output, stream = self.run_indexing(**kwargs)
else:
raise Exception(f"Invalid pipeline_type '{pipeline_type}'.")
return output, stream
def run_query(
self,
query: str,
filters: Optional[dict] = None,
top_k_retriever: Optional[int] = None,
**kwargs,
):
if top_k_retriever:
documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever)
@ -188,3 +197,14 @@ class BaseRetriever(BaseComponent):
}
return output, "output_1"
def run_indexing(self, documents: List[dict], **kwargs):
if self.__class__.__name__ in ["DensePassageRetriever", "EmbeddingRetriever"]:
documents = deepcopy(documents)
document_objects = [Document.from_dict(doc) for doc in documents]
embeddings = self.embed_passages(document_objects) # type: ignore
for doc, emb in zip(documents, embeddings):
doc["embedding"] = emb
output = {**kwargs, "documents": documents}
return output, "output_1"
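In the indexing branch, dense retrievers enrich documents with embeddings before they reach the document store. A hedged sketch of that behaviour outside a pipeline (the FAISS store and the retriever's default models are illustrative assumptions):

```python
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.retriever.dense import DensePassageRetriever

document_store = FAISSDocumentStore()
retriever = DensePassageRetriever(document_store=document_store)

docs = [{"text": "Berlin is the capital of Germany.", "meta": {"name": "berlin.txt"}}]

# run_indexing() deep-copies the dicts, calls embed_passages(), and attaches an
# "embedding" to each document before emitting them on the "output_1" edge.
output, edge = retriever.run(pipeline_type="Indexing", documents=docs)
document_store.write_documents(output["documents"])
```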

View File

@ -14,12 +14,34 @@ components:
- name: TestDocumentStore
type: ElasticsearchDocumentStore
params:
index: haystack_test
index: haystack_test_pipeline
- name: TestPDFConverter
type: PDFToTextConverter
params:
remove_numeric_tables: false
- name: TestPreprocessor
type: PreProcessor
params:
clean_whitespace: true
pipelines:
- name: test_query_pipeline
type: Query
nodes:
- name: TestESRetriever
inputs: [Query]
- name: TestReader
inputs: [TestESRetriever]
inputs: [TestESRetriever]
- name: test_indexing_pipeline
type: Indexing
nodes:
- name: TestPDFConverter
inputs: [File]
- name: TestPreprocessor
inputs: [TestPDFConverter]
- name: TestESRetriever
inputs: [TestPreprocessor]
- name: TestDocumentStore
inputs: [TestESRetriever]

View File

@ -11,12 +11,16 @@ from haystack.retriever.sparse import ElasticsearchRetriever
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_load_yaml(document_store_with_docs):
# test correct load of indexing pipeline from yaml
pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"),
pipeline_name="test_indexing_pipeline")
pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"), top_k_retriever=10, top_k_reader=3)
# # test correct load from yaml
pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml", pipeline_name="my_query"))
prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3)
assert prediction["query"] == "Who lives in Berlin?"
assert prediction["answers"][0]["answer"] == "Carla"
# test correct load of query pipeline from yaml
pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"), pipeline_name="test_query_pipeline")
prediction = pipeline.run(query="Who made the PDF specification?", top_k_retriever=10, top_k_reader=3)
assert prediction["query"] == "Who made the PDF specification?"
assert prediction["answers"][0]["answer"] == "Adobe Systems"
# test invalid pipeline name
with pytest.raises(Exception):

View File

@ -93,7 +93,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture):
"multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
"filter": [{"terms": {"year": ${years}}}]}}}""",
)
results = retriever.run(query="test", filters={"years": ["2020", "2021"]})[0]["documents"]
results = retriever.retrieve(query="test", filters={"years": ["2020", "2021"]})
assert len(results) == 4
# test custom "term" query
@ -108,7 +108,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture):
"multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}],
"filter": [{"term": {"year": ${years}}}]}}}""",
)
results = retriever.run(query="test", filters={"years": "2021"})[0]["documents"]
results = retriever.retrieve(query="test", filters={"years": "2021"})
assert len(results) == 3