diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index 90a70edbc..c85b330f8 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -5,7 +5,7 @@ ## BaseConverter Objects ```python -class BaseConverter() +class BaseConverter(BaseComponent) ``` Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore. @@ -14,7 +14,7 @@ Base class for implementing file converts to transform input documents to text f #### \_\_init\_\_ ```python - | __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) + | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) ``` **Arguments**: @@ -35,7 +35,7 @@ in garbled text. ```python | @abstractmethod - | convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any] + | convert(file_path: Path, meta: Optional[Dict[str, str]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any] ``` Convert a file to a dictionary containing the text and any associated meta data. @@ -47,6 +47,16 @@ supplied meta data like author, url, external IDs can be supplied as a dictionar - `file_path`: path of the file to convert - `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. #### validate\_language @@ -71,7 +81,7 @@ class TextConverter(BaseConverter) #### \_\_init\_\_ ```python - | __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None) + | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) ``` **Arguments**: @@ -91,16 +101,25 @@ in garbled text. #### convert ```python - | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "utf-8") -> Dict[str, Any] + | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "utf-8") -> Dict[str, Any] ``` Reads text from a txt file and executes optional preprocessing steps. **Arguments**: -- `file_path`: Path of the file to convert -- `meta`: Optional meta data that should be associated with the the document (e.g. name) -- `encoding`: Encoding of the file +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. 
+The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. **Returns**: @@ -120,7 +139,7 @@ class DocxToTextConverter(BaseConverter) #### convert ```python - | convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any] + | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any] ``` Extract text from a .docx file. @@ -130,6 +149,17 @@ For compliance with other converters we nevertheless opted for keeping the metho **Arguments**: - `file_path`: Path to the .docx file you want to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. # Module tika @@ -145,7 +175,7 @@ class TikaConverter(BaseConverter) #### \_\_init\_\_ ```python - | __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None) + | __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) ``` **Arguments**: @@ -166,12 +196,23 @@ in garbled text. #### convert ```python - | convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any] + | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None) -> Dict[str, Any] ``` **Arguments**: -- `file_path`: Path of file to be converted. +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. 
**Returns**: @@ -191,7 +232,7 @@ class PDFToTextConverter(BaseConverter) #### \_\_init\_\_ ```python - | __init__(remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None) + | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None) ``` **Arguments**: @@ -211,7 +252,7 @@ in garbled text. #### convert ```python - | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any] + | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: str = "Latin1") -> Dict[str, Any] ``` Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html) @@ -221,6 +262,16 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade - `file_path`: Path to the .pdf file you want to convert - `meta`: Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. - `encoding`: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding of pdftotext. While this works well on many PDFs, it might be needed to switch to "UTF-8" or others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...). diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index 0bbc28b42..4aced3419 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -5,14 +5,14 @@ ## BasePreProcessor Objects ```python -class BasePreProcessor() +class BasePreProcessor(BaseComponent) ``` #### process ```python - | process(document: dict) -> List[dict] + | process(document: dict, clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict] ``` Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. 
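+
+A minimal usage sketch for the concrete `PreProcessor` documented below (illustrative only; the example text and
+parameter values are assumptions, not part of the generated reference). Arguments passed to `process()` override
+the defaults configured in `__init__()`:
+
+```python
+from haystack.preprocessor.preprocessor import PreProcessor
+
+processor = PreProcessor()
+docs = processor.process(
+    document={"text": "Raw text produced by one of the file converters ..."},
+    split_by="word",
+    split_length=200,
+    split_overlap=20,
+)
+```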
@@ -31,7 +31,7 @@ class PreProcessor(BasePreProcessor) #### \_\_init\_\_ ```python - | __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) + | __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True) ``` **Arguments**: @@ -50,16 +50,25 @@ Setting this to a positive number essentially enables the sliding window approac For example, if split_by -> `word`, split_length -> 5 & split_overlap -> 2, then the splits would be like: [w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12]. -Set the value to None to ensure there is no overlap among the documents after splitting. +Set the value to 0 to ensure there is no overlap among the documents after splitting. - `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set to True, the individual split will always have complete sentences & the number of words will be <= split_length. + +#### process + +```python + | process(document: dict, clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict] +``` + +Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. + #### clean ```python - | clean(document: dict) -> dict + | clean(document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool) -> dict ``` Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers @@ -69,7 +78,7 @@ and empty lines. Its exact functionality is defined by the parameters passed int #### split ```python - | split(document: dict) -> List[dict] + | split(document: dict, split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool) -> List[dict] ``` Perform document splitting on a single document. This method can split on different units, at different lengths, diff --git a/haystack/document_store/base.py b/haystack/document_store/base.py index 8431f0260..742d49c42 100644 --- a/haystack/document_store/base.py +++ b/haystack/document_store/base.py @@ -206,5 +206,6 @@ class BaseDocumentStore(BaseComponent): def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None): pass - def run(self, **kwargs): - raise NotImplementedError + def run(self, documents: List[dict], index: Optional[str] = None, **kwargs): + self.write_documents(documents=documents, index=index) + return kwargs, "output_1" diff --git a/haystack/file_converter/base.py b/haystack/file_converter/base.py index 324e814d9..de9bfc78a 100644 --- a/haystack/file_converter/base.py +++ b/haystack/file_converter/base.py @@ -4,13 +4,17 @@ from typing import List, Optional, Dict, Any import langdetect +from haystack import BaseComponent -class BaseConverter: + +class BaseConverter(BaseComponent): """ Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore. 
""" - def __init__(self, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None): + outgoing_edges = 1 + + def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None): """ :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it @@ -27,7 +31,13 @@ class BaseConverter: self.valid_languages = valid_languages @abstractmethod - def convert(self, file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]: + def convert( + self, + file_path: Path, + meta: Optional[Dict[str, str]], + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + ) -> Dict[str, Any]: """ Convert a file to a dictionary containing the text and any associated meta data. @@ -36,6 +46,16 @@ class BaseConverter: :param file_path: path of the file to convert :param meta: dictionary of meta data key-value pairs to append in the returned document. + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could possible candidate for searching answers. + The rows containing strings are thus retained in this option. + :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 + (https://en.wikipedia.org/wiki/ISO_639-1) format. + This option can be used to add test for encoding errors. If the extracted text is + not one of the valid languages, then it might likely be encoding error resulting + in garbled text. """ pass @@ -56,4 +76,20 @@ class BaseConverter: else: return False + def run( + self, + file_path: Path, + meta: Optional[Dict[str, str]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + **kwargs + ): + document = self.convert( + file_path=file_path, + meta=meta, + remove_numeric_tables=remove_numeric_tables, + valid_languages=valid_languages, + ) + result = {"document": document, **kwargs} + return result, "output_1" diff --git a/haystack/file_converter/docx.py b/haystack/file_converter/docx.py index 4d1842e2f..bef961968 100644 --- a/haystack/file_converter/docx.py +++ b/haystack/file_converter/docx.py @@ -1,6 +1,6 @@ import logging from pathlib import Path -from typing import Dict, Optional, Any +from typing import Dict, Optional, Any, List import docx @@ -10,14 +10,39 @@ logger = logging.getLogger(__name__) class DocxToTextConverter(BaseConverter): - def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + def convert( + self, + file_path: Path, + meta: Optional[Dict[str, str]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + ) -> Dict[str, Any]: """ Extract text from a .docx file. Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here. For compliance with other converters we nevertheless opted for keeping the methods name. :param file_path: Path to the .docx file you want to convert + :param meta: dictionary of meta data key-value pairs to append in the returned document. + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. 
+            The tabular structures in documents might be noise for the reader model if it
+            does not have table parsing capability for finding answers. However, tables
+            may also have long strings that could be possible candidates for searching answers.
+            The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+            (https://en.wikipedia.org/wiki/ISO_639-1) format.
+            This option can be used to add a test for encoding errors. If the extracted text is
+            not one of the valid languages, then there is likely an encoding error resulting
+            in garbled text.
         """
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages
+        if remove_numeric_tables:
+            raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
+        if valid_languages:
+            raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")
         file = docx.Document(file_path)  # Creating word reader object.
         paragraphs = [para.text for para in file.paragraphs]
diff --git a/haystack/file_converter/pdf.py b/haystack/file_converter/pdf.py
index a850ed343..9c5c61f5d 100644
--- a/haystack/file_converter/pdf.py
+++ b/haystack/file_converter/pdf.py
@@ -9,7 +9,7 @@ logger = logging.getLogger(__name__)
 
 
 class PDFToTextConverter(BaseConverter):
-    def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None):
+    def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None):
         """
         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
             The tabular structures in documents might be noise for the reader model if it
@@ -40,13 +40,30 @@ class PDFToTextConverter(BaseConverter):
         super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
 
-    def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None, encoding: str = "Latin1") -> Dict[str, Any]:
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: str = "Latin1",
+    ) -> Dict[str, Any]:
         """
         Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
 
         :param file_path: Path to the .pdf file you want to convert
         :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                      Can be any custom keys and values.
+        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
+            The tabular structures in documents might be noise for the reader model if it
+            does not have table parsing capability for finding answers. However, tables
+            may also have long strings that could be possible candidates for searching answers.
+            The rows containing strings are thus retained in this option.
+        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
+            (https://en.wikipedia.org/wiki/ISO_639-1) format.
+            This option can be used to add a test for encoding errors. If the extracted text is
+            not one of the valid languages, then there is likely an encoding error resulting
+            in garbled text.
         :param encoding: Encoding that will be passed as -enc parameter to pdftotext. "Latin 1" is the default encoding of pdftotext.
While this works well on many PDFs, it might be needed to switch to "UTF-8" or others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...). @@ -56,6 +73,10 @@ class PDFToTextConverter(BaseConverter): """ pages = self._read_pdf(file_path, layout=False, encoding=encoding) + if remove_numeric_tables is None: + remove_numeric_tables = self.remove_numeric_tables + if valid_languages is None: + valid_languages = self.valid_languages cleaned_pages = [] for page in pages: @@ -76,7 +97,7 @@ class PDFToTextConverter(BaseConverter): digits = [word for word in words if any(i.isdigit() for i in word)] # remove lines having > 40% of words as digits AND not ending with a period(.) - if self.remove_numeric_tables: + if remove_numeric_tables: if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): logger.debug(f"Removing line '{line}' from {file_path}") continue @@ -85,7 +106,7 @@ class PDFToTextConverter(BaseConverter): page = "\n".join(cleaned_lines) cleaned_pages.append(page) - if self.valid_languages: + if valid_languages: document_text = "".join(cleaned_pages) if not self.validate_language(document_text): logger.warning( diff --git a/haystack/file_converter/tika.py b/haystack/file_converter/tika.py index 4f5d1c7d4..bec82965e 100644 --- a/haystack/file_converter/tika.py +++ b/haystack/file_converter/tika.py @@ -42,7 +42,7 @@ class TikaConverter(BaseConverter): def __init__( self, tika_url: str = "http://localhost:9998/tika", - remove_numeric_tables: Optional[bool] = False, + remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None ): """ @@ -65,12 +65,34 @@ class TikaConverter(BaseConverter): self.tika_url = tika_url super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) - def convert(self, file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + def convert( + self, + file_path: Path, + meta: Optional[Dict[str, str]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + ) -> Dict[str, Any]: """ - :param file_path: Path of file to be converted. + :param file_path: path of the file to convert + :param meta: dictionary of meta data key-value pairs to append in the returned document. + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could possible candidate for searching answers. + The rows containing strings are thus retained in this option. + :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 + (https://en.wikipedia.org/wiki/ISO_639-1) format. + This option can be used to add test for encoding errors. If the extracted text is + not one of the valid languages, then it might likely be encoding error resulting + in garbled text. :return: a list of pages and the extracted meta data of the file. 
""" + if remove_numeric_tables is None: + remove_numeric_tables = self.remove_numeric_tables + if valid_languages is None: + valid_languages = self.valid_languages + parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True) parser = TikaXHTMLParser() parser.feed(parsed["content"]) @@ -85,7 +107,7 @@ class TikaConverter(BaseConverter): digits = [word for word in words if any(i.isdigit() for i in word)] # remove lines having > 40% of words as digits AND not ending with a period(.) - if self.remove_numeric_tables: + if remove_numeric_tables: if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): logger.debug(f"Removing line '{line}' from {file_path}") continue @@ -95,11 +117,11 @@ class TikaConverter(BaseConverter): page = "\n".join(cleaned_lines) cleaned_pages.append(page) - if self.valid_languages: + if valid_languages: document_text = "".join(cleaned_pages) if not self.validate_language(document_text): logger.warning( - f"The language for {file_path} is not one of {self.valid_languages}. The file may not have " + f"The language for {file_path} is not one of {valid_languages}. The file may not have " f"been decoded in the correct text format." ) diff --git a/haystack/file_converter/txt.py b/haystack/file_converter/txt.py index 9eb79ea7d..8cc97050c 100644 --- a/haystack/file_converter/txt.py +++ b/haystack/file_converter/txt.py @@ -8,7 +8,7 @@ logger = logging.getLogger(__name__) class TextConverter(BaseConverter): - def __init__(self, remove_numeric_tables: Optional[bool] = False, valid_languages: Optional[List[str]] = None): + def __init__(self, remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None): """ :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it @@ -22,23 +22,40 @@ class TextConverter(BaseConverter): in garbled text. """ - super().__init__(remove_numeric_tables=remove_numeric_tables, - valid_languages=valid_languages) + super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages) - def convert(self, - file_path: Path, - meta: Optional[Dict[str, str]] = None, - encoding: str = "utf-8") -> Dict[str, Any]: + def convert( + self, + file_path: Path, + meta: Optional[Dict[str, str]] = None, + remove_numeric_tables: Optional[bool] = None, + valid_languages: Optional[List[str]] = None, + encoding: str = "utf-8", + ) -> Dict[str, Any]: """ Reads text from a txt file and executes optional preprocessing steps. - :param file_path: Path of the file to convert - :param meta: Optional meta data that should be associated with the the document (e.g. name) - :param encoding: Encoding of the file + :param file_path: path of the file to convert + :param meta: dictionary of meta data key-value pairs to append in the returned document. + :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could possible candidate for searching answers. + The rows containing strings are thus retained in this option. + :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 + (https://en.wikipedia.org/wiki/ISO_639-1) format. + This option can be used to add test for encoding errors. 
If the extracted text is + not one of the valid languages, then it might likely be encoding error resulting + in garbled text. :return: Dict of format {"text": "The text from file", "meta": meta}} """ + if remove_numeric_tables is None: + remove_numeric_tables = self.remove_numeric_tables + if valid_languages is None: + valid_languages = self.valid_languages + with open(file_path, encoding=encoding, errors="ignore") as f: text = f.read() pages = text.split("\f") @@ -52,7 +69,7 @@ class TextConverter(BaseConverter): digits = [word for word in words if any(i.isdigit() for i in word)] # remove lines having > 40% of words as digits AND not ending with a period(.) - if self.remove_numeric_tables: + if remove_numeric_tables: if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."): logger.debug(f"Removing line '{line}' from {file_path}") continue @@ -62,7 +79,7 @@ class TextConverter(BaseConverter): page = "\n".join(cleaned_lines) cleaned_pages.append(page) - if self.valid_languages: + if valid_languages: document_text = "".join(cleaned_pages) if not self.validate_language(document_text): logger.warning( diff --git a/haystack/pipeline.py b/haystack/pipeline.py index a45394669..cd22ba2eb 100644 --- a/haystack/pipeline.py +++ b/haystack/pipeline.py @@ -26,10 +26,18 @@ class Pipeline(ABC): Reader from multiple Retrievers, or re-ranking of candidate documents. """ - def __init__(self): + def __init__(self, pipeline_type: str = "Query"): self.graph = DiGraph() - self.root_node_id = "Query" - self.graph.add_node("Query", component=QueryNode()) + if pipeline_type == "Query": + self.root_node_id = "Query" + self.graph.add_node("Query", component=RootNode()) + elif pipeline_type == "Indexing": + self.root_node_id = "File" + self.graph.add_node("File", component=RootNode()) + else: + raise Exception(f"pipeline_type '{pipeline_type}' is not valid. Supported types are 'Query' & 'Indexing'.") + + self.pipeline_type = pipeline_type self.components: dict = {} def add_node(self, component, name: str, inputs: List[str]): @@ -49,6 +57,10 @@ class Pipeline(ABC): """ self.graph.add_node(name, component=component, inputs=inputs) + if len(self.graph.nodes) == 2: # first node added; connect with Root + self.graph.add_edge(self.root_node_id, name, label="output_1") + return + for i in inputs: if "." in i: [input_node_name, input_edge_name] = i.split(".") @@ -89,7 +101,7 @@ class Pipeline(ABC): def run(self, **kwargs): has_next_node = True current_node_id = self.root_node_id - input_dict = kwargs + input_dict = {"pipeline_type": self.pipeline_type, **kwargs} output_dict = None while has_next_node: @@ -207,14 +219,13 @@ class Pipeline(ABC): name = definition.pop("name") definitions[name] = definition - pipeline = cls() + pipeline = cls(pipeline_type=pipeline_config["type"]) components: dict = {} # instances of component objects. 
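+        # All configured components, including DocumentStores, are added to the graph as regular nodes;
+        # a node definition may omit "inputs", which falls back to an empty list here.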
for node_config in pipeline_config["nodes"]: name = node_config["name"] component = cls._load_or_get_component(name=name, definitions=definitions, components=components) - if "DocumentStore" not in definitions[name]["type"]: # DocumentStore is not an explicit node in a Pipeline - pipeline.add_node(component=component, name=node_config["name"], inputs=node_config["inputs"]) + pipeline.add_node(component=component, name=node_config["name"], inputs=node_config.get("inputs", [])) return pipeline @@ -499,7 +510,7 @@ class TranslationWrapperPipeline(BaseStandardPipeline): return output -class QueryNode: +class RootNode: outgoing_edges = 1 def run(self, **kwargs): diff --git a/haystack/preprocessor/base.py b/haystack/preprocessor/base.py index 0aec4e62f..3a3962cdc 100644 --- a/haystack/preprocessor/base.py +++ b/haystack/preprocessor/base.py @@ -1,17 +1,44 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional + +from haystack import BaseComponent -class BasePreProcessor: - def process(self, document: dict) -> List[dict]: +class BasePreProcessor(BaseComponent): + outgoing_edges = 1 + + def process( + self, + document: dict, + clean_whitespace: Optional[bool] = True, + clean_header_footer: Optional[bool] = False, + clean_empty_lines: Optional[bool] = True, + split_by: Optional[str] = "word", + split_length: Optional[int] = 1000, + split_overlap: Optional[int] = None, + split_respect_sentence_boundary: Optional[bool] = True, + ) -> List[dict]: """ Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. """ - cleaned_document = self.clean(document) - split_documents = self.split(cleaned_document) - return split_documents - - def clean(self, document: Dict[str, Any]) -> Dict[str, Any]: raise NotImplementedError - def split(self, document: Dict[str, Any]) -> List[Dict[str, Any]]: + def clean( + self, document: dict, clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, + ) -> Dict[str, Any]: raise NotImplementedError + + def split( + self, + document: dict, + split_by: str, + split_length: int, + split_overlap: int, + split_respect_sentence_boundary: bool, + ) -> List[Dict[str, Any]]: + raise NotImplementedError + + def run(self, document: dict, **kwargs): + documents = self.process(document) + + result = {"documents": documents, **kwargs} + return result, "output_1" diff --git a/haystack/preprocessor/preprocessor.py b/haystack/preprocessor/preprocessor.py index c5431da77..395df745b 100644 --- a/haystack/preprocessor/preprocessor.py +++ b/haystack/preprocessor/preprocessor.py @@ -16,13 +16,13 @@ logger = logging.getLogger(__name__) class PreProcessor(BasePreProcessor): def __init__( self, - clean_whitespace: Optional[bool] = True, - clean_header_footer: Optional[bool] = False, - clean_empty_lines: Optional[bool] = True, - split_by: Optional[str] = "word", - split_length: Optional[int] = 1000, - split_overlap: Optional[int] = None, - split_respect_sentence_boundary: Optional[bool] = True, + clean_whitespace: bool = True, + clean_header_footer: bool = False, + clean_empty_lines: bool = True, + split_by: str = "word", + split_length: int = 1000, + split_overlap: int = 0, + split_respect_sentence_boundary: bool = True, ): """ :param clean_header_footer: Use heuristic to remove footers and headers across different pages by searching @@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor): For example, if split_by -> `word`, split_length -> 5 & split_overlap -> 2, then the splits would be like: [w1 w2 w3 w4 
w5, w4 w5 w6 w7 w8, w7 w8 w10 w11 w12]. - Set the value to None to ensure there is no overlap among the documents after splitting. + Set the value to 0 to ensure there is no overlap among the documents after splitting. :param split_respect_sentence_boundary: Whether to split in partial sentences if split_by -> `word`. If set to True, the individual split will always have complete sentences & the number of words will be <= split_length. @@ -53,18 +53,68 @@ class PreProcessor(BasePreProcessor): self.split_overlap = split_overlap self.split_respect_sentence_boundary = split_respect_sentence_boundary - def clean(self, document: dict) -> dict: + def process( + self, + document: dict, + clean_whitespace: Optional[bool] = None, + clean_header_footer: Optional[bool] = None, + clean_empty_lines: Optional[bool] = None, + split_by: Optional[str] = None, + split_length: Optional[int] = None, + split_overlap: Optional[int] = None, + split_respect_sentence_boundary: Optional[bool] = None, + ) -> List[dict]: + """ + Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. + """ + if clean_whitespace is None: + clean_whitespace = self.clean_whitespace + if clean_header_footer is None: + clean_header_footer = self.clean_header_footer + if clean_empty_lines is None: + clean_empty_lines = self.clean_empty_lines + if split_by is None: + split_by = self.split_by + if split_length is None: + split_length = self.split_length + if split_overlap is None: + split_overlap = self.split_overlap + if split_respect_sentence_boundary is None: + split_respect_sentence_boundary = self.split_respect_sentence_boundary + + cleaned_document = self.clean( + document=document, + clean_whitespace=clean_whitespace, + clean_header_footer=clean_header_footer, + clean_empty_lines=clean_empty_lines, + ) + split_documents = self.split( + document=cleaned_document, + split_by=split_by, + split_length=split_length, + split_overlap=split_overlap, + split_respect_sentence_boundary=split_respect_sentence_boundary, + ) + return split_documents + + def clean( + self, + document: dict, + clean_whitespace: bool, + clean_header_footer: bool, + clean_empty_lines: bool, + ) -> dict: """ Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__(). """ text = document["text"] - if self.clean_header_footer: + if clean_header_footer: text = self._find_and_remove_header_footer( text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1 ) - if self.clean_whitespace: + if clean_whitespace: lines = text.splitlines() cleaned_lines = [] @@ -73,30 +123,37 @@ class PreProcessor(BasePreProcessor): cleaned_lines.append(line) text = "\n".join(cleaned_lines) - if self.clean_empty_lines: + if clean_empty_lines: text = re.sub(r"\n\n+", "\n\n", text) document["text"] = text return document - def split(self, document: dict) -> List[dict]: + def split( + self, + document: dict, + split_by: str, + split_length: int, + split_overlap: int, + split_respect_sentence_boundary: bool, + ) -> List[dict]: """Perform document splitting on a single document. This method can split on different units, at different lengths, with different strides. It can also respect sentence boundaries. Its exact functionality is defined by the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents. 
""" - if not self.split_by: + if not split_by: return [document] - if not self.split_length: + if not split_length: raise Exception("split_length needs be set when using split_by.") - if self.split_respect_sentence_boundary and self.split_by not in("word","sentence"): + if split_respect_sentence_boundary and split_by not in("word","sentence"): raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with" " split_by='word' or split_by='sentence'.") text = document["text"] - if self.split_respect_sentence_boundary and self.split_by == "word": + if split_respect_sentence_boundary and split_by == "word": # split by words ensuring no sub sentence splits sentences = nltk.tokenize.sent_tokenize(text) word_count = 0 @@ -104,17 +161,17 @@ class PreProcessor(BasePreProcessor): current_slice: List[str] = [] for sen in sentences: current_word_count = len(sen.split(" ")) - if current_word_count > self.split_length: + if current_word_count > split_length: logger.warning(f"A sentence found with word count higher than the split length.") - if word_count + current_word_count > self.split_length: + if word_count + current_word_count > split_length: list_splits.append(current_slice) - #Enable split_stride with split_by='word' while respecting sentence boundaries. - if self.split_overlap: + # Enable split_stride with split_by='word' while respecting sentence boundaries. + if split_overlap: overlap = [] w_count = 0 for s in current_slice[::-1]: sen_len = len(s.split(" ")) - if w_count < self.split_overlap: + if w_count < split_overlap: overlap.append(s) w_count += sen_len else: @@ -136,20 +193,20 @@ class PreProcessor(BasePreProcessor): text_splits.append(txt) else: # create individual "elements" of passage, sentence, or word - if self.split_by == "passage": + if split_by == "passage": elements = text.split("\n\n") - elif self.split_by == "sentence": + elif split_by == "sentence": elements = nltk.tokenize.sent_tokenize(text) - elif self.split_by == "word": + elif split_by == "word": elements = text.split(" ") else: raise NotImplementedError("PreProcessor only supports 'passage', 'sentence' or 'word' split_by options.") # concatenate individual elements based on split_length & split_stride - if self.split_overlap: - segments = windowed(elements, n=self.split_length, step=self.split_length - self.split_overlap) + if split_overlap: + segments = windowed(elements, n=split_length, step=split_length - split_overlap) else: - segments = windowed(elements, n=self.split_length, step=self.split_length) + segments = windowed(elements, n=split_length, step=split_length) text_splits = [] for seg in segments: txt = " ".join([t for t in seg if t]) diff --git a/haystack/retriever/base.py b/haystack/retriever/base.py index 58f35d341..5e396ebb3 100644 --- a/haystack/retriever/base.py +++ b/haystack/retriever/base.py @@ -4,7 +4,7 @@ import logging from time import perf_counter from functools import wraps from tqdm import tqdm - +from copy import deepcopy from haystack import Document, BaseComponent from haystack.document_store.base import BaseDocumentStore @@ -168,12 +168,21 @@ class BaseRetriever(BaseComponent): else: return metrics - def run( - self, - query: str, - filters: Optional[dict] = None, - top_k_retriever: Optional[int] = None, - **kwargs, + def run(self, pipeline_type: str, **kwargs): + if pipeline_type == "Query": + output, stream = self.run_query(**kwargs) + elif pipeline_type == "Indexing": + output, stream = self.run_indexing(**kwargs) + else: + raise Exception(f"Invalid pipeline_type 
'{pipeline_type}'.") + return output, stream + + def run_query( + self, + query: str, + filters: Optional[dict] = None, + top_k_retriever: Optional[int] = None, + **kwargs, ): if top_k_retriever: documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever) @@ -188,3 +197,14 @@ class BaseRetriever(BaseComponent): } return output, "output_1" + + def run_indexing(self, documents: List[dict], **kwargs): + if self.__class__.__name__ in ["DensePassageRetriever", "EmbeddingRetriever"]: + documents = deepcopy(documents) + document_objects = [Document.from_dict(doc) for doc in documents] + embeddings = self.embed_passages(document_objects) # type: ignore + for doc, emb in zip(documents, embeddings): + doc["embedding"] = emb + + output = {**kwargs, "documents": documents} + return output, "output_1" diff --git a/test/samples/pipeline/test_pipeline.yaml b/test/samples/pipeline/test_pipeline.yaml index cc1ba433a..ad70f593c 100644 --- a/test/samples/pipeline/test_pipeline.yaml +++ b/test/samples/pipeline/test_pipeline.yaml @@ -14,12 +14,34 @@ components: - name: TestDocumentStore type: ElasticsearchDocumentStore params: - index: haystack_test + index: haystack_test_pipeline + - name: TestPDFConverter + type: PDFToTextConverter + params: + remove_numeric_tables: false + - name: TestPreprocessor + type: PreProcessor + params: + clean_whitespace: true + pipelines: - name: test_query_pipeline + type: Query nodes: - name: TestESRetriever inputs: [Query] - name: TestReader - inputs: [TestESRetriever] \ No newline at end of file + inputs: [TestESRetriever] + + - name: test_indexing_pipeline + type: Indexing + nodes: + - name: TestPDFConverter + inputs: [File] + - name: TestPreprocessor + inputs: [TestPDFConverter] + - name: TestESRetriever + inputs: [TestPreprocessor] + - name: TestDocumentStore + inputs: [TestESRetriever] diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 68abaf910..ae9e34941 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -11,12 +11,16 @@ from haystack.retriever.sparse import ElasticsearchRetriever @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_load_yaml(document_store_with_docs): + # test correct load of indexing pipeline from yaml + pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"), + pipeline_name="test_indexing_pipeline") + pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"), top_k_retriever=10, top_k_reader=3) - # # test correct load from yaml - pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml", pipeline_name="my_query")) - prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3) - assert prediction["query"] == "Who lives in Berlin?" - assert prediction["answers"][0]["answer"] == "Carla" + # test correct load of query pipeline from yaml + pipeline = Pipeline.load_from_yaml(Path("samples/pipeline/test_pipeline.yaml"), pipeline_name="test_query_pipeline") + prediction = pipeline.run(query="Who made the PDF specification?", top_k_retriever=10, top_k_reader=3) + assert prediction["query"] == "Who made the PDF specification?" 
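+    # the expected answer comes from samples/pdf/sample_pdf_1.pdf, indexed by the indexing pipeline tested above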
+ assert prediction["answers"][0]["answer"] == "Adobe Systems" # test invalid pipeline name with pytest.raises(Exception): diff --git a/test/test_retriever.py b/test/test_retriever.py index 9b6df8d25..79a8b7766 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -93,7 +93,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture): "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}], "filter": [{"terms": {"year": ${years}}}]}}}""", ) - results = retriever.run(query="test", filters={"years": ["2020", "2021"]})[0]["documents"] + results = retriever.retrieve(query="test", filters={"years": ["2020", "2021"]}) assert len(results) == 4 # test custom "term" query @@ -108,7 +108,7 @@ def test_elasticsearch_custom_query(elasticsearch_fixture): "multi_match": {"query": ${query}, "type": "most_fields", "fields": ["text"]}}], "filter": [{"term": {"year": ${years}}}]}}}""", ) - results = retriever.run(query="test", filters={"years": "2021"})[0]["documents"] + results = retriever.retrieve(query="test", filters={"years": "2021"}) assert len(results) == 3
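For reference, a minimal sketch of building the same kind of indexing pipeline programmatically with the API introduced in this change (assumes a running Elasticsearch instance; the index name, node names, and parameter values are illustrative):

```python
from pathlib import Path

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.pipeline import Pipeline
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.retriever.sparse import ElasticsearchRetriever

document_store = ElasticsearchDocumentStore(index="documents")
retriever = ElasticsearchRetriever(document_store=document_store)

# "Indexing" pipelines start from the "File" root node instead of "Query".
indexing_pipeline = Pipeline(pipeline_type="Indexing")
indexing_pipeline.add_node(component=PDFToTextConverter(remove_numeric_tables=False), name="PDFConverter", inputs=["File"])
indexing_pipeline.add_node(component=PreProcessor(split_length=500, split_overlap=0), name="Preprocessor", inputs=["PDFConverter"])
indexing_pipeline.add_node(component=retriever, name="Retriever", inputs=["Preprocessor"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["Retriever"])

# Converter -> PreProcessor -> Retriever -> DocumentStore: the file is converted, cleaned and split,
# passed through the retriever (dense retrievers would attach embeddings here), and finally written.
indexing_pipeline.run(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
```

Keeping the retriever between the PreProcessor and the DocumentStore mirrors the YAML pipeline above: a DensePassageRetriever or EmbeddingRetriever enriches documents with embeddings at indexing time, while a sparse ElasticsearchRetriever simply passes them through.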