diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index 8fd6a86c3..e954390de 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -17,10 +17,7 @@ Base class for implementing file converts to transform input documents to text f #### BaseConverter.\_\_init\_\_ ```python -def __init__(remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = None, - id_hash_keys: Optional[List[str]] = None, - progress_bar: bool = True) +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True) ``` **Arguments**: @@ -47,12 +44,7 @@ In this case the id will be generated by using the content and the defined metad ```python @abstractmethod -def convert(file_path: Path, - meta: Optional[Dict[str, Any]], - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "UTF-8", - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Convert a file to a dictionary containing the text and any associated meta data. @@ -85,8 +77,7 @@ In this case the id will be generated by using the content and the defined metad #### BaseConverter.validate\_language ```python -def validate_language(text: str, - valid_languages: Optional[List[str]] = None) -> bool +def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool ``` Validate if the language of the text is one of valid languages. @@ -96,14 +87,7 @@ Validate if the language of the text is one of valid languages. #### BaseConverter.run ```python -def run(file_paths: Union[Path, List[Path]], - meta: Optional[Union[Dict[str, str], - List[Optional[Dict[str, str]]]]] = None, - remove_numeric_tables: Optional[bool] = None, - known_ligatures: Dict[str, str] = KNOWN_LIGATURES, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "UTF-8", - id_hash_keys: Optional[List[str]] = None) +def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) ``` Extract text from a file. @@ -153,12 +137,7 @@ class DocxToTextConverter(BaseConverter) #### DocxToTextConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, str]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = None, - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text from a .docx file. 
@@ -203,9 +182,7 @@ class ImageToTextConverter(BaseConverter) #### ImageToTextConverter.\_\_init\_\_ ```python -def __init__(remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = ["eng"], - id_hash_keys: Optional[List[str]] = None) +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None) ``` **Arguments**: @@ -232,12 +209,7 @@ In this case the id will be generated by using the content and the defined metad #### ImageToTextConverter.convert ```python -def convert(file_path: Union[Path, str], - meta: Optional[Dict[str, str]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = None, - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) @@ -275,20 +247,35 @@ In this case the id will be generated by using the content and the defined metad class MarkdownConverter(BaseConverter) ``` + + +#### MarkdownConverter.\_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, remove_code_snippets: bool = True, extract_headlines: bool = False) +``` + +**Arguments**: + +- `remove_numeric_tables`: Not applicable. +- `valid_languages`: Not applicable. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. +- `progress_bar`: Show a progress bar for the conversion. +- `remove_code_snippets`: Whether to remove snippets from the markdown file. +- `extract_headlines`: Whether to extract headings from the markdown file. + #### MarkdownConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, str]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "utf-8", - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, remove_code_snippets: Optional[bool] = None, extract_headlines: Optional[bool] = None) -> List[Document] ``` -Reads text from a markdown file and executes optional preprocessing steps. +Reads text from a txt file and executes optional preprocessing steps. **Arguments**: @@ -301,21 +288,8 @@ Reads text from a markdown file and executes optional preprocessing steps. attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). 
In this case the id will be generated by using the content and the defined metadata. - - - -#### MarkdownConverter.markdown\_to\_text - -```python -@staticmethod -def markdown_to_text(markdown_string: str) -> str -``` - -Converts a markdown string to plaintext - -**Arguments**: - -- `markdown_string`: String in markdown format +- `remove_code_snippets`: Whether to remove snippets from the markdown file. +- `extract_headlines`: Whether to extract headings from the markdown file. @@ -334,11 +308,7 @@ class PDFToTextConverter(BaseConverter) #### PDFToTextConverter.\_\_init\_\_ ```python -def __init__(remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = None, - id_hash_keys: Optional[List[str]] = None, - encoding: Optional[str] = "UTF-8", - keep_physical_layout: bool = False) +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8") ``` **Arguments**: @@ -360,20 +330,13 @@ In this case the id will be generated by using the content and the defined metad - `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`. Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...). (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal) -- `keep_physical_layout`: This option will maintain original physical layout on the extracted text. -It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order. #### PDFToTextConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, Any]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = None, - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html) @@ -395,8 +358,6 @@ not one of the valid languages, then it might likely be encoding error resulting in garbled text. - `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. (See list of available encodings by running `pdftotext -listenc` in the terminal) -- `keep_physical_layout`: This option will maintain original physical layout on the extracted text. -It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order. - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). 
@@ -415,9 +376,7 @@ class PDFToTextOCRConverter(BaseConverter) #### PDFToTextOCRConverter.\_\_init\_\_ ```python -def __init__(remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = ["eng"], - id_hash_keys: Optional[List[str]] = None) +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None) ``` Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) @@ -444,12 +403,7 @@ In this case the id will be generated by using the content and the defined metad #### PDFToTextOCRConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, Any]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = None, - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Convert a file to a dictionary containing the text and any associated meta data. @@ -499,17 +453,7 @@ Supported file formats are: PDF, DOCX #### ParsrConverter.\_\_init\_\_ ```python -def __init__(parsr_url: str = "http://localhost:3001", - extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", - table_detection_mode: Literal["lattice", "stream"] = "lattice", - preceding_context_len: int = 3, - following_context_len: int = 3, - remove_page_headers: bool = False, - remove_page_footers: bool = False, - remove_table_of_contents: bool = False, - valid_languages: Optional[List[str]] = None, - id_hash_keys: Optional[List[str]] = None, - add_page_number: bool = True) +def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True) ``` **Arguments**: @@ -543,12 +487,7 @@ In this case the id will be generated by using the content and the defined metad #### ParsrConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, Any]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "utf-8", - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text and tables from a PDF or DOCX using the open-source Parsr tool. 
@@ -597,16 +536,7 @@ https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quick #### AzureConverter.\_\_init\_\_ ```python -def __init__(endpoint: str, - credential_key: str, - model_id: str = "prebuilt-document", - valid_languages: Optional[List[str]] = None, - save_json: bool = False, - preceding_context_len: int = 3, - following_context_len: int = 3, - merge_multiple_column_headers: bool = True, - id_hash_keys: Optional[List[str]] = None, - add_page_number: bool = True) +def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True) ``` **Arguments**: @@ -641,14 +571,7 @@ In this case the id will be generated by using the content and the defined metad #### AzureConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, Any]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "utf-8", - id_hash_keys: Optional[List[str]] = None, - pages: Optional[str] = None, - known_language: Optional[str] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document] ``` Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service. @@ -680,11 +603,7 @@ See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales #### AzureConverter.convert\_azure\_json ```python -def convert_azure_json( - file_path: Path, - meta: Optional[Dict[str, Any]] = None, - valid_languages: Optional[List[str]] = None, - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Extract text and tables from the JSON output of Azure's Form Recognizer service. 
@@ -721,10 +640,7 @@ class TikaConverter(BaseConverter) #### TikaConverter.\_\_init\_\_ ```python -def __init__(tika_url: str = "http://localhost:9998/tika", - remove_numeric_tables: bool = False, - valid_languages: Optional[List[str]] = None, - id_hash_keys: Optional[List[str]] = None) +def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) ``` **Arguments**: @@ -750,12 +666,7 @@ In this case the id will be generated by using the content and the defined metad #### TikaConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, str]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = None, - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` **Arguments**: @@ -799,12 +710,7 @@ class TextConverter(BaseConverter) #### TextConverter.convert ```python -def convert(file_path: Path, - meta: Optional[Dict[str, str]] = None, - remove_numeric_tables: Optional[bool] = None, - valid_languages: Optional[List[str]] = None, - encoding: Optional[str] = "utf-8", - id_hash_keys: Optional[List[str]] = None) -> List[Document] +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] ``` Reads text from a txt file and executes optional preprocessing steps. 
diff --git a/haystack/json-schemas/haystack-pipeline-1.11.0rc0.schema.json b/haystack/json-schemas/haystack-pipeline-1.11.0rc0.schema.json index 5411f0e63..ce36bc76b 100644 --- a/haystack/json-schemas/haystack-pipeline-1.11.0rc0.schema.json +++ b/haystack/json-schemas/haystack-pipeline-1.11.0rc0.schema.json @@ -2421,6 +2421,11 @@ "additionalProperties": false, "description": "Each parameter can reference other components defined in the same YAML file.", "properties": { + "extract_headlines": { + "default": false, + "title": "Extract Headlines", + "type": "boolean" + }, "id_hash_keys": { "anyOf": [ { @@ -2440,6 +2445,11 @@ "title": "Progress Bar", "type": "boolean" }, + "remove_code_snippets": { + "default": true, + "title": "Remove Code Snippets", + "type": "boolean" + }, "remove_numeric_tables": { "default": false, "title": "Remove Numeric Tables", diff --git a/haystack/json-schemas/haystack-pipeline-main.schema.json b/haystack/json-schemas/haystack-pipeline-main.schema.json index a286d64d1..d36511a08 100644 --- a/haystack/json-schemas/haystack-pipeline-main.schema.json +++ b/haystack/json-schemas/haystack-pipeline-main.schema.json @@ -2421,6 +2421,11 @@ "additionalProperties": false, "description": "Each parameter can reference other components defined in the same YAML file.", "properties": { + "extract_headlines": { + "default": false, + "title": "Extract Headlines", + "type": "boolean" + }, "id_hash_keys": { "anyOf": [ { @@ -2440,6 +2445,11 @@ "title": "Progress Bar", "type": "boolean" }, + "remove_code_snippets": { + "default": true, + "title": "Remove Code Snippets", + "type": "boolean" + }, "remove_numeric_tables": { "default": false, "title": "Remove Numeric Tables", diff --git a/haystack/nodes/file_converter/markdown.py b/haystack/nodes/file_converter/markdown.py index e539bac78..7c5af831c 100644 --- a/haystack/nodes/file_converter/markdown.py +++ b/haystack/nodes/file_converter/markdown.py @@ -1,10 +1,10 @@ import logging import re from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple, Any try: - from bs4 import BeautifulSoup + from bs4 import BeautifulSoup, NavigableString from markdown import markdown except (ImportError, ModuleNotFoundError) as ie: from haystack.utils.import_utils import _optional_component_not_installed @@ -19,14 +19,46 @@ logger = logging.getLogger(__name__) class MarkdownConverter(BaseConverter): + def __init__( + self, + remove_numeric_tables: bool = False, + valid_languages: Optional[List[str]] = None, + id_hash_keys: Optional[List[str]] = None, + progress_bar: bool = True, + remove_code_snippets: bool = True, + extract_headlines: bool = False, + ): + """ + :param remove_numeric_tables: Not applicable. + :param valid_languages: Not applicable. + :param id_hash_keys: Generate the document ID from a custom list of strings that refer to the document's + attributes. To make sure you don't have duplicate documents in your DocumentStore if texts are + not unique, you can modify the metadata and pass for example, `"meta"` to this field ([`"content"`, `"meta"`]). + In this case, the ID is generated by using the content and the defined metadata. + :param progress_bar: Show a progress bar for the conversion. + :param remove_code_snippets: Whether to remove snippets from the markdown file. + :param extract_headlines: Whether to extract headings from the markdown file. 
+ """ + super().__init__( + remove_numeric_tables=remove_numeric_tables, + valid_languages=valid_languages, + id_hash_keys=id_hash_keys, + progress_bar=progress_bar, + ) + + self.remove_code_snippets = remove_code_snippets + self.extract_headlines = extract_headlines + def convert( self, file_path: Path, - meta: Optional[Dict[str, str]] = None, + meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, + remove_code_snippets: Optional[bool] = None, + extract_headlines: Optional[bool] = None, ) -> List[Document]: """ Reads text from a markdown file and executes optional preprocessing steps. @@ -40,32 +72,53 @@ class MarkdownConverter(BaseConverter): attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the id will be generated by using the content and the defined metadata. + :param remove_code_snippets: Whether to remove snippets from the markdown file. + :param extract_headlines: Whether to extract headings from the markdown file. """ - if id_hash_keys is None: - id_hash_keys = self.id_hash_keys + + id_hash_keys = id_hash_keys if id_hash_keys is not None else self.id_hash_keys + remove_code_snippets = remove_code_snippets if remove_code_snippets is not None else self.remove_code_snippets + extract_headlines = extract_headlines if extract_headlines is not None else self.extract_headlines + with open(file_path, encoding=encoding, errors="ignore") as f: markdown_text = f.read() - text = self.markdown_to_text(markdown_text) + + # md -> html -> text since BeautifulSoup can extract text cleanly + html = markdown(markdown_text) + + # remove code snippets + if remove_code_snippets: + html = re.sub(r"
(.*?)", " ", html, flags=re.DOTALL) + html = re.sub(r"
(.*?)", " ", html, flags=re.DOTALL)
+ soup = BeautifulSoup(html, "html.parser")
+
+ if extract_headlines:
+ text, headlines = self._extract_text_and_headlines(soup)
+ if meta is None:
+ meta = {}
+ meta["headlines"] = headlines
+ else:
+ text = soup.get_text()
+
document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
return [document]
- # Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
@staticmethod
- def markdown_to_text(markdown_string: str) -> str:
+ def _extract_text_and_headlines(soup: BeautifulSoup) -> Tuple[str, List[Dict]]:
"""
- Converts a markdown string to plaintext
-
- :param markdown_string: String in markdown format
+ Extracts text and headings from a soup object.
"""
- # md -> html -> text since BeautifulSoup can extract text cleanly
- html = markdown(markdown_string)
+ headline_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
+ headlines = []
+ text = ""
+ for desc in soup.descendants:
+ if desc.name in headline_tags:
+ current_headline = desc.get_text()
+ current_start_idx = len(text)
+ current_level = int(desc.name[-1]) - 1
+ headlines.append({"headline": current_headline, "start_idx": current_start_idx, "level": current_level})
- # remove code snippets
- html = re.sub(r"(.*?)", " ", html) - html = re.sub(r"
(.*?)", " ", html)
+ if isinstance(desc, NavigableString):
+ text += desc.get_text()
- # extract text
- soup = BeautifulSoup(html, "html.parser")
- text = "".join(soup.findAll(text=True))
-
- return text
+ return text, headlines
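A minimal usage sketch of the extended converter (illustrative only: `sample.md` is a placeholder path, and the snippet assumes Haystack is installed with its markdown dependencies):

```python
from pathlib import Path

from haystack.nodes.file_converter.markdown import MarkdownConverter

# Drop code blocks and collect headings while converting (placeholder file path).
converter = MarkdownConverter(remove_code_snippets=True, extract_headlines=True)
docs = converter.convert(file_path=Path("sample.md"))

doc = docs[0]
print(doc.content[:100])  # plain text extracted from the markdown
for headline in doc.meta["headlines"]:
    # each entry looks like {"headline": "Intro", "start_idx": 0, "level": 0} (level 0 == h1)
    print(headline["level"], headline["start_idx"], headline["headline"])
```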
diff --git a/haystack/nodes/preprocessor/preprocessor.py b/haystack/nodes/preprocessor/preprocessor.py
index 985cf640d..fe69fb7db 100644
--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@@ -3,7 +3,7 @@ import re
from copy import deepcopy
from functools import partial, reduce
from itertools import chain
-from typing import List, Optional, Generator, Set, Union
+from typing import List, Optional, Generator, Set, Union, Tuple, Dict
try:
from typing import Literal
@@ -47,8 +47,6 @@ iso639_to_nltk = {
"ml": "malayalam",
}
-EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
-
class PreProcessor(BasePreProcessor):
def __init__(
@@ -261,35 +259,22 @@ class PreProcessor(BasePreProcessor):
text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
+ headlines = document.meta["headlines"] if "headlines" in document.meta else []
+
if clean_whitespace:
- pages = text.split("\f")
- cleaned_pages = []
- for page in pages:
- if not page:
- # there are many "empty text" pages in a marketing document, as for example the cover page. If we just forget about them, we have a mismatch
- # with page numbers which causes problems later on. Therefore, we replace them with a dummy text, which will not be found by any query.
- cleaned_page = EMPTY_PAGE_PLACEHOLDER
- else:
- lines = page.splitlines()
- cleaned_lines = []
- for line in lines:
- line = line.strip()
- cleaned_lines.append(line)
- cleaned_page = "\n".join(cleaned_lines)
-
- cleaned_pages.append(cleaned_page)
-
- text = "\f".join(cleaned_pages)
+ text, headlines = self._clean_whitespace(text=text, headlines=headlines)
if clean_empty_lines:
- text = re.sub(r"\n\n+", "\n\n", text)
+ text, headlines = self._clean_empty_lines(text=text, headlines=headlines)
for substring in remove_substrings:
- text = text.replace(substring, "")
+ text, headlines = self._remove_substring(text=text, substring=substring, headlines=headlines)
if text != document.content:
document = deepcopy(document)
document.content = text
+ if headlines:
+ document.meta["headlines"] = headlines
return document
@@ -328,131 +313,302 @@ class PreProcessor(BasePreProcessor):
return [document]
text = document.content
+ headlines = document.meta["headlines"] if "headlines" in document.meta else []
if split_respect_sentence_boundary and split_by == "word":
- # split by words ensuring no sub sentence splits
- if self.add_page_number:
- # SentenceTokenizer will remove "\f" if it is at the end of a sentence, so substituting it in these
- # cases for "[NEW_PAGE]" to don't lose any page breaks.
- text = self._substitute_page_breaks(text)
- sentences = self._split_sentences(text)
-
- word_count_slice = 0
- cur_page = 1
- splits_pages = []
- list_splits = []
- current_slice: List[str] = []
- for sen in sentences:
- if self.add_page_number and "[NEW_PAGE]" in sen:
- sen = sen.replace("[NEW_PAGE]", "\f")
-
- word_count_sen = len(sen.split(" "))
- if word_count_sen > split_length:
- long_sentence_message = f"One or more sentence found with word count higher than the split length."
- if long_sentence_message not in self.print_log:
- self.print_log.add(long_sentence_message)
- logger.warning(long_sentence_message)
- if word_count_slice + word_count_sen > split_length:
- # Number of words exceeds split_length -> save current slice and start a new one
- if current_slice:
- list_splits.append(current_slice)
- splits_pages.append(cur_page)
-
- if split_overlap:
- overlap = []
- processed_sents = []
- word_count_overlap = 0
- current_slice_copy = deepcopy(current_slice)
- for idx, s in reversed(list(enumerate(current_slice))):
- sen_len = len(s.split(" "))
- if word_count_overlap < split_overlap:
- overlap.append(s)
- word_count_overlap += sen_len
- current_slice_copy.pop(idx)
- else:
- processed_sents = current_slice_copy
- break
- current_slice = list(reversed(overlap))
- word_count_slice = word_count_overlap
- else:
- processed_sents = current_slice
- current_slice = []
- word_count_slice = 0
-
- # Count number of page breaks in processed sentences
- if self.add_page_number:
- num_page_breaks = self._count_processed_page_breaks(
- sentences=processed_sents,
- split_overlap=split_overlap,
- overlapping_sents=current_slice,
- current_sent=sen,
- )
- cur_page += num_page_breaks
-
- current_slice.append(sen)
- word_count_slice += word_count_sen
-
- if current_slice:
- list_splits.append(current_slice)
- splits_pages.append(cur_page)
-
- text_splits = []
- for sl in list_splits:
- txt = " ".join(sl)
- if len(txt) > 0:
- text_splits.append(txt)
+ text_splits, splits_pages, splits_start_idxs = self._split_by_word_respecting_sent_boundary(
+ text=text, split_length=split_length, split_overlap=split_overlap
+ )
else:
# create individual "elements" of passage, sentence, or word
- if split_by == "passage":
- elements = text.split("\n\n")
- elif split_by == "sentence":
- if self.add_page_number:
- # SentenceTokenizer will remove "\f" if it is at the end of a sentence, so substituting it in these
- # cases for "[NEW_PAGE]" to don't lose any page breaks.
- text = self._substitute_page_breaks(text)
- elements = self._split_sentences(text)
- elif split_by == "word":
- elements = text.split(" ")
- else:
- raise NotImplementedError(
- "PreProcessor only supports 'passage', 'sentence' or 'word' split_by options."
- )
+ elements, split_at = self._split_into_units(text=text, split_by=split_by)
# concatenate individual elements based on split_length & split_stride
- if split_overlap:
- segments = windowed(elements, n=split_length, step=split_length - split_overlap)
- else:
- segments = windowed(elements, n=split_length, step=split_length)
- text_splits = []
- splits_pages = []
- cur_page = 1
- for seg in segments:
- current_units = [unit for unit in seg if unit is not None]
- txt = " ".join(current_units)
- if len(txt) > 0:
- text_splits.append(txt)
- splits_pages.append(cur_page)
- if self.add_page_number:
- processed_units = current_units[: split_length - split_overlap]
- num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
- cur_page += num_page_breaks
+ text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
+ elements=elements, split_length=split_length, split_overlap=split_overlap, split_at=split_at
+ )
# create new document dicts for each text split
- documents = []
- for i, txt in enumerate(text_splits):
- # now we want to get rid of the empty page placeholder and skip the split if there's nothing left
- txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
- if not txt_clean.strip():
- continue
+ documents = self._create_docs_from_splits(
+ text_splits=text_splits,
+ splits_pages=splits_pages,
+ splits_start_idxs=splits_start_idxs,
+ headlines=headlines,
+ meta=document.meta or {},
+ id_hash_keys=id_hash_keys,
+ )
- doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
+ return documents
+
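For orientation, a hedged end-to-end sketch of how the refactored cleaning and splitting path is typically driven (parameter values and the sample document are invented for illustration; `split_respect_sentence_boundary=True` assumes the NLTK sentence-tokenizer data is available):

```python
from haystack import Document
from haystack.nodes import PreProcessor

preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=100,
    split_overlap=10,
    split_respect_sentence_boundary=True,
    add_page_number=True,
)

doc = Document(
    content="Intro\nSome text on the first page.\fMore text on the second page.",
    meta={"headlines": [{"headline": "Intro", "start_idx": 0, "level": 0}]},
)

# Each split carries _split_id, page and, if headlines were passed in, the adjusted headlines.
for split in preprocessor.process([doc]):
    print(split.meta["_split_id"], split.meta["page"], split.meta.get("headlines"))
```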
+ @staticmethod
+ def _clean_whitespace(text: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
+ """
+ Strips leading and trailing whitespace from each line in the text.
+ """
+ pages = text.split("\f")
+ cleaned_pages = []
+ cur_headline_idx = 0
+ num_headlines = len(headlines)
+ cur_char_idx = 0
+ num_removed_chars_total = 0
+ for page in pages:
+ lines = page.splitlines()
+ cleaned_lines = []
+ for idx, line in enumerate(lines):
+ old_line_len = len(line)
+ cleaned_line = line.strip()
+ cleaned_line_len = len(cleaned_line)
+ cur_char_idx += old_line_len + 1 # add 1 for newline char
+ if old_line_len != cleaned_line_len:
+ num_removed_chars_current = old_line_len - cleaned_line_len
+ num_removed_chars_total += num_removed_chars_current
+ for headline_idx in range(cur_headline_idx, num_headlines):
+ if cur_char_idx - num_removed_chars_total <= headlines[headline_idx]["start_idx"]:
+ headlines[headline_idx]["start_idx"] -= num_removed_chars_current
+ else:
+ cur_headline_idx += 1
+
+ cleaned_lines.append(cleaned_line)
+ cleaned_page = "\n".join(cleaned_lines)
+ cleaned_pages.append(cleaned_page)
+
+ cleaned_text = "\f".join(cleaned_pages)
+ return cleaned_text, headlines
+
+ @staticmethod
+ def _clean_empty_lines(text: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
+ if headlines:
+ num_headlines = len(headlines)
+ multiple_new_line_matches = re.finditer(r"\n\n\n+", text)
+ cur_headline_idx = 0
+ num_removed_chars_accumulated = 0
+ for match in multiple_new_line_matches:
+ num_removed_chars_current = match.end() - match.start() - 2
+ for headline_idx in range(cur_headline_idx, num_headlines):
+ if match.end() - num_removed_chars_accumulated <= headlines[headline_idx]["start_idx"]:
+ headlines[headline_idx]["start_idx"] -= num_removed_chars_current
+ else:
+ cur_headline_idx += 1
+ num_removed_chars_accumulated += num_removed_chars_current
+
+ cleaned_text = re.sub(r"\n\n\n+", "\n\n", text)
+ return cleaned_text, headlines
+
+ @staticmethod
+ def _remove_substring(text: str, substring: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
+ if headlines:
+ num_headlines = len(headlines)
+ multiple_substring_matches = re.finditer(re.escape(substring), text)
+ cur_headline_idx = 0
+ num_removed_chars_accumulated = 0
+ for match in multiple_substring_matches:
+ for headline_idx in range(cur_headline_idx, num_headlines):
+ if match.end() - num_removed_chars_accumulated <= headlines[headline_idx]["start_idx"]:
+ headlines[headline_idx]["start_idx"] -= len(substring)
+ else:
+ cur_headline_idx += 1
+ num_removed_chars_accumulated += len(substring)
+
+ cleaned_text = text.replace(substring, "")
+ return cleaned_text, headlines
+
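To make the offset bookkeeping concrete, here is a simplified standalone sketch (not taken from the diff) of the idea behind `_clean_empty_lines`: characters removed before a headline shift its `start_idx` left by the number of characters dropped. For brevity the sketch handles a single match and skips the accumulation logic of the real implementation:

```python
import re

text = "Heading\n\n\n\n\nSecond heading\nparagraph."
headlines = [{"headline": "Second heading", "start_idx": 12, "level": 1}]

# Collapse runs of three or more newlines to two and shift later headlines accordingly.
for match in re.finditer(r"\n\n\n+", text):
    removed = match.end() - match.start() - 2
    for headline in headlines:
        if headline["start_idx"] >= match.end():
            headline["start_idx"] -= removed

cleaned = re.sub(r"\n\n\n+", "\n\n", text)
assert cleaned[headlines[0]["start_idx"]:].startswith("Second heading")
```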
+ def _split_by_word_respecting_sent_boundary(
+ self, text: str, split_length: int, split_overlap: int
+ ) -> Tuple[List[str], List[int], List[int]]:
+ """
+ Splits the text into parts of split_length words while respecting sentence boundaries.
+ """
+ sentences = self._split_sentences(text)
+
+ word_count_slice = 0
+ cur_page = 1
+ cur_start_idx = 0
+ splits_pages = []
+ list_splits = []
+ splits_start_idxs = []
+ current_slice: List[str] = []
+ for sen in sentences:
+ word_count_sen = len(sen.split())
+
+ if word_count_sen > split_length:
+ long_sentence_message = (
+ f"We found one or more sentences whose word count is higher than the split length."
+ )
+ if long_sentence_message not in self.print_log:
+ self.print_log.add(long_sentence_message)
+ logger.warning(long_sentence_message)
+
+ if word_count_slice + word_count_sen > split_length:
+ # Number of words exceeds split_length -> save current slice and start a new one
+ if current_slice:
+ list_splits.append(current_slice)
+ splits_pages.append(cur_page)
+ splits_start_idxs.append(cur_start_idx)
+
+ if split_overlap:
+ overlap = []
+ processed_sents = []
+ word_count_overlap = 0
+ current_slice_copy = deepcopy(current_slice)
+ for idx, s in reversed(list(enumerate(current_slice))):
+ sen_len = len(s.split())
+ if word_count_overlap < split_overlap:
+ overlap.append(s)
+ word_count_overlap += sen_len
+ current_slice_copy.pop(idx)
+ else:
+ processed_sents = current_slice_copy
+ break
+ current_slice = list(reversed(overlap))
+ word_count_slice = word_count_overlap
+ else:
+ processed_sents = current_slice
+ current_slice = []
+ word_count_slice = 0
+
+ cur_start_idx += len("".join(processed_sents))
+
+ # Count number of page breaks in processed sentences
+ if self.add_page_number:
+ num_page_breaks = self._count_processed_page_breaks(
+ sentences=processed_sents,
+ split_overlap=split_overlap,
+ overlapping_sents=current_slice,
+ current_sent=sen,
+ )
+ cur_page += num_page_breaks
+
+ current_slice.append(sen)
+ word_count_slice += word_count_sen
+
+ if current_slice:
+ list_splits.append(current_slice)
+ splits_pages.append(cur_page)
+ splits_start_idxs.append(cur_start_idx)
+
+ text_splits = []
+ for sl in list_splits:
+ txt = "".join(sl)
+ if len(txt) > 0:
+ text_splits.append(txt)
+
+ return text_splits, splits_pages, splits_start_idxs
+
+ def _split_into_units(self, text: str, split_by: str) -> Tuple[List[str], str]:
+ if split_by == "passage":
+ elements = text.split("\n\n")
+ split_at = "\n\n"
+ elif split_by == "sentence":
+ elements = self._split_sentences(text)
+ split_at = "" # whitespace will be preserved while splitting text into sentences
+ elif split_by == "word":
+ elements = text.split(" ")
+ split_at = " "
+ else:
+ raise NotImplementedError("PreProcessor only supports 'passage', 'sentence' or 'word' split_by options.")
+
+ return elements, split_at
+
+ def _concatenate_units(
+ self, elements: List[str], split_length: int, split_overlap: int, split_at: str
+ ) -> Tuple[List[str], List[int], List[int]]:
+ """
+ Concatenates the elements into parts of split_length units.
+ """
+ segments = windowed(elements, n=split_length, step=split_length - split_overlap)
+ split_at_len = len(split_at)
+ text_splits = []
+ splits_pages = []
+ splits_start_idxs = []
+ cur_page = 1
+ cur_start_idx = 0
+ for seg in segments:
+ current_units = [unit for unit in seg if unit is not None]
+ txt = split_at.join(current_units)
+ if len(txt) > 0:
+ text_splits.append(txt)
+ splits_pages.append(cur_page)
+ splits_start_idxs.append(cur_start_idx)
+ processed_units = current_units[: split_length - split_overlap]
+ cur_start_idx += len((split_at_len * " ").join(processed_units)) + split_at_len
+ if self.add_page_number:
+ num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
+ cur_page += num_page_breaks
+
+ return text_splits, splits_pages, splits_start_idxs
+
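`_concatenate_units` builds its sliding window with the `windowed` helper that Haystack's preprocessor module takes from `more_itertools`; a short illustration of how `split_length` and `split_overlap` map onto its `n` and `step` arguments (example values chosen arbitrarily):

```python
from more_itertools import windowed

units = ["one", "two", "three", "four", "five"]
split_length, split_overlap = 3, 1

# step = split_length - split_overlap, exactly as in _concatenate_units;
# the last window is padded with None, which the list comprehension filters out.
for segment in windowed(units, n=split_length, step=split_length - split_overlap):
    print([unit for unit in segment if unit is not None])
# ['one', 'two', 'three']
# ['three', 'four', 'five']
# ['five']
```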
+ def _create_docs_from_splits(
+ self,
+ text_splits: List[str],
+ splits_pages: List[int],
+ splits_start_idxs: List[int],
+ headlines: List[Dict],
+ meta: Dict,
+ id_hash_keys: Optional[List[str]] = None,
+ ) -> List[Document]:
+ """
+ Creates Document objects from text splits, enriching them with page number and headline information if available.
+ """
+ documents = []
+
+ earliest_rel_hl = 0
+ for i, txt in enumerate(text_splits):
+ meta = deepcopy(meta)
+ doc = Document(content=txt, meta=meta, id_hash_keys=id_hash_keys)
doc.meta["_split_id"] = i
if self.add_page_number:
doc.meta["page"] = splits_pages[i]
+ if headlines:
+ split_start_idx = splits_start_idxs[i]
+ relevant_headlines, earliest_rel_hl = self._extract_relevant_headlines_for_split(
+ headlines=headlines, split_txt=txt, split_start_idx=split_start_idx, earliest_rel_hl=earliest_rel_hl
+ )
+ doc.meta["headlines"] = relevant_headlines
+
documents.append(doc)
return documents
+ @staticmethod
+ def _extract_relevant_headlines_for_split(
+ headlines: List[Dict], split_txt: str, split_start_idx: int, earliest_rel_hl: int
+ ) -> Tuple[List[Dict], int]:
+ """
+ Given a list of headlines, a text split, and the split's start index in the original text, extracts the
+ headlines that are relevant for that split.
+ """
+ relevant_headlines = []
+
+ for headline_idx in range(earliest_rel_hl, len(headlines)):
+ # Headline is part of current split
+ if split_start_idx <= headlines[headline_idx]["start_idx"] < split_start_idx + len(split_txt):
+ headline_copy = deepcopy(headlines[headline_idx])
+ headline_copy["start_idx"] = headlines[headline_idx]["start_idx"] - split_start_idx
+ relevant_headlines.append(headline_copy)
+ # Headline appears before current split, but might be relevant for current split
+ elif headlines[headline_idx]["start_idx"] < split_start_idx:
+ # Check if following headlines are on a higher level
+ headline_to_check = headline_idx + 1
+ headline_is_relevant = True
+ while (
+ headline_to_check < len(headlines) and headlines[headline_to_check]["start_idx"] <= split_start_idx
+ ):
+ if headlines[headline_to_check]["level"] <= headlines[headline_idx]["level"]:
+ headline_is_relevant = False
+ break
+ headline_to_check += 1
+ if headline_is_relevant:
+ headline_copy = deepcopy(headlines[headline_idx])
+ headline_copy["start_idx"] = None
+ relevant_headlines.append(headline_copy)
+ else:
+ earliest_rel_hl += 1
+ # Headline (and all subsequent ones) only relevant for later splits
+ elif headlines[headline_idx]["start_idx"] > split_start_idx + len(split_txt):
+ break
+
+ return relevant_headlines, earliest_rel_hl
+
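Because `_extract_relevant_headlines_for_split` is a staticmethod, its behaviour can be probed in isolation. A hypothetical example (offsets and headline data invented for illustration): a headline that falls inside the split keeps a split-local `start_idx`, while a still-open section headline from before the split is carried over with `start_idx` set to `None`:

```python
from haystack.nodes import PreProcessor

headlines = [
    {"headline": "Chapter 1", "start_idx": 0, "level": 0},      # before the split, still "open"
    {"headline": "Section 1.2", "start_idx": 120, "level": 1},  # inside the split
]
split_txt = "x" * 80   # stand-in for the split's text
split_start_idx = 100  # the split covers characters 100-179 of the original text

relevant, earliest = PreProcessor._extract_relevant_headlines_for_split(
    headlines=headlines, split_txt=split_txt, split_start_idx=split_start_idx, earliest_rel_hl=0
)
print(relevant)
# [{'headline': 'Chapter 1', 'start_idx': None, 'level': 0},
#  {'headline': 'Section 1.2', 'start_idx': 20, 'level': 1}]
```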
def _find_and_remove_header_footer(
self, text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
) -> str:
@@ -542,46 +698,74 @@ class PreProcessor(BasePreProcessor):
:param text: str, text to tokenize
:return: list[str], list of sentences
"""
- sentences = []
-
language_name = iso639_to_nltk.get(self.language)
+ sentence_tokenizer = self._load_sentence_tokenizer(language_name)
+ # The following adjustment of PunktSentenceTokenizer is inspired by:
+ # https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
+ # It is needed for preserving whitespace while splitting text into sentences.
+ period_context_fmt = r"""
+ %(SentEndChars)s # a potential sentence ending
+ \s* # match potential whitespace (is originally in lookahead assertion)
+ (?=(?P