feat: Extraction of headlines in markdown files (#3445)

* Extract headings from markdown files + adapt PreProcessor

* Add tests

* Fix mypy

* Generate JSON schema

* Apply suggestions from code review

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Update haystack/nodes/file_converter/markdown.py

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* Apply black

* Add PR feedback

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
bogdankostic 2022-10-26 11:57:55 +02:00 committed by GitHub
parent 5ca96357ff
commit 4fbe80c098
8 changed files with 735 additions and 337 deletions

View File

@@ -17,10 +17,7 @@ Base class for implementing file converters to transform input documents to text format
#### BaseConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
progress_bar: bool = True)
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True)
```
**Arguments**:
@@ -47,12 +44,7 @@ In this case the id will be generated by using the content and the defined metad
```python
@abstractmethod
def convert(file_path: Path,
meta: Optional[Dict[str, Any]],
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Convert a file to a dictionary containing the text and any associated meta data.
@@ -85,8 +77,7 @@ In this case the id will be generated by using the content and the defined metad
#### BaseConverter.validate\_language
```python
def validate_language(text: str,
valid_languages: Optional[List[str]] = None) -> bool
def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool
```
Validate if the language of the text is one of valid languages.
@@ -96,14 +87,7 @@ Validate if the language of the text is one of valid languages.
#### BaseConverter.run
```python
def run(file_paths: Union[Path, List[Path]],
meta: Optional[Union[Dict[str, str],
List[Optional[Dict[str, str]]]]] = None,
remove_numeric_tables: Optional[bool] = None,
known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None)
def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
```
Extract text from a file.
@@ -153,12 +137,7 @@ class DocxToTextConverter(BaseConverter)
#### DocxToTextConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text from a .docx file.
@@ -203,9 +182,7 @@ class ImageToTextConverter(BaseConverter)
#### ImageToTextConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = ["eng"],
id_hash_keys: Optional[List[str]] = None)
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
```
**Arguments**:
@@ -232,12 +209,7 @@ In this case the id will be generated by using the content and the defined metad
#### ImageToTextConverter.convert
```python
def convert(file_path: Union[Path, str],
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -275,20 +247,35 @@ In this case the id will be generated by using the content and the defined metad
class MarkdownConverter(BaseConverter)
```
<a id="markdown.MarkdownConverter.__init__"></a>
#### MarkdownConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, remove_code_snippets: bool = True, extract_headlines: bool = False)
```
**Arguments**:
- `remove_numeric_tables`: Not applicable.
- `valid_languages`: Not applicable.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
- `progress_bar`: Show a progress bar for the conversion.
- `remove_code_snippets`: Whether to remove code snippets from the markdown file.
- `extract_headlines`: Whether to extract headings from the markdown file. See the example below.
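For example, headline extraction can be switched on when the converter is created. This is a minimal sketch; it assumes `MarkdownConverter` is imported from `haystack.nodes` as elsewhere in Haystack:
```python
from haystack.nodes import MarkdownConverter

# Keep code blocks in the text and record the headings of each markdown file
converter = MarkdownConverter(remove_code_snippets=False, extract_headlines=True)
```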
<a id="markdown.MarkdownConverter.convert"></a>
#### MarkdownConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, remove_code_snippets: Optional[bool] = None, extract_headlines: Optional[bool] = None) -> List[Document]
```
Reads text from a markdown file and executes optional preprocessing steps.
Reads text from a txt file and executes optional preprocessing steps.
**Arguments**:
@@ -301,21 +288,8 @@ Reads text from a markdown file and executes optional preprocessing steps.
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
<a id="markdown.MarkdownConverter.markdown_to_text"></a>
#### MarkdownConverter.markdown\_to\_text
```python
@staticmethod
def markdown_to_text(markdown_string: str) -> str
```
Converts a markdown string to plaintext
**Arguments**:
- `markdown_string`: String in markdown format
- `remove_code_snippets`: Whether to remove code snippets from the markdown file.
- `extract_headlines`: Whether to extract headings from the markdown file. The example below shows how the extracted headings appear in the document's metadata.
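A minimal sketch of how the extracted headings show up in `Document.meta`; the file name is only an example:
```python
from haystack.nodes import MarkdownConverter

converter = MarkdownConverter(extract_headlines=True)
document = converter.convert(file_path="sample.md")[0]

# Each entry holds the heading text, its character offset in the plain text,
# and its zero-based level (h1 -> 0, h2 -> 1, ...)
for headline in document.meta["headlines"]:
    print(headline["headline"], headline["start_idx"], headline["level"])
```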
<a id="pdf"></a>
@@ -334,11 +308,7 @@ class PDFToTextConverter(BaseConverter)
#### PDFToTextConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
keep_physical_layout: bool = False)
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
```
**Arguments**:
@@ -360,20 +330,13 @@ In this case the id will be generated by using the content and the defined metad
- `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
(See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
- `keep_physical_layout`: This option will maintain original physical layout on the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
<a id="pdf.PDFToTextConverter.convert"></a>
#### PDFToTextConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -395,8 +358,6 @@ not one of the valid languages, then it might likely be encoding error resulting
in garbled text.
- `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
(See list of available encodings by running `pdftotext -listenc` in the terminal)
- `keep_physical_layout`: This option will maintain original physical layout on the extracted text.
It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -415,9 +376,7 @@ class PDFToTextOCRConverter(BaseConverter)
#### PDFToTextOCRConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = ["eng"],
id_hash_keys: Optional[List[str]] = None)
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
```
Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -444,12 +403,7 @@ In this case the id will be generated by using the content and the defined metad
#### PDFToTextOCRConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Convert a file to a dictionary containing the text and any associated meta data.
@@ -499,17 +453,7 @@ Supported file formats are: PDF, DOCX
#### ParsrConverter.\_\_init\_\_
```python
def __init__(parsr_url: str = "http://localhost:3001",
extractor: Literal["pdfminer", "pdfjs"] = "pdfminer",
table_detection_mode: Literal["lattice", "stream"] = "lattice",
preceding_context_len: int = 3,
following_context_len: int = 3,
remove_page_headers: bool = False,
remove_page_footers: bool = False,
remove_table_of_contents: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
add_page_number: bool = True)
def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
```
**Arguments**:
@@ -543,12 +487,7 @@ In this case the id will be generated by using the content and the defined metad
#### ParsrConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text and tables from a PDF or DOCX using the open-source Parsr tool.
@@ -597,16 +536,7 @@ https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quick
#### AzureConverter.\_\_init\_\_
```python
def __init__(endpoint: str,
credential_key: str,
model_id: str = "prebuilt-document",
valid_languages: Optional[List[str]] = None,
save_json: bool = False,
preceding_context_len: int = 3,
following_context_len: int = 3,
merge_multiple_column_headers: bool = True,
id_hash_keys: Optional[List[str]] = None,
add_page_number: bool = True)
def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
```
**Arguments**:
@@ -641,14 +571,7 @@ In this case the id will be generated by using the content and the defined metad
#### AzureConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None,
pages: Optional[str] = None,
known_language: Optional[str] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document]
```
Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.
@@ -680,11 +603,7 @@ See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales
#### AzureConverter.convert\_azure\_json
```python
def convert_azure_json(
file_path: Path,
meta: Optional[Dict[str, Any]] = None,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text and tables from the JSON output of Azure's Form Recognizer service.
@@ -721,10 +640,7 @@ class TikaConverter(BaseConverter)
#### TikaConverter.\_\_init\_\_
```python
def __init__(tika_url: str = "http://localhost:9998/tika",
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None)
def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None)
```
**Arguments**:
@@ -750,12 +666,7 @@ In this case the id will be generated by using the content and the defined metad
#### TikaConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
**Arguments**:
@@ -799,12 +710,7 @@ class TextConverter(BaseConverter)
#### TextConverter.convert
```python
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Reads text from a txt file and executes optional preprocessing steps.

View File

@@ -2421,6 +2421,11 @@
"additionalProperties": false,
"description": "Each parameter can reference other components defined in the same YAML file.",
"properties": {
"extract_headlines": {
"default": false,
"title": "Extract Headlines",
"type": "boolean"
},
"id_hash_keys": {
"anyOf": [
{
@@ -2440,6 +2445,11 @@
"title": "Progress Bar",
"type": "boolean"
},
"remove_code_snippets": {
"default": true,
"title": "Remove Code Snippets",
"type": "boolean"
},
"remove_numeric_tables": {
"default": false,
"title": "Remove Numeric Tables",

View File

@@ -2421,6 +2421,11 @@
"additionalProperties": false,
"description": "Each parameter can reference other components defined in the same YAML file.",
"properties": {
"extract_headlines": {
"default": false,
"title": "Extract Headlines",
"type": "boolean"
},
"id_hash_keys": {
"anyOf": [
{
@@ -2440,6 +2445,11 @@
"title": "Progress Bar",
"type": "boolean"
},
"remove_code_snippets": {
"default": true,
"title": "Remove Code Snippets",
"type": "boolean"
},
"remove_numeric_tables": {
"default": false,
"title": "Remove Numeric Tables",

View File

@@ -1,10 +1,10 @@
import logging
import re
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple, Any
try:
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString
from markdown import markdown
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
@@ -19,14 +19,46 @@ logger = logging.getLogger(__name__)
class MarkdownConverter(BaseConverter):
def __init__(
self,
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
progress_bar: bool = True,
remove_code_snippets: bool = True,
extract_headlines: bool = False,
):
"""
:param remove_numeric_tables: Not applicable.
:param valid_languages: Not applicable.
:param id_hash_keys: Generate the document ID from a custom list of strings that refer to the document's
attributes. To make sure you don't have duplicate documents in your DocumentStore if texts are
not unique, you can modify the metadata and pass for example, `"meta"` to this field ([`"content"`, `"meta"`]).
In this case, the ID is generated by using the content and the defined metadata.
:param progress_bar: Show a progress bar for the conversion.
:param remove_code_snippets: Whether to remove code snippets from the markdown file.
:param extract_headlines: Whether to extract headings from the markdown file.
"""
super().__init__(
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
id_hash_keys=id_hash_keys,
progress_bar=progress_bar,
)
self.remove_code_snippets = remove_code_snippets
self.extract_headlines = extract_headlines
def convert(
self,
file_path: Path,
meta: Optional[Dict[str, str]] = None,
meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None,
remove_code_snippets: Optional[bool] = None,
extract_headlines: Optional[bool] = None,
) -> List[Document]:
"""
Reads text from a markdown file and executes optional preprocessing steps.
@@ -40,32 +72,53 @@ class MarkdownConverter(BaseConverter):
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
:param remove_code_snippets: Whether to remove code snippets from the markdown file.
:param extract_headlines: Whether to extract headings from the markdown file.
"""
if id_hash_keys is None:
id_hash_keys = self.id_hash_keys
id_hash_keys = id_hash_keys if id_hash_keys is not None else self.id_hash_keys
remove_code_snippets = remove_code_snippets if remove_code_snippets is not None else self.remove_code_snippets
extract_headlines = extract_headlines if extract_headlines is not None else self.extract_headlines
with open(file_path, encoding=encoding, errors="ignore") as f:
markdown_text = f.read()
text = self.markdown_to_text(markdown_text)
# md -> html -> text since BeautifulSoup can extract text cleanly
html = markdown(markdown_text)
# remove code snippets
if remove_code_snippets:
html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)
soup = BeautifulSoup(html, "html.parser")
if extract_headlines:
text, headlines = self._extract_text_and_headlines(soup)
if meta is None:
meta = {}
meta["headlines"] = headlines
else:
text = soup.get_text()
document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
return [document]
# Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
@staticmethod
def markdown_to_text(markdown_string: str) -> str:
def _extract_text_and_headlines(soup: BeautifulSoup) -> Tuple[str, List[Dict]]:
"""
Converts a markdown string to plaintext
:param markdown_string: String in markdown format
Extracts text and headings from a soup object.
"""
# md -> html -> text since BeautifulSoup can extract text cleanly
html = markdown(markdown_string)
headline_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
headlines = []
text = ""
for desc in soup.descendants:
if desc.name in headline_tags:
current_headline = desc.get_text()
current_start_idx = len(text)
current_level = int(desc.name[-1]) - 1
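# Heading levels are zero-based: <h1> maps to level 0, <h6> to level 5.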
headlines.append({"headline": current_headline, "start_idx": current_start_idx, "level": current_level})
# remove code snippets
html = re.sub(r"<pre>(.*?)</pre>", " ", html)
html = re.sub(r"<code>(.*?)</code >", " ", html)
if isinstance(desc, NavigableString):
text += desc.get_text()
# extract text
soup = BeautifulSoup(html, "html.parser")
text = "".join(soup.findAll(text=True))
return text
return text, headlines
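As an illustration of the new helper (this snippet is not part of the commit), the extracted headlines are plain dictionaries; the exact `start_idx` of later headings depends on the intermediate HTML, so it is left out here:
```python
from bs4 import BeautifulSoup
from markdown import markdown

from haystack.nodes.file_converter.markdown import MarkdownConverter

html = markdown("# Intro\n\nSome text.\n\n## Details\n\nMore text.")
soup = BeautifulSoup(html, "html.parser")
text, headlines = MarkdownConverter._extract_text_and_headlines(soup)

# text holds the plain text, while headlines looks like
# [{"headline": "Intro", "start_idx": 0, "level": 0},
#  {"headline": "Details", "start_idx": ..., "level": 1}]
```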

View File

@@ -3,7 +3,7 @@ import re
from copy import deepcopy
from functools import partial, reduce
from itertools import chain
from typing import List, Optional, Generator, Set, Union
from typing import List, Optional, Generator, Set, Union, Tuple, Dict
try:
from typing import Literal
@@ -47,8 +47,6 @@ iso639_to_nltk = {
"ml": "malayalam",
}
EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
class PreProcessor(BasePreProcessor):
def __init__(
@@ -261,35 +259,22 @@ class PreProcessor(BasePreProcessor):
text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
)
headlines = document.meta["headlines"] if "headlines" in document.meta else []
if clean_whitespace:
pages = text.split("\f")
cleaned_pages = []
for page in pages:
if not page:
# there are many "empty text" pages in a marketing document, as for example the cover page. If we just forget about them, we have a mismatch
# with page numbers which causes problems later on. Therefore, we replace them with a dummy text, which will not be found by any query.
cleaned_page = EMPTY_PAGE_PLACEHOLDER
else:
lines = page.splitlines()
cleaned_lines = []
for line in lines:
line = line.strip()
cleaned_lines.append(line)
cleaned_page = "\n".join(cleaned_lines)
cleaned_pages.append(cleaned_page)
text = "\f".join(cleaned_pages)
text, headlines = self._clean_whitespace(text=text, headlines=headlines)
if clean_empty_lines:
text = re.sub(r"\n\n+", "\n\n", text)
text, headlines = self._clean_empty_lines(text=text, headlines=headlines)
for substring in remove_substrings:
text = text.replace(substring, "")
text, headlines = self._remove_substring(text=text, substring=substring, headlines=headlines)
if text != document.content:
document = deepcopy(document)
document.content = text
if headlines:
document.meta["headlines"] = headlines
return document
@@ -328,131 +313,302 @@ class PreProcessor(BasePreProcessor):
return [document]
text = document.content
headlines = document.meta["headlines"] if "headlines" in document.meta else []
if split_respect_sentence_boundary and split_by == "word":
# split by words ensuring no sub sentence splits
if self.add_page_number:
# SentenceTokenizer will remove "\f" if it is at the end of a sentence, so substituting it in these
# cases for "[NEW_PAGE]" to don't lose any page breaks.
text = self._substitute_page_breaks(text)
sentences = self._split_sentences(text)
word_count_slice = 0
cur_page = 1
splits_pages = []
list_splits = []
current_slice: List[str] = []
for sen in sentences:
if self.add_page_number and "[NEW_PAGE]" in sen:
sen = sen.replace("[NEW_PAGE]", "\f")
word_count_sen = len(sen.split(" "))
if word_count_sen > split_length:
long_sentence_message = f"One or more sentence found with word count higher than the split length."
if long_sentence_message not in self.print_log:
self.print_log.add(long_sentence_message)
logger.warning(long_sentence_message)
if word_count_slice + word_count_sen > split_length:
# Number of words exceeds split_length -> save current slice and start a new one
if current_slice:
list_splits.append(current_slice)
splits_pages.append(cur_page)
if split_overlap:
overlap = []
processed_sents = []
word_count_overlap = 0
current_slice_copy = deepcopy(current_slice)
for idx, s in reversed(list(enumerate(current_slice))):
sen_len = len(s.split(" "))
if word_count_overlap < split_overlap:
overlap.append(s)
word_count_overlap += sen_len
current_slice_copy.pop(idx)
else:
processed_sents = current_slice_copy
break
current_slice = list(reversed(overlap))
word_count_slice = word_count_overlap
else:
processed_sents = current_slice
current_slice = []
word_count_slice = 0
# Count number of page breaks in processed sentences
if self.add_page_number:
num_page_breaks = self._count_processed_page_breaks(
sentences=processed_sents,
split_overlap=split_overlap,
overlapping_sents=current_slice,
current_sent=sen,
)
cur_page += num_page_breaks
current_slice.append(sen)
word_count_slice += word_count_sen
if current_slice:
list_splits.append(current_slice)
splits_pages.append(cur_page)
text_splits = []
for sl in list_splits:
txt = " ".join(sl)
if len(txt) > 0:
text_splits.append(txt)
text_splits, splits_pages, splits_start_idxs = self._split_by_word_respecting_sent_boundary(
text=text, split_length=split_length, split_overlap=split_overlap
)
else:
# create individual "elements" of passage, sentence, or word
if split_by == "passage":
elements = text.split("\n\n")
elif split_by == "sentence":
if self.add_page_number:
# SentenceTokenizer will remove "\f" if it is at the end of a sentence, so substituting it in these
# cases for "[NEW_PAGE]" to don't lose any page breaks.
text = self._substitute_page_breaks(text)
elements = self._split_sentences(text)
elif split_by == "word":
elements = text.split(" ")
else:
raise NotImplementedError(
"PreProcessor only supports 'passage', 'sentence' or 'word' split_by options."
)
elements, split_at = self._split_into_units(text=text, split_by=split_by)
# concatenate individual elements based on split_length & split_stride
if split_overlap:
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
else:
segments = windowed(elements, n=split_length, step=split_length)
text_splits = []
splits_pages = []
cur_page = 1
for seg in segments:
current_units = [unit for unit in seg if unit is not None]
txt = " ".join(current_units)
if len(txt) > 0:
text_splits.append(txt)
splits_pages.append(cur_page)
if self.add_page_number:
processed_units = current_units[: split_length - split_overlap]
num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
cur_page += num_page_breaks
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
elements=elements, split_length=split_length, split_overlap=split_overlap, split_at=split_at
)
# create new document dicts for each text split
documents = []
for i, txt in enumerate(text_splits):
# now we want to get rid of the empty page placeholder and skip the split if there's nothing left
txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
if not txt_clean.strip():
continue
documents = self._create_docs_from_splits(
text_splits=text_splits,
splits_pages=splits_pages,
splits_start_idxs=splits_start_idxs,
headlines=headlines,
meta=document.meta or {},
id_hash_keys=id_hash_keys,
)
doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
return documents
@staticmethod
def _clean_whitespace(text: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
"""
Strips leading and trailing whitespace from each line of the text.
"""
pages = text.split("\f")
cleaned_pages = []
cur_headline_idx = 0
num_headlines = len(headlines)
cur_char_idx = 0
num_removed_chars_total = 0
for page in pages:
lines = page.splitlines()
cleaned_lines = []
for idx, line in enumerate(lines):
old_line_len = len(line)
cleaned_line = line.strip()
cleaned_line_len = len(cleaned_line)
cur_char_idx += old_line_len + 1 # add 1 for newline char
if old_line_len != cleaned_line_len:
num_removed_chars_current = old_line_len - cleaned_line_len
num_removed_chars_total += num_removed_chars_current
for headline_idx in range(cur_headline_idx, num_headlines):
if cur_char_idx - num_removed_chars_total <= headlines[headline_idx]["start_idx"]:
headlines[headline_idx]["start_idx"] -= num_removed_chars_current
else:
cur_headline_idx += 1
cleaned_lines.append(cleaned_line)
cleaned_page = "\n".join(cleaned_lines)
cleaned_pages.append(cleaned_page)
cleaned_text = "\f".join(cleaned_pages)
return cleaned_text, headlines
@staticmethod
def _clean_empty_lines(text: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
if headlines:
num_headlines = len(headlines)
multiple_new_line_matches = re.finditer(r"\n\n\n+", text)
cur_headline_idx = 0
num_removed_chars_accumulated = 0
for match in multiple_new_line_matches:
num_removed_chars_current = match.end() - match.start() - 2
for headline_idx in range(cur_headline_idx, num_headlines):
if match.end() - num_removed_chars_accumulated <= headlines[headline_idx]["start_idx"]:
headlines[headline_idx]["start_idx"] -= num_removed_chars_current
else:
cur_headline_idx += 1
num_removed_chars_accumulated += num_removed_chars_current
cleaned_text = re.sub(r"\n\n\n+", "\n\n", text)
return cleaned_text, headlines
@staticmethod
def _remove_substring(text: str, substring: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
if headlines:
num_headlines = len(headlines)
multiple_substring_matches = re.finditer(substring, text)
cur_headline_idx = 0
num_removed_chars_accumulated = 0
for match in multiple_substring_matches:
for headline_idx in range(cur_headline_idx, num_headlines):
if match.end() - num_removed_chars_accumulated <= headlines[headline_idx]["start_idx"]:
headlines[headline_idx]["start_idx"] -= len(substring)
else:
cur_headline_idx += 1
num_removed_chars_accumulated += len(substring)
cleaned_text = text.replace(substring, "")
return cleaned_text, headlines
def _split_by_word_respecting_sent_boundary(
self, text: str, split_length: int, split_overlap: int
) -> Tuple[List[str], List[int], List[int]]:
"""
Splits the text into parts of split_length words while respecting sentence boundaries.
"""
sentences = self._split_sentences(text)
word_count_slice = 0
cur_page = 1
cur_start_idx = 0
splits_pages = []
list_splits = []
splits_start_idxs = []
current_slice: List[str] = []
for sen in sentences:
word_count_sen = len(sen.split())
if word_count_sen > split_length:
long_sentence_message = (
f"We found one or more sentences whose word count is higher than the split length."
)
if long_sentence_message not in self.print_log:
self.print_log.add(long_sentence_message)
logger.warning(long_sentence_message)
if word_count_slice + word_count_sen > split_length:
# Number of words exceeds split_length -> save current slice and start a new one
if current_slice:
list_splits.append(current_slice)
splits_pages.append(cur_page)
splits_start_idxs.append(cur_start_idx)
if split_overlap:
overlap = []
processed_sents = []
word_count_overlap = 0
current_slice_copy = deepcopy(current_slice)
for idx, s in reversed(list(enumerate(current_slice))):
sen_len = len(s.split())
if word_count_overlap < split_overlap:
overlap.append(s)
word_count_overlap += sen_len
current_slice_copy.pop(idx)
else:
processed_sents = current_slice_copy
break
current_slice = list(reversed(overlap))
word_count_slice = word_count_overlap
else:
processed_sents = current_slice
current_slice = []
word_count_slice = 0
cur_start_idx += len("".join(processed_sents))
# Count number of page breaks in processed sentences
if self.add_page_number:
num_page_breaks = self._count_processed_page_breaks(
sentences=processed_sents,
split_overlap=split_overlap,
overlapping_sents=current_slice,
current_sent=sen,
)
cur_page += num_page_breaks
current_slice.append(sen)
word_count_slice += word_count_sen
if current_slice:
list_splits.append(current_slice)
splits_pages.append(cur_page)
splits_start_idxs.append(cur_start_idx)
text_splits = []
for sl in list_splits:
txt = "".join(sl)
if len(txt) > 0:
text_splits.append(txt)
return text_splits, splits_pages, splits_start_idxs
def _split_into_units(self, text: str, split_by: str) -> Tuple[List[str], str]:
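# Besides the raw units, this returns the delimiter (split_at) that _concatenate_units later uses
# to rejoin them, so character offsets in the rejoined splits stay aligned with the original text.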
if split_by == "passage":
elements = text.split("\n\n")
split_at = "\n\n"
elif split_by == "sentence":
elements = self._split_sentences(text)
split_at = "" # whitespace will be preserved while splitting text into sentences
elif split_by == "word":
elements = text.split(" ")
split_at = " "
else:
raise NotImplementedError("PreProcessor only supports 'passage', 'sentence' or 'word' split_by options.")
return elements, split_at
def _concatenate_units(
self, elements: List[str], split_length: int, split_overlap: int, split_at: str
) -> Tuple[List[str], List[int], List[int]]:
"""
Concatenates the elements into parts of split_length units.
"""
segments = windowed(elements, n=split_length, step=split_length - split_overlap)
split_at_len = len(split_at)
text_splits = []
splits_pages = []
splits_start_idxs = []
cur_page = 1
cur_start_idx = 0
for seg in segments:
current_units = [unit for unit in seg if unit is not None]
txt = split_at.join(current_units)
if len(txt) > 0:
text_splits.append(txt)
splits_pages.append(cur_page)
splits_start_idxs.append(cur_start_idx)
processed_units = current_units[: split_length - split_overlap]
cur_start_idx += len((split_at_len * " ").join(processed_units)) + split_at_len
if self.add_page_number:
num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
cur_page += num_page_breaks
return text_splits, splits_pages, splits_start_idxs
def _create_docs_from_splits(
self,
text_splits: List[str],
splits_pages: List[int],
splits_start_idxs: List[int],
headlines: List[Dict],
meta: Dict,
id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
"""
Creates Document objects from text splits enriching them with page number and headline information if given.
"""
documents = []
earliest_rel_hl = 0
for i, txt in enumerate(text_splits):
meta = deepcopy(meta)
doc = Document(content=txt, meta=meta, id_hash_keys=id_hash_keys)
doc.meta["_split_id"] = i
if self.add_page_number:
doc.meta["page"] = splits_pages[i]
if headlines:
split_start_idx = splits_start_idxs[i]
relevant_headlines, earliest_rel_hl = self._extract_relevant_headlines_for_split(
headlines=headlines, split_txt=txt, split_start_idx=split_start_idx, earliest_rel_hl=earliest_rel_hl
)
doc.meta["headlines"] = relevant_headlines
documents.append(doc)
return documents
@staticmethod
def _extract_relevant_headlines_for_split(
headlines: List[Dict], split_txt: str, split_start_idx: int, earliest_rel_hl: int
) -> Tuple[List[Dict], int]:
"""
Extracts the headlines that are relevant for a given text split, based on the split's text and its
start index within the original text.
"""
relevant_headlines = []
for headline_idx in range(earliest_rel_hl, len(headlines)):
# Headline is part of current split
if split_start_idx <= headlines[headline_idx]["start_idx"] < split_start_idx + len(split_txt):
headline_copy = deepcopy(headlines[headline_idx])
headline_copy["start_idx"] = headlines[headline_idx]["start_idx"] - split_start_idx
relevant_headlines.append(headline_copy)
# Headline appears before current split, but might be relevant for current split
elif headlines[headline_idx]["start_idx"] < split_start_idx:
# Check if following headlines are on a higher level
headline_to_check = headline_idx + 1
headline_is_relevant = True
while (
headline_to_check < len(headlines) and headlines[headline_to_check]["start_idx"] <= split_start_idx
):
if headlines[headline_to_check]["level"] <= headlines[headline_idx]["level"]:
headline_is_relevant = False
break
headline_to_check += 1
if headline_is_relevant:
headline_copy = deepcopy(headlines[headline_idx])
headline_copy["start_idx"] = None
relevant_headlines.append(headline_copy)
else:
earliest_rel_hl += 1
# Headline (and all subsequent ones) only relevant for later splits
elif headlines[headline_idx]["start_idx"] > split_start_idx + len(split_txt):
break
return relevant_headlines, earliest_rel_hl
def _find_and_remove_header_footer(
self, text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
) -> str:
@@ -542,46 +698,74 @@ class PreProcessor(BasePreProcessor):
:param text: str, text to tokenize
:return: list[str], list of sentences
"""
sentences = []
language_name = iso639_to_nltk.get(self.language)
sentence_tokenizer = self._load_sentence_tokenizer(language_name)
# The following adjustment of PunktSentenceTokenizer is inspired by:
# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
# It is needed for preserving whitespace while splitting text into sentences.
period_context_fmt = r"""
%(SentEndChars)s # a potential sentence ending
\s* # match potential whitespace (is originally in lookahead assertion)
(?=(?P<after_tok>
%(NonWord)s # either other punctuation
|
(?P<next_tok>\S+) # or some other token - original version: \s+(?P<next_tok>\S+)
))"""
re_period_context = re.compile(
period_context_fmt
% {
"NonWord": sentence_tokenizer._lang_vars._re_non_word_chars,
"SentEndChars": sentence_tokenizer._lang_vars._re_sent_end_chars,
},
re.UNICODE | re.VERBOSE,
)
sentence_tokenizer._lang_vars._re_period_context = re_period_context
sentences = sentence_tokenizer.tokenize(text)
return sentences
def _load_sentence_tokenizer(self, language_name: Optional[str]) -> nltk.tokenize.punkt.PunktSentenceTokenizer:
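# Loading order: a custom pickled model from tokenizer_model_folder (if configured),
# then NLTK's bundled punkt model for the configured language, then English as a last resort.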
# Try to load a custom model from 'tokenizer_model_path'
if self.tokenizer_model_folder is not None:
tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
try:
sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle")
sentences = sentence_tokenizer.tokenize(text)
except LookupError:
logger.exception("PreProcessor couldn't load sentence tokenizer from %s", tokenizer_model_path)
except (UnpicklingError, ValueError) as e:
logger.exception(
"PreProcessor couldn't determine model format of sentence tokenizer at %s", tokenizer_model_path
)
if sentences:
return sentences
except (LookupError, UnpicklingError, ValueError) as e:
if isinstance(e, LookupError):
logger.exception(f"PreProcessor couldn't load sentence tokenizer from %s", tokenizer_model_path)
else:
logger.exception(
f"PreProcessor couldn't determine model format of sentence tokenizer at %s",
tokenizer_model_path,
)
# NLTK failed to split, fallback to the default model or to English
if language_name is not None:
logger.error(
f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. Using default {self.language} model."
)
return nltk.tokenize.sent_tokenize(text, language=language_name)
logger.error(
f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. Using English instead."
)
return nltk.tokenize.sent_tokenize(text, language="english")
# NLTK failed to load custom SentenceTokenizer, fallback to the default model or to English
if language_name is not None:
logger.error(
f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. "
f"Using default {self.language} model."
)
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
else:
logger.error(
f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. "
f"Using English instead."
)
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
# Use a default NLTK model
if language_name is not None:
return nltk.tokenize.sent_tokenize(text, language=language_name)
elif language_name is not None:
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
else:
logger.error(
f"PreProcessor couldn't find the default sentence tokenizer model for {self.language}. "
f" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter."
)
sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
logger.error(
f"PreProcessor couldn't find default sentence tokenizer model for {self.language}. Using English instead. "
"You may train your own model and use the 'tokenizer_model_folder' parameter."
)
return nltk.tokenize.sent_tokenize(text, language="english")
return sentence_tokenizer
@staticmethod
def _count_processed_page_breaks(
@@ -603,13 +787,3 @@ class PreProcessor(BasePreProcessor):
num_page_breaks += 1
return num_page_breaks
@staticmethod
def _substitute_page_breaks(text: str) -> str:
"""
This method substitutes the page break character "\f" for "[NEW_PAGE]" if it is at the end of a sentence.
"""
# This regex matches any of sentence-ending punctuation (one of ".", ":", "?", "!") followed by a page break
# character ("\f") and replaces the page break character with "[NEW_PAGE]" keeping the original sentence-ending
# punctuation.
return re.sub(r"([\.:?!])\f", r"\1 [NEW_PAGE]", text)

View File

@@ -141,6 +141,31 @@ def test_markdown_converter():
assert document.content.startswith("What to build with Haystack")
def test_markdown_converter_headline_extraction():
expected_headlines = [
("What to build with Haystack", 1),
("Core Features", 1),
("Quick Demo", 1),
("2nd level headline for testing purposes", 2),
("3rd level headline for testing purposes", 3),
]
converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
# Check if correct number of headlines are extracted
assert len(document.meta["headlines"]) == 5
for extracted_headline, (expected_headline, expected_level) in zip(document.meta["headlines"], expected_headlines):
# Check if correct headline and level is extracted
assert extracted_headline["headline"] == expected_headline
assert extracted_headline["level"] == expected_level
# Check if correct start_idx is extracted
start_idx = extracted_headline["start_idx"]
hl_len = len(extracted_headline["headline"])
assert extracted_headline["headline"] == document.content[start_idx : start_idx + hl_len]
def test_azure_converter():
# Check if Form Recognizer endpoint and credential key in environment variables
if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:

View File

@@ -26,6 +26,15 @@ paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test
in the sentence.
"""
HEADLINES = [
{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
{"headline": "paragraph_1", "start_idx": 198, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 223, "level": 0},
{"headline": "in paragraph_2", "start_idx": 365, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 434, "level": 0},
{"headline": "trick the test", "start_idx": 603, "level": 1},
]
LEGAL_TEXT_PT = """
A Lei 9.514/1997, que instituiu a alienação fiduciária de
bens imóveis, é norma especial e posterior ao Código de Defesa do
@@ -124,8 +133,8 @@ def test_preprocess_word_split():
documents = preprocessor.process(document)
for i, doc in enumerate(documents):
if i == 0:
assert len(doc.content.split(" ")) == 14
assert len(doc.content.split(" ")) <= 15 or doc.content.startswith("This is to trick")
assert len(doc.content.split()) == 14
assert len(doc.content.split()) <= 15 or doc.content.startswith("This is to trick")
assert len(documents) == 8
preprocessor = PreProcessor(
@@ -244,9 +253,217 @@ def test_page_number_extraction_on_empty_pages():
assert documents[1].content.strip() == text_page_three
def test_substitute_page_break():
# Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in between of
# sentences should not be replaced.
result = PreProcessor._substitute_page_breaks(TEXT)
assert result[211:221] == "[NEW_PAGE]"
assert result[654] == "\f"
def test_headline_processing_split_by_word():
expected_headlines = [
[{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0}],
[
{"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
{"headline": "paragraph_1", "start_idx": 19, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 44, "level": 0},
{"headline": "in paragraph_2", "start_idx": 186, "level": 1},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 53, "level": 0},
],
[
{"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
{"headline": "trick the test", "start_idx": 36, "level": 1},
],
]
document = Document(content=TEXT, meta={"headlines": HEADLINES})
preprocessor = PreProcessor(
split_length=30, split_overlap=0, split_by="word", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
for doc, expected in zip(documents, expected_headlines):
assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_word_overlap():
expected_headlines = [
[{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0}],
[
{"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
{"headline": "paragraph_1", "start_idx": 71, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 96, "level": 0},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": 110, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 179, "level": 0},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 53, "level": 0},
],
[
{"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
{"headline": "trick the test", "start_idx": 95, "level": 1},
],
]
document = Document(content=TEXT, meta={"headlines": HEADLINES})
preprocessor = PreProcessor(
split_length=30, split_overlap=10, split_by="word", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
for doc, expected in zip(documents, expected_headlines):
assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_word_respect_sentence_boundary():
expected_headlines = [
[{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0}],
[
{"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
{"headline": "paragraph_1", "start_idx": 71, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 96, "level": 0},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": 110, "level": 1},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 53, "level": 0},
],
[
{"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
{"headline": "trick the test", "start_idx": 95, "level": 1},
],
]
document = Document(content=TEXT, meta={"headlines": HEADLINES})
preprocessor = PreProcessor(split_length=30, split_overlap=5, split_by="word", split_respect_sentence_boundary=True)
documents = preprocessor.process(document)
for doc, expected in zip(documents, expected_headlines):
assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_sentence():
expected_headlines = [
[
{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
{"headline": "paragraph_1", "start_idx": 198, "level": 1},
],
[
{"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
{"headline": "paragraph_1", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 10, "level": 0},
{"headline": "in paragraph_2", "start_idx": 152, "level": 1},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 10, "level": 0},
{"headline": "trick the test", "start_idx": 179, "level": 1},
],
]
document = Document(content=TEXT, meta={"headlines": HEADLINES})
preprocessor = PreProcessor(
split_length=5, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
for doc, expected in zip(documents, expected_headlines):
assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_sentence_overlap():
expected_headlines = [
[
{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
{"headline": "paragraph_1", "start_idx": 198, "level": 1},
],
[
{"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
{"headline": "paragraph_1", "start_idx": 29, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 54, "level": 0},
{"headline": "in paragraph_2", "start_idx": 196, "level": 1},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": 26, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 95, "level": 0},
],
[
{"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
{"headline": "trick the test", "start_idx": 95, "level": 1},
],
]
document = Document(content=TEXT, meta={"headlines": HEADLINES})
preprocessor = PreProcessor(
split_length=5, split_overlap=1, split_by="sentence", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
for doc, expected in zip(documents, expected_headlines):
assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_passage():
expected_headlines = [
[
{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
{"headline": "paragraph_1", "start_idx": 198, "level": 1},
],
[
{"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
{"headline": "paragraph_1", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 10, "level": 0},
{"headline": "in paragraph_2", "start_idx": 152, "level": 1},
],
[
{"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
{"headline": "in paragraph_2", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 10, "level": 0},
{"headline": "trick the test", "start_idx": 179, "level": 1},
],
]
document = Document(content=TEXT, meta={"headlines": HEADLINES})
preprocessor = PreProcessor(
split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
for doc, expected in zip(documents, expected_headlines):
assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_passage_overlap():
expected_headlines = [
[
{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
{"headline": "paragraph_1", "start_idx": 198, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 223, "level": 0},
{"headline": "in paragraph_2", "start_idx": 365, "level": 1},
],
[
{"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
{"headline": "paragraph_1", "start_idx": None, "level": 1},
{"headline": "sample sentence in paragraph_2", "start_idx": 10, "level": 0},
{"headline": "in paragraph_2", "start_idx": 152, "level": 1},
{"headline": "sample sentence in paragraph_3", "start_idx": 221, "level": 0},
{"headline": "trick the test", "start_idx": 390, "level": 1},
],
]
document = Document(content=TEXT, meta={"headlines": HEADLINES})
preprocessor = PreProcessor(
split_length=2, split_overlap=1, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
for doc, expected in zip(documents, expected_headlines):
assert doc.meta["headlines"] == expected

View File

@@ -53,3 +53,6 @@ The quickest way to see what Haystack offers is to start a [Docker Compose](http
```
# git clone https://github.com/deepset-ai/haystack.git
```
### 2nd level headline for testing purposes
#### 3rd level headline for testing purposes