mirror of https://github.com/deepset-ai/haystack.git
feat: Extraction of headlines in markdown files (#3445)
* Extract headings from markdown files + adapt PreProcessor
* Add tests
* Fix mypy
* Generate JSON schema
* Apply suggestions from code review
* Update haystack/nodes/file_converter/markdown.py
* Apply black
* Add PR feedback

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
This commit is contained in:
parent 5ca96357ff
commit 4fbe80c098
@@ -17,10 +17,7 @@ Base class for implementing file converts to transform input documents to text format

 #### BaseConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False,
-             valid_languages: Optional[List[str]] = None,
-             id_hash_keys: Optional[List[str]] = None,
-             progress_bar: bool = True)
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True)
 ```

 **Arguments**:
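Throughout these converter docs, `id_hash_keys` has the same meaning. A minimal sketch of its effect, assuming the standard haystack `Document` schema (the file names in `meta` are made up):

```python
from haystack.schema import Document

# Same content, different meta: with id_hash_keys=["content", "meta"] the two
# documents hash to different ids, so near-duplicate texts from different
# sources are kept apart in the DocumentStore.
d1 = Document(content="same text", meta={"source": "a.md"}, id_hash_keys=["content", "meta"])
d2 = Document(content="same text", meta={"source": "b.md"}, id_hash_keys=["content", "meta"])
assert d1.id != d2.id
```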
@@ -47,12 +44,7 @@ In this case the id will be generated by using the content and the defined metadata.

 ```python
 @abstractmethod
-def convert(file_path: Path,
-            meta: Optional[Dict[str, Any]],
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = "UTF-8",
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Convert a file to a dictionary containing the text and any associated meta data.
@@ -85,8 +77,7 @@ In this case the id will be generated by using the content and the defined metadata.

 #### BaseConverter.validate\_language

 ```python
-def validate_language(text: str,
-                      valid_languages: Optional[List[str]] = None) -> bool
+def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool
 ```

 Validate if the language of the text is one of valid languages.
@@ -96,14 +87,7 @@ Validate if the language of the text is one of valid languages.

 #### BaseConverter.run

 ```python
-def run(file_paths: Union[Path, List[Path]],
-        meta: Optional[Union[Dict[str, str],
-                             List[Optional[Dict[str, str]]]]] = None,
-        remove_numeric_tables: Optional[bool] = None,
-        known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
-        valid_languages: Optional[List[str]] = None,
-        encoding: Optional[str] = "UTF-8",
-        id_hash_keys: Optional[List[str]] = None)
+def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
 ```

 Extract text from a file.
@@ -153,12 +137,7 @@ class DocxToTextConverter(BaseConverter)

 #### DocxToTextConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = None,
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text from a .docx file.
@@ -203,9 +182,7 @@ class ImageToTextConverter(BaseConverter)

 #### ImageToTextConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False,
-             valid_languages: Optional[List[str]] = ["eng"],
-             id_hash_keys: Optional[List[str]] = None)
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
 ```

 **Arguments**:
@@ -232,12 +209,7 @@ In this case the id will be generated by using the content and the defined metadata.

 #### ImageToTextConverter.convert

 ```python
-def convert(file_path: Union[Path, str],
-            meta: Optional[Dict[str, str]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = None,
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -275,20 +247,35 @@ In this case the id will be generated by using the content and the defined metadata.

 class MarkdownConverter(BaseConverter)
 ```

+<a id="markdown.MarkdownConverter.__init__"></a>
+
+#### MarkdownConverter.\_\_init\_\_
+
+```python
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, remove_code_snippets: bool = True, extract_headlines: bool = False)
+```
+
+**Arguments**:
+
+- `remove_numeric_tables`: Not applicable.
+- `valid_languages`: Not applicable.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `progress_bar`: Show a progress bar for the conversion.
+- `remove_code_snippets`: Whether to remove snippets from the markdown file.
+- `extract_headlines`: Whether to extract headings from the markdown file.
+
 <a id="markdown.MarkdownConverter.convert"></a>

 #### MarkdownConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = "utf-8",
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, remove_code_snippets: Optional[bool] = None, extract_headlines: Optional[bool] = None) -> List[Document]
 ```

-Reads text from a txt file and executes optional preprocessing steps.
+Reads text from a markdown file and executes optional preprocessing steps.

 **Arguments**:
@@ -301,21 +288,8 @@ Reads text from a markdown file and executes optional preprocessing steps.
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 In this case the id will be generated by using the content and the defined metadata.
-
-<a id="markdown.MarkdownConverter.markdown_to_text"></a>
-
-#### MarkdownConverter.markdown\_to\_text
-
-```python
-@staticmethod
-def markdown_to_text(markdown_string: str) -> str
-```
-
-Converts a markdown string to plaintext
-
-**Arguments**:
-
-- `markdown_string`: String in markdown format
+- `remove_code_snippets`: Whether to remove snippets from the markdown file.
+- `extract_headlines`: Whether to extract headings from the markdown file.
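A minimal usage sketch for the two new `MarkdownConverter` options described above (the file name is hypothetical):

```python
from pathlib import Path

from haystack.nodes import MarkdownConverter

converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
document = converter.convert(file_path=Path("sample.md"))[0]

# Each entry records the heading text, its character offset in document.content,
# and its level (h1 -> 0, h2 -> 1, ...).
print(document.meta["headlines"])
```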
<a id="pdf"></a>

@@ -334,11 +308,7 @@ class PDFToTextConverter(BaseConverter)

 #### PDFToTextConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False,
-             valid_languages: Optional[List[str]] = None,
-             id_hash_keys: Optional[List[str]] = None,
-             encoding: Optional[str] = "UTF-8",
-             keep_physical_layout: bool = False)
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
 ```

 **Arguments**:
@@ -360,20 +330,13 @@ In this case the id will be generated by using the content and the defined metadata.
 - `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`.
 Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...).
 (See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal)
-- `keep_physical_layout`: This option will maintain original physical layout on the extracted text.
-It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.

 <a id="pdf.PDFToTextConverter.convert"></a>

 #### PDFToTextConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, Any]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = None,
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -395,8 +358,6 @@ not one of the valid languages, then it might likely be encoding error resulting
 in garbled text.
 - `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`.
 (See list of available encodings by running `pdftotext -listenc` in the terminal)
-- `keep_physical_layout`: This option will maintain original physical layout on the extracted text.
-It works by passing the `-layout` parameter to `pdftotext`. When disabled, PDF is read in the stream order.
 - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@@ -415,9 +376,7 @@ class PDFToTextOCRConverter(BaseConverter)

 #### PDFToTextOCRConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False,
-             valid_languages: Optional[List[str]] = ["eng"],
-             id_hash_keys: Optional[List[str]] = None)
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
 ```

 Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -444,12 +403,7 @@ In this case the id will be generated by using the content and the defined metadata.

 #### PDFToTextOCRConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, Any]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = None,
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Convert a file to a dictionary containing the text and any associated meta data.
@@ -499,17 +453,7 @@ Supported file formats are: PDF, DOCX

 #### ParsrConverter.\_\_init\_\_

 ```python
-def __init__(parsr_url: str = "http://localhost:3001",
-             extractor: Literal["pdfminer", "pdfjs"] = "pdfminer",
-             table_detection_mode: Literal["lattice", "stream"] = "lattice",
-             preceding_context_len: int = 3,
-             following_context_len: int = 3,
-             remove_page_headers: bool = False,
-             remove_page_footers: bool = False,
-             remove_table_of_contents: bool = False,
-             valid_languages: Optional[List[str]] = None,
-             id_hash_keys: Optional[List[str]] = None,
-             add_page_number: bool = True)
+def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
 ```

 **Arguments**:
@@ -543,12 +487,7 @@ In this case the id will be generated by using the content and the defined metadata.

 #### ParsrConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, Any]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = "utf-8",
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text and tables from a PDF or DOCX using the open-source Parsr tool.
@@ -597,16 +536,7 @@ https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quick

 #### AzureConverter.\_\_init\_\_

 ```python
-def __init__(endpoint: str,
-             credential_key: str,
-             model_id: str = "prebuilt-document",
-             valid_languages: Optional[List[str]] = None,
-             save_json: bool = False,
-             preceding_context_len: int = 3,
-             following_context_len: int = 3,
-             merge_multiple_column_headers: bool = True,
-             id_hash_keys: Optional[List[str]] = None,
-             add_page_number: bool = True)
+def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
 ```

 **Arguments**:
@@ -641,14 +571,7 @@ In this case the id will be generated by using the content and the defined metadata.

 #### AzureConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, Any]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = "utf-8",
-            id_hash_keys: Optional[List[str]] = None,
-            pages: Optional[str] = None,
-            known_language: Optional[str] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document]
 ```

 Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.
@@ -680,11 +603,7 @@ See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales

 #### AzureConverter.convert\_azure\_json

 ```python
-def convert_azure_json(
-        file_path: Path,
-        meta: Optional[Dict[str, Any]] = None,
-        valid_languages: Optional[List[str]] = None,
-        id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text and tables from the JSON output of Azure's Form Recognizer service.
@@ -721,10 +640,7 @@ class TikaConverter(BaseConverter)

 #### TikaConverter.\_\_init\_\_

 ```python
-def __init__(tika_url: str = "http://localhost:9998/tika",
-             remove_numeric_tables: bool = False,
-             valid_languages: Optional[List[str]] = None,
-             id_hash_keys: Optional[List[str]] = None)
+def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None)
 ```

 **Arguments**:
@@ -750,12 +666,7 @@ In this case the id will be generated by using the content and the defined metadata.

 #### TikaConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = None,
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 **Arguments**:
@@ -799,12 +710,7 @@ class TextConverter(BaseConverter)

 #### TextConverter.convert

 ```python
-def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
-            remove_numeric_tables: Optional[bool] = None,
-            valid_languages: Optional[List[str]] = None,
-            encoding: Optional[str] = "utf-8",
-            id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Reads text from a txt file and executes optional preprocessing steps.
@@ -2421,6 +2421,11 @@
       "additionalProperties": false,
       "description": "Each parameter can reference other components defined in the same YAML file.",
       "properties": {
+        "extract_headlines": {
+          "default": false,
+          "title": "Extract Headlines",
+          "type": "boolean"
+        },
         "id_hash_keys": {
           "anyOf": [
             {
@@ -2440,6 +2445,11 @@
           "title": "Progress Bar",
           "type": "boolean"
         },
+        "remove_code_snippets": {
+          "default": true,
+          "title": "Remove Code Snippets",
+          "type": "boolean"
+        },
         "remove_numeric_tables": {
           "default": false,
           "title": "Remove Numeric Tables",
(The same hunks appear a second time because the change is applied to a second generated schema file.)

@@ -2421,6 +2421,11 @@
       "additionalProperties": false,
       "description": "Each parameter can reference other components defined in the same YAML file.",
       "properties": {
+        "extract_headlines": {
+          "default": false,
+          "title": "Extract Headlines",
+          "type": "boolean"
+        },
         "id_hash_keys": {
           "anyOf": [
             {
@@ -2440,6 +2445,11 @@
           "title": "Progress Bar",
           "type": "boolean"
         },
+        "remove_code_snippets": {
+          "default": true,
+          "title": "Remove Code Snippets",
+          "type": "boolean"
+        },
         "remove_numeric_tables": {
           "default": false,
           "title": "Remove Numeric Tables",
haystack/nodes/file_converter/markdown.py:

@@ -1,10 +1,10 @@
 import logging
 import re
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple, Any

 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, NavigableString
     from markdown import markdown
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
@@ -19,14 +19,46 @@ logger = logging.getLogger(__name__)


 class MarkdownConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+        remove_code_snippets: bool = True,
+        extract_headlines: bool = False,
+    ):
+        """
+        :param remove_numeric_tables: Not applicable.
+        :param valid_languages: Not applicable.
+        :param id_hash_keys: Generate the document ID from a custom list of strings that refer to the document's
+            attributes. To make sure you don't have duplicate documents in your DocumentStore if texts are
+            not unique, you can modify the metadata and pass for example, `"meta"` to this field ([`"content"`, `"meta"`]).
+            In this case, the ID is generated by using the content and the defined metadata.
+        :param progress_bar: Show a progress bar for the conversion.
+        :param remove_code_snippets: Whether to remove snippets from the markdown file.
+        :param extract_headlines: Whether to extract headings from the markdown file.
+        """
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+            id_hash_keys=id_hash_keys,
+            progress_bar=progress_bar,
+        )
+
+        self.remove_code_snippets = remove_code_snippets
+        self.extract_headlines = extract_headlines
+
     def convert(
         self,
         file_path: Path,
-        meta: Optional[Dict[str, str]] = None,
+        meta: Optional[Dict[str, Any]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "utf-8",
         id_hash_keys: Optional[List[str]] = None,
+        remove_code_snippets: Optional[bool] = None,
+        extract_headlines: Optional[bool] = None,
     ) -> List[Document]:
         """
         Reads text from a markdown file and executes optional preprocessing steps.
@@ -40,32 +72,53 @@ class MarkdownConverter(BaseConverter):
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param remove_code_snippets: Whether to remove snippets from the markdown file.
+        :param extract_headlines: Whether to extract headings from the markdown file.
         """
-        if id_hash_keys is None:
-            id_hash_keys = self.id_hash_keys
-
+        id_hash_keys = id_hash_keys if id_hash_keys is not None else self.id_hash_keys
+        remove_code_snippets = remove_code_snippets if remove_code_snippets is not None else self.remove_code_snippets
+        extract_headlines = extract_headlines if extract_headlines is not None else self.extract_headlines
+
         with open(file_path, encoding=encoding, errors="ignore") as f:
             markdown_text = f.read()
-            text = self.markdown_to_text(markdown_text)
+
+        # md -> html -> text since BeautifulSoup can extract text cleanly
+        html = markdown(markdown_text)
+
+        # remove code snippets
+        if remove_code_snippets:
+            html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
+            html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)
+        soup = BeautifulSoup(html, "html.parser")
+
+        if extract_headlines:
+            text, headlines = self._extract_text_and_headlines(soup)
+            if meta is None:
+                meta = {}
+            meta["headlines"] = headlines
+        else:
+            text = soup.get_text()

         document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
         return [document]

-    # Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
     @staticmethod
-    def markdown_to_text(markdown_string: str) -> str:
+    def _extract_text_and_headlines(soup: BeautifulSoup) -> Tuple[str, List[Dict]]:
         """
-        Converts a markdown string to plaintext
-
-        :param markdown_string: String in markdown format
+        Extracts text and headings from a soup object.
         """
-        # md -> html -> text since BeautifulSoup can extract text cleanly
-        html = markdown(markdown_string)
+        headline_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
+        headlines = []
+        text = ""
+        for desc in soup.descendants:
+            if desc.name in headline_tags:
+                current_headline = desc.get_text()
+                current_start_idx = len(text)
+                current_level = int(desc.name[-1]) - 1
+                headlines.append({"headline": current_headline, "start_idx": current_start_idx, "level": current_level})

-        # remove code snippets
-        html = re.sub(r"<pre>(.*?)</pre>", " ", html)
-        html = re.sub(r"<code>(.*?)</code>", " ", html)
+            if isinstance(desc, NavigableString):
+                text += desc.get_text()

-        # extract text
-        soup = BeautifulSoup(html, "html.parser")
-        text = "".join(soup.findAll(text=True))
-
-        return text
+        return text, headlines
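For readers outside haystack, a self-contained sketch of the same extraction idea, assuming the `markdown` and `beautifulsoup4` packages:

```python
from bs4 import BeautifulSoup, NavigableString
from markdown import markdown

html = markdown("# Title\n\nSome text.\n\n## Section\n\nMore text.")
soup = BeautifulSoup(html, "html.parser")

headlines, text = [], ""
for desc in soup.descendants:
    if desc.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
        # the heading's own text has not been appended yet, so len(text) is its start offset
        headlines.append({"headline": desc.get_text(), "start_idx": len(text), "level": int(desc.name[-1]) - 1})
    if isinstance(desc, NavigableString):
        text += desc.get_text()

print(headlines)  # [{'headline': 'Title', 'start_idx': 0, 'level': 0}, {'headline': 'Section', ...}]
```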
@@ -3,7 +3,7 @@ import re
 from copy import deepcopy
 from functools import partial, reduce
 from itertools import chain
-from typing import List, Optional, Generator, Set, Union
+from typing import List, Optional, Generator, Set, Union, Tuple, Dict

 try:
     from typing import Literal
@@ -47,8 +47,6 @@ iso639_to_nltk = {
     "ml": "malayalam",
 }

-EMPTY_PAGE_PLACEHOLDER = "@@@HAYSTACK_KEEP_PAGE@@@."
-

 class PreProcessor(BasePreProcessor):
     def __init__(
@@ -261,35 +259,22 @@ class PreProcessor(BasePreProcessor):
                 text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
             )

+        headlines = document.meta["headlines"] if "headlines" in document.meta else []
+
         if clean_whitespace:
-            pages = text.split("\f")
-            cleaned_pages = []
-            for page in pages:
-                if not page:
-                    # there are many "empty text" pages in a marketing document, as for example the cover page. If we just forget about them, we have a mismatch
-                    # with page numbers which causes problems later on. Therefore, we replace them with a dummy text, which will not be found by any query.
-                    cleaned_page = EMPTY_PAGE_PLACEHOLDER
-                else:
-                    lines = page.splitlines()
-                    cleaned_lines = []
-                    for line in lines:
-                        line = line.strip()
-                        cleaned_lines.append(line)
-                    cleaned_page = "\n".join(cleaned_lines)
-
-                cleaned_pages.append(cleaned_page)
-
-            text = "\f".join(cleaned_pages)
+            text, headlines = self._clean_whitespace(text=text, headlines=headlines)

         if clean_empty_lines:
-            text = re.sub(r"\n\n+", "\n\n", text)
+            text, headlines = self._clean_empty_lines(text=text, headlines=headlines)

         for substring in remove_substrings:
-            text = text.replace(substring, "")
+            text, headline = self._remove_substring(text=text, substring=substring, headlines=headlines)

         if text != document.content:
             document = deepcopy(document)
             document.content = text
+        if headlines:
+            document.meta["headlines"] = headlines

         return document
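The headline-aware helpers introduced in the next hunk all share one bookkeeping idea: removing characters before a headline shifts its `start_idx` left by the number of removed characters. A toy illustration:

```python
text = "  Intro\nHeadline"
headline_start = text.index("Headline")  # 8

cleaned = "\n".join(line.strip() for line in text.splitlines())  # "Intro\nHeadline"
# two leading spaces were stripped before the headline, so its offset shifts by 2
assert cleaned.index("Headline") == headline_start - 2
```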
@@ -328,131 +313,302 @@ class PreProcessor(BasePreProcessor):
             return [document]

         text = document.content
+        headlines = document.meta["headlines"] if "headlines" in document.meta else []
+
         if split_respect_sentence_boundary and split_by == "word":
-            # split by words ensuring no sub sentence splits
-            if self.add_page_number:
-                # SentenceTokenizer will remove "\f" if it is at the end of a sentence, so substituting it in these
-                # cases for "[NEW_PAGE]" to don't lose any page breaks.
-                text = self._substitute_page_breaks(text)
-            sentences = self._split_sentences(text)
-            word_count_slice = 0
-            cur_page = 1
-            splits_pages = []
-            list_splits = []
-            current_slice: List[str] = []
-            for sen in sentences:
-                if self.add_page_number and "[NEW_PAGE]" in sen:
-                    sen = sen.replace("[NEW_PAGE]", "\f")
-
-                word_count_sen = len(sen.split(" "))
-                if word_count_sen > split_length:
-                    long_sentence_message = f"One or more sentence found with word count higher than the split length."
-                    if long_sentence_message not in self.print_log:
-                        self.print_log.add(long_sentence_message)
-                        logger.warning(long_sentence_message)
-                if word_count_slice + word_count_sen > split_length:
-                    # Number of words exceeds split_length -> save current slice and start a new one
-                    if current_slice:
-                        list_splits.append(current_slice)
-                        splits_pages.append(cur_page)
-
-                    if split_overlap:
-                        overlap = []
-                        processed_sents = []
-                        word_count_overlap = 0
-                        current_slice_copy = deepcopy(current_slice)
-                        for idx, s in reversed(list(enumerate(current_slice))):
-                            sen_len = len(s.split(" "))
-                            if word_count_overlap < split_overlap:
-                                overlap.append(s)
-                                word_count_overlap += sen_len
-                                current_slice_copy.pop(idx)
-                            else:
-                                processed_sents = current_slice_copy
-                                break
-                        current_slice = list(reversed(overlap))
-                        word_count_slice = word_count_overlap
-                    else:
-                        processed_sents = current_slice
-                        current_slice = []
-                        word_count_slice = 0
-
-                    # Count number of page breaks in processed sentences
-                    if self.add_page_number:
-                        num_page_breaks = self._count_processed_page_breaks(
-                            sentences=processed_sents,
-                            split_overlap=split_overlap,
-                            overlapping_sents=current_slice,
-                            current_sent=sen,
-                        )
-                        cur_page += num_page_breaks
-
-                current_slice.append(sen)
-                word_count_slice += word_count_sen
-
-            if current_slice:
-                list_splits.append(current_slice)
-                splits_pages.append(cur_page)
-
-            text_splits = []
-            for sl in list_splits:
-                txt = " ".join(sl)
-                if len(txt) > 0:
-                    text_splits.append(txt)
+            text_splits, splits_pages, splits_start_idxs = self._split_by_word_respecting_sent_boundary(
+                text=text, split_length=split_length, split_overlap=split_overlap
+            )
         else:
-            # create individual "elements" of passage, sentence, or word
-            if split_by == "passage":
-                elements = text.split("\n\n")
-            elif split_by == "sentence":
-                if self.add_page_number:
-                    # SentenceTokenizer will remove "\f" if it is at the end of a sentence, so substituting it in these
-                    # cases for "[NEW_PAGE]" to don't lose any page breaks.
-                    text = self._substitute_page_breaks(text)
-                elements = self._split_sentences(text)
-            elif split_by == "word":
-                elements = text.split(" ")
-            else:
-                raise NotImplementedError(
-                    "PreProcessor only supports 'passage', 'sentence' or 'word' split_by options."
-                )
+            elements, split_at = self._split_into_units(text=text, split_by=split_by)

             # concatenate individual elements based on split_length & split_stride
-            if split_overlap:
-                segments = windowed(elements, n=split_length, step=split_length - split_overlap)
-            else:
-                segments = windowed(elements, n=split_length, step=split_length)
-            text_splits = []
-            splits_pages = []
-            cur_page = 1
-            for seg in segments:
-                current_units = [unit for unit in seg if unit is not None]
-                txt = " ".join(current_units)
-                if len(txt) > 0:
-                    text_splits.append(txt)
-                    splits_pages.append(cur_page)
-                    if self.add_page_number:
-                        processed_units = current_units[: split_length - split_overlap]
-                        num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
-                        cur_page += num_page_breaks
+            text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
+                elements=elements, split_length=split_length, split_overlap=split_overlap, split_at=split_at
+            )
         # create new document dicts for each text split
-        documents = []
-        for i, txt in enumerate(text_splits):
-            # now we want to get rid of the empty page placeholder and skip the split if there's nothing left
-            txt_clean = txt.replace(EMPTY_PAGE_PLACEHOLDER, "")
-            if not txt_clean.strip():
-                continue
-
-            doc = Document(content=txt_clean, meta=deepcopy(document.meta) or {}, id_hash_keys=id_hash_keys)
+        documents = self._create_docs_from_splits(
+            text_splits=text_splits,
+            splits_pages=splits_pages,
+            splits_start_idxs=splits_start_idxs,
+            headlines=headlines,
+            meta=document.meta or {},
+            id_hash_keys=id_hash_keys,
+        )

         return documents
+    @staticmethod
+    def _clean_whitespace(text: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
+        """
+        Strips whitespaces before or after each line in the text.
+        """
+        pages = text.split("\f")
+        cleaned_pages = []
+        cur_headline_idx = 0
+        num_headlines = len(headlines)
+        cur_char_idx = 0
+        num_removed_chars_total = 0
+        for page in pages:
+            lines = page.splitlines()
+            cleaned_lines = []
+            for idx, line in enumerate(lines):
+                old_line_len = len(line)
+                cleaned_line = line.strip()
+                cleaned_line_len = len(cleaned_line)
+                cur_char_idx += old_line_len + 1  # add 1 for newline char
+                if old_line_len != cleaned_line_len:
+                    num_removed_chars_current = old_line_len - cleaned_line_len
+                    num_removed_chars_total += num_removed_chars_current
+                    for headline_idx in range(cur_headline_idx, num_headlines):
+                        if cur_char_idx - num_removed_chars_total <= headlines[headline_idx]["start_idx"]:
+                            headlines[headline_idx]["start_idx"] -= num_removed_chars_current
+                        else:
+                            cur_headline_idx += 1
+
+                cleaned_lines.append(cleaned_line)
+            cleaned_page = "\n".join(cleaned_lines)
+            cleaned_pages.append(cleaned_page)
+
+        cleaned_text = "\f".join(cleaned_pages)
+        return cleaned_text, headlines
+    @staticmethod
+    def _clean_empty_lines(text: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
+        if headlines:
+            num_headlines = len(headlines)
+            multiple_new_line_matches = re.finditer(r"\n\n\n+", text)
+            cur_headline_idx = 0
+            num_removed_chars_accumulated = 0
+            for match in multiple_new_line_matches:
+                num_removed_chars_current = match.end() - match.start() - 2
+                for headline_idx in range(cur_headline_idx, num_headlines):
+                    if match.end() - num_removed_chars_accumulated <= headlines[headline_idx]["start_idx"]:
+                        headlines[headline_idx]["start_idx"] -= num_removed_chars_current
+                    else:
+                        cur_headline_idx += 1
+                num_removed_chars_accumulated += num_removed_chars_current
+
+        cleaned_text = re.sub(r"\n\n\n+", "\n\n", text)
+        return cleaned_text, headlines
+    @staticmethod
+    def _remove_substring(text: str, substring: str, headlines: List[Dict]) -> Tuple[str, List[Dict]]:
+        if headlines:
+            num_headlines = len(headlines)
+            multiple_substring_matches = re.finditer(substring, text)
+            cur_headline_idx = 0
+            num_removed_chars_accumulated = 0
+            for match in multiple_substring_matches:
+                for headline_idx in range(cur_headline_idx, num_headlines):
+                    if match.end() - num_removed_chars_accumulated <= headlines[headline_idx]["start_idx"]:
+                        headlines[headline_idx]["start_idx"] -= len(substring)
+                    else:
+                        cur_headline_idx += 1
+                num_removed_chars_accumulated += len(substring)
+
+        cleaned_text = text.replace(substring, "")
+        return cleaned_text, headlines
+    def _split_by_word_respecting_sent_boundary(
+        self, text: str, split_length: int, split_overlap: int
+    ) -> Tuple[List[str], List[int], List[int]]:
+        """
+        Splits the text into parts of split_length words while respecting sentence boundaries.
+        """
+        sentences = self._split_sentences(text)
+
+        word_count_slice = 0
+        cur_page = 1
+        cur_start_idx = 0
+        splits_pages = []
+        list_splits = []
+        splits_start_idxs = []
+        current_slice: List[str] = []
+        for sen in sentences:
+            word_count_sen = len(sen.split())
+
+            if word_count_sen > split_length:
+                long_sentence_message = (
+                    f"We found one or more sentences whose word count is higher than the split length."
+                )
+                if long_sentence_message not in self.print_log:
+                    self.print_log.add(long_sentence_message)
+                    logger.warning(long_sentence_message)
+
+            if word_count_slice + word_count_sen > split_length:
+                # Number of words exceeds split_length -> save current slice and start a new one
+                if current_slice:
+                    list_splits.append(current_slice)
+                    splits_pages.append(cur_page)
+                    splits_start_idxs.append(cur_start_idx)
+
+                if split_overlap:
+                    overlap = []
+                    processed_sents = []
+                    word_count_overlap = 0
+                    current_slice_copy = deepcopy(current_slice)
+                    for idx, s in reversed(list(enumerate(current_slice))):
+                        sen_len = len(s.split())
+                        if word_count_overlap < split_overlap:
+                            overlap.append(s)
+                            word_count_overlap += sen_len
+                            current_slice_copy.pop(idx)
+                        else:
+                            processed_sents = current_slice_copy
+                            break
+                    current_slice = list(reversed(overlap))
+                    word_count_slice = word_count_overlap
+                else:
+                    processed_sents = current_slice
+                    current_slice = []
+                    word_count_slice = 0
+
+                cur_start_idx += len("".join(processed_sents))
+
+                # Count number of page breaks in processed sentences
+                if self.add_page_number:
+                    num_page_breaks = self._count_processed_page_breaks(
+                        sentences=processed_sents,
+                        split_overlap=split_overlap,
+                        overlapping_sents=current_slice,
+                        current_sent=sen,
+                    )
+                    cur_page += num_page_breaks
+
+            current_slice.append(sen)
+            word_count_slice += word_count_sen
+
+        if current_slice:
+            list_splits.append(current_slice)
+            splits_pages.append(cur_page)
+            splits_start_idxs.append(cur_start_idx)
+
+        text_splits = []
+        for sl in list_splits:
+            txt = "".join(sl)
+            if len(txt) > 0:
+                text_splits.append(txt)
+
+        return text_splits, splits_pages, splits_start_idxs
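A usage sketch of this splitting path through the public API (the content string is made up):

```python
from haystack.nodes import PreProcessor
from haystack.schema import Document

preprocessor = PreProcessor(
    split_by="word", split_length=30, split_overlap=5, split_respect_sentence_boundary=True
)
splits = preprocessor.process([Document(content="Some long text. " * 50)])

# Every split carries its index; page and headline info are added when available.
print(splits[0].meta["_split_id"], splits[0].meta.get("page"))
```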
+    def _split_into_units(self, text: str, split_by: str) -> Tuple[List[str], str]:
+        if split_by == "passage":
+            elements = text.split("\n\n")
+            split_at = "\n\n"
+        elif split_by == "sentence":
+            elements = self._split_sentences(text)
+            split_at = ""  # whitespace will be preserved while splitting text into sentences
+        elif split_by == "word":
+            elements = text.split(" ")
+            split_at = " "
+        else:
+            raise NotImplementedError("PreProcessor only supports 'passage', 'sentence' or 'word' split_by options.")
+
+        return elements, split_at
+    def _concatenate_units(
+        self, elements: List[str], split_length: int, split_overlap: int, split_at: str
+    ) -> Tuple[List[str], List[int], List[int]]:
+        """
+        Concatenates the elements into parts of split_length units.
+        """
+        segments = windowed(elements, n=split_length, step=split_length - split_overlap)
+        split_at_len = len(split_at)
+        text_splits = []
+        splits_pages = []
+        splits_start_idxs = []
+        cur_page = 1
+        cur_start_idx = 0
+        for seg in segments:
+            current_units = [unit for unit in seg if unit is not None]
+            txt = split_at.join(current_units)
+            if len(txt) > 0:
+                text_splits.append(txt)
+                splits_pages.append(cur_page)
+                splits_start_idxs.append(cur_start_idx)
+            processed_units = current_units[: split_length - split_overlap]
+            cur_start_idx += len((split_at_len * " ").join(processed_units)) + split_at_len
+            if self.add_page_number:
+                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)
+                cur_page += num_page_breaks
+
+        return text_splits, splits_pages, splits_start_idxs
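`windowed` is presumably the `more_itertools` helper the PreProcessor already relied on; a quick look at how it produces the overlapping slices consumed above:

```python
from more_itertools import windowed

# n=split_length, step=split_length - split_overlap
list(windowed(["a", "b", "c", "d", "e"], n=3, step=2))
# -> [('a', 'b', 'c'), ('c', 'd', 'e')]: consecutive windows share split_overlap=1 unit;
#    when a final window is short, the missing positions are padded with None
#    (hence the `unit is not None` filter above).
```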
+    def _create_docs_from_splits(
+        self,
+        text_splits: List[str],
+        splits_pages: List[int],
+        splits_start_idxs: List[int],
+        headlines: List[Dict],
+        meta: Dict,
+        id_hash_keys=Optional[List[str]],
+    ) -> List[Document]:
+        """
+        Creates Document objects from text splits enriching them with page number and headline information if given.
+        """
+        documents = []
+
+        earliest_rel_hl = 0
+        for i, txt in enumerate(text_splits):
+            meta = deepcopy(meta)
+            doc = Document(content=txt, meta=meta, id_hash_keys=id_hash_keys)
+            doc.meta["_split_id"] = i
+            if self.add_page_number:
+                doc.meta["page"] = splits_pages[i]
+            if headlines:
+                split_start_idx = splits_start_idxs[i]
+                relevant_headlines, earliest_rel_hl = self._extract_relevant_headlines_for_split(
+                    headlines=headlines, split_txt=txt, split_start_idx=split_start_idx, earliest_rel_hl=earliest_rel_hl
+                )
+                doc.meta["headlines"] = relevant_headlines
+
+            documents.append(doc)
+
+        return documents
+    @staticmethod
+    def _extract_relevant_headlines_for_split(
+        headlines: List[Dict], split_txt: str, split_start_idx: int, earliest_rel_hl: int
+    ) -> Tuple[List[Dict], int]:
+        """
+        If you give it a list of headlines, a text split, and the start index of the split in the original text,
+        this method extracts the headlines that are relevant for the split.
+        """
+        relevant_headlines = []
+
+        for headline_idx in range(earliest_rel_hl, len(headlines)):
+            # Headline is part of current split
+            if split_start_idx <= headlines[headline_idx]["start_idx"] < split_start_idx + len(split_txt):
+                headline_copy = deepcopy(headlines[headline_idx])
+                headline_copy["start_idx"] = headlines[headline_idx]["start_idx"] - split_start_idx
+                relevant_headlines.append(headline_copy)
+            # Headline appears before current split, but might be relevant for current split
+            elif headlines[headline_idx]["start_idx"] < split_start_idx:
+                # Check if following headlines are on a higher level
+                headline_to_check = headline_idx + 1
+                headline_is_relevant = True
+                while (
+                    headline_to_check < len(headlines) and headlines[headline_to_check]["start_idx"] <= split_start_idx
+                ):
+                    if headlines[headline_to_check]["level"] <= headlines[headline_idx]["level"]:
+                        headline_is_relevant = False
+                        break
+                    headline_to_check += 1
+                if headline_is_relevant:
+                    headline_copy = deepcopy(headlines[headline_idx])
+                    headline_copy["start_idx"] = None
+                    relevant_headlines.append(headline_copy)
+                else:
+                    earliest_rel_hl += 1
+            # Headline (and all subsequent ones) only relevant for later splits
+            elif headlines[headline_idx]["start_idx"] > split_start_idx + len(split_txt):
+                break
+
+        return relevant_headlines, earliest_rel_hl
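The carry-over rule above, on a toy example (all values illustrative):

```python
headlines = [
    {"headline": "Intro", "start_idx": 0, "level": 0},      # opened before the split
    {"headline": "Details", "start_idx": 120, "level": 1},  # inside the split
]
# For a split covering characters 100-200 of the original text:
#  - "Intro" started earlier but is still the governing section (no later headline of
#    the same or higher level before the split), so it is kept with start_idx=None.
#  - "Details" falls inside the split and is re-based to start_idx 20 (= 120 - 100).
```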
     def _find_and_remove_header_footer(
         self, text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
     ) -> str:
@@ -542,46 +698,74 @@ class PreProcessor(BasePreProcessor):
         :param text: str, text to tokenize
         :return: list[str], list of sentences
         """
-        sentences = []
-
         language_name = iso639_to_nltk.get(self.language)

+        sentence_tokenizer = self._load_sentence_tokenizer(language_name)
+        # The following adjustment of PunktSentenceTokenizer is inspired by:
+        # https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
+        # It is needed for preserving whitespace while splitting text into sentences.
+        period_context_fmt = r"""
+            %(SentEndChars)s             # a potential sentence ending
+            \s*                          # match potential whitespace (is originally in lookahead assertion)
+            (?=(?P<after_tok>
+                %(NonWord)s              # either other punctuation
+                |
+                (?P<next_tok>\S+)        # or some other token - original version: \s+(?P<next_tok>\S+)
+            ))"""
+        re_period_context = re.compile(
+            period_context_fmt
+            % {
+                "NonWord": sentence_tokenizer._lang_vars._re_non_word_chars,
+                "SentEndChars": sentence_tokenizer._lang_vars._re_sent_end_chars,
+            },
+            re.UNICODE | re.VERBOSE,
+        )
+        sentence_tokenizer._lang_vars._re_period_context = re_period_context
+
+        sentences = sentence_tokenizer.tokenize(text)
+        return sentences
+
+    def _load_sentence_tokenizer(self, language_name: Optional[str]) -> nltk.tokenize.punkt.PunktSentenceTokenizer:
+
         # Try to load a custom model from 'tokenizer_model_path'
         if self.tokenizer_model_folder is not None:
             tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
             try:
                 sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle")
-                sentences = sentence_tokenizer.tokenize(text)
-            except LookupError:
-                logger.exception("PreProcessor couldn't load sentence tokenizer from %s", tokenizer_model_path)
-            except (UnpicklingError, ValueError) as e:
-                logger.exception(
-                    "PreProcessor couldn't determine model format of sentence tokenizer at %s", tokenizer_model_path
-                )
-            if sentences:
-                return sentences
+            except (LookupError, UnpicklingError, ValueError) as e:
+                if isinstance(e, LookupError):
+                    logger.exception(f"PreProcessor couldn't load sentence tokenizer from %s", tokenizer_model_path)
+                else:
+                    logger.exception(
+                        f"PreProcessor couldn't determine model format of sentence tokenizer at %s",
+                        tokenizer_model_path,
+                    )

-            # NLTK failed to split, fallback to the default model or to English
-            if language_name is not None:
-                logger.error(
-                    f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. Using default {self.language} model."
-                )
-                return nltk.tokenize.sent_tokenize(text, language=language_name)
-
-            logger.error(
-                f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. Using English instead."
-            )
-            return nltk.tokenize.sent_tokenize(text, language="english")
+            # NLTK failed to load custom SentenceTokenizer, fallback to the default model or to English
+            if language_name is not None:
+                logger.error(
+                    f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. "
+                    f"Using default {self.language} model."
+                )
+                sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
+            else:
+                logger.error(
+                    f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. "
+                    f"Using English instead."
+                )
+                sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")

         # Use a default NLTK model
-        if language_name is not None:
-            return nltk.tokenize.sent_tokenize(text, language=language_name)
-
-        logger.error(
-            f"PreProcessor couldn't find default sentence tokenizer model for {self.language}. Using English instead. "
-            "You may train your own model and use the 'tokenizer_model_folder' parameter."
-        )
-        return nltk.tokenize.sent_tokenize(text, language="english")
+        elif language_name is not None:
+            sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/{language_name}.pickle")
+        else:
+            logger.error(
+                f"PreProcessor couldn't find the default sentence tokenizer model for {self.language}. "
+                f" Using English instead. You may train your own model and use the 'tokenizer_model_folder' parameter."
+            )
+            sentence_tokenizer = nltk.data.load(f"tokenizers/punkt/english.pickle")
+
+        return sentence_tokenizer

     @staticmethod
     def _count_processed_page_breaks(
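The point of the Punkt adjustment above: tokenized sentences keep their trailing whitespace, so concatenating them reproduces the original text and headline offsets stay valid. Roughly:

```python
text = "First sentence.  Second one."
# default Punkt:   ["First sentence.", "Second one."]   -> "".join(...) != text, offsets drift
# adjusted Punkt:  ["First sentence.  ", "Second one."] -> "".join(...) == text
```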
@@ -603,13 +787,3 @@ class PreProcessor(BasePreProcessor):
             num_page_breaks += 1

         return num_page_breaks
-
-    @staticmethod
-    def _substitute_page_breaks(text: str) -> str:
-        """
-        This method substitutes the page break character "\f" for "[NEW_PAGE]" if it is at the end of a sentence.
-        """
-        # This regex matches any of sentence-ending punctuation (one of ".", ":", "?", "!") followed by a page break
-        # character ("\f") and replaces the page break character with "[NEW_PAGE]" keeping the original sentence-ending
-        # punctuation.
-        return re.sub(r"([\.:?!])\f", r"\1 [NEW_PAGE]", text)
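For reference, the behavior of the regex in the removed `_substitute_page_breaks` helper:

```python
import re

re.sub(r"([\.:?!])\f", r"\1 [NEW_PAGE]", "End of page.\fMid\fdle")
# -> 'End of page. [NEW_PAGE]Mid\x0cdle': only a page break right after
#    sentence-ending punctuation is substituted; one inside a word is left alone.
```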
@@ -141,6 +141,31 @@ def test_markdown_converter():
     assert document.content.startswith("What to build with Haystack")


+def test_markdown_converter_headline_extraction():
+    expected_headlines = [
+        ("What to build with Haystack", 1),
+        ("Core Features", 1),
+        ("Quick Demo", 1),
+        ("2nd level headline for testing purposes", 2),
+        ("3rd level headline for testing purposes", 3),
+    ]
+
+    converter = MarkdownConverter(extract_headlines=True, remove_code_snippets=False)
+    document = converter.convert(file_path=SAMPLES_PATH / "markdown" / "sample.md")[0]
+
+    # Check if correct number of headlines are extracted
+    assert len(document.meta["headlines"]) == 5
+    for extracted_headline, (expected_headline, expected_level) in zip(document.meta["headlines"], expected_headlines):
+        # Check if correct headline and level is extracted
+        assert extracted_headline["headline"] == expected_headline
+        assert extracted_headline["level"] == expected_level
+
+        # Check if correct start_idx is extracted
+        start_idx = extracted_headline["start_idx"]
+        hl_len = len(extracted_headline["headline"])
+        assert extracted_headline["headline"] == document.content[start_idx : start_idx + hl_len]
+
+
 def test_azure_converter():
     # Check if Form Recognizer endpoint and credential key in environment variables
     if "AZURE_FORMRECOGNIZER_ENDPOINT" in os.environ and "AZURE_FORMRECOGNIZER_KEY" in os.environ:
@@ -26,6 +26,15 @@ paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test
 in the sentence.
 """

+HEADLINES = [
+    {"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
+    {"headline": "paragraph_1", "start_idx": 198, "level": 1},
+    {"headline": "sample sentence in paragraph_2", "start_idx": 223, "level": 0},
+    {"headline": "in paragraph_2", "start_idx": 365, "level": 1},
+    {"headline": "sample sentence in paragraph_3", "start_idx": 434, "level": 0},
+    {"headline": "trick the test", "start_idx": 603, "level": 1},
+]
+
 LEGAL_TEXT_PT = """
 A Lei nº 9.514/1997, que instituiu a alienação fiduciária de
 bens imóveis, é norma especial e posterior ao Código de Defesa do
@@ -124,8 +133,8 @@ def test_preprocess_word_split():
     documents = preprocessor.process(document)
     for i, doc in enumerate(documents):
         if i == 0:
-            assert len(doc.content.split(" ")) == 14
-        assert len(doc.content.split(" ")) <= 15 or doc.content.startswith("This is to trick")
+            assert len(doc.content.split()) == 14
+        assert len(doc.content.split()) <= 15 or doc.content.startswith("This is to trick")
     assert len(documents) == 8

     preprocessor = PreProcessor(
@@ -244,9 +253,217 @@ def test_page_number_extraction_on_empty_pages():
     assert documents[1].content.strip() == text_page_three


-def test_substitute_page_break():
-    # Page breaks at the end of sentences should be replaced by "[NEW_PAGE]", while page breaks in between of
-    # sentences should not be replaced.
-    result = PreProcessor._substitute_page_breaks(TEXT)
-    assert result[211:221] == "[NEW_PAGE]"
-    assert result[654] == "\f"
+def test_headline_processing_split_by_word():
+    expected_headlines = [
+        [{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0}],
+        [
+            {"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
+            {"headline": "paragraph_1", "start_idx": 19, "level": 1},
+            {"headline": "sample sentence in paragraph_2", "start_idx": 44, "level": 0},
+            {"headline": "in paragraph_2", "start_idx": 186, "level": 1},
+        ],
+        [
+            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
+            {"headline": "in paragraph_2", "start_idx": None, "level": 1},
+            {"headline": "sample sentence in paragraph_3", "start_idx": 53, "level": 0},
+        ],
+        [
+            {"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
+            {"headline": "trick the test", "start_idx": 36, "level": 1},
+        ],
+    ]
+
+    document = Document(content=TEXT, meta={"headlines": HEADLINES})
+    preprocessor = PreProcessor(
+        split_length=30, split_overlap=0, split_by="word", split_respect_sentence_boundary=False
+    )
+    documents = preprocessor.process(document)
+
+    for doc, expected in zip(documents, expected_headlines):
+        assert doc.meta["headlines"] == expected
+
+
+def test_headline_processing_split_by_word_overlap():
+    expected_headlines = [
+        [{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0}],
+        [
+            {"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
+            {"headline": "paragraph_1", "start_idx": 71, "level": 1},
+            {"headline": "sample sentence in paragraph_2", "start_idx": 96, "level": 0},
+        ],
+        [
+            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
+            {"headline": "in paragraph_2", "start_idx": 110, "level": 1},
+            {"headline": "sample sentence in paragraph_3", "start_idx": 179, "level": 0},
+        ],
+        [
+            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
+            {"headline": "in paragraph_2", "start_idx": None, "level": 1},
+            {"headline": "sample sentence in paragraph_3", "start_idx": 53, "level": 0},
+        ],
+        [
+            {"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
+            {"headline": "trick the test", "start_idx": 95, "level": 1},
+        ],
+    ]
+
+    document = Document(content=TEXT, meta={"headlines": HEADLINES})
+    preprocessor = PreProcessor(
+        split_length=30, split_overlap=10, split_by="word", split_respect_sentence_boundary=False
+    )
+    documents = preprocessor.process(document)
+
+    for doc, expected in zip(documents, expected_headlines):
+        assert doc.meta["headlines"] == expected
+
+
def test_headline_processing_split_by_word_respect_sentence_boundary():
    expected_headlines = [
        [{"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0}],
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
            {"headline": "paragraph_1", "start_idx": 71, "level": 1},
            {"headline": "sample sentence in paragraph_2", "start_idx": 96, "level": 0},
        ],
        [
            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
            {"headline": "in paragraph_2", "start_idx": 110, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
            {"headline": "in paragraph_2", "start_idx": None, "level": 1},
            {"headline": "sample sentence in paragraph_3", "start_idx": 53, "level": 0},
        ],
        [
            {"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
            {"headline": "trick the test", "start_idx": 95, "level": 1},
        ],
    ]

    document = Document(content=TEXT, meta={"headlines": HEADLINES})
    preprocessor = PreProcessor(split_length=30, split_overlap=5, split_by="word", split_respect_sentence_boundary=True)
    documents = preprocessor.process(document)

    for doc, expected in zip(documents, expected_headlines):
        assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_sentence():
    expected_headlines = [
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
            {"headline": "paragraph_1", "start_idx": 198, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
            {"headline": "paragraph_1", "start_idx": None, "level": 1},
            {"headline": "sample sentence in paragraph_2", "start_idx": 10, "level": 0},
            {"headline": "in paragraph_2", "start_idx": 152, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
            {"headline": "in paragraph_2", "start_idx": None, "level": 1},
            {"headline": "sample sentence in paragraph_3", "start_idx": 10, "level": 0},
            {"headline": "trick the test", "start_idx": 179, "level": 1},
        ],
    ]

    document = Document(content=TEXT, meta={"headlines": HEADLINES})
    preprocessor = PreProcessor(
        split_length=5, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)

    for doc, expected in zip(documents, expected_headlines):
        assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_sentence_overlap():
    expected_headlines = [
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
            {"headline": "paragraph_1", "start_idx": 198, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
            {"headline": "paragraph_1", "start_idx": 29, "level": 1},
            {"headline": "sample sentence in paragraph_2", "start_idx": 54, "level": 0},
            {"headline": "in paragraph_2", "start_idx": 196, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
            {"headline": "in paragraph_2", "start_idx": 26, "level": 1},
            {"headline": "sample sentence in paragraph_3", "start_idx": 95, "level": 0},
        ],
        [
            {"headline": "sample sentence in paragraph_3", "start_idx": None, "level": 0},
            {"headline": "trick the test", "start_idx": 95, "level": 1},
        ],
    ]

    document = Document(content=TEXT, meta={"headlines": HEADLINES})
    preprocessor = PreProcessor(
        split_length=5, split_overlap=1, split_by="sentence", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)

    for doc, expected in zip(documents, expected_headlines):
        assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_passage():
    expected_headlines = [
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
            {"headline": "paragraph_1", "start_idx": 198, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
            {"headline": "paragraph_1", "start_idx": None, "level": 1},
            {"headline": "sample sentence in paragraph_2", "start_idx": 10, "level": 0},
            {"headline": "in paragraph_2", "start_idx": 152, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
            {"headline": "in paragraph_2", "start_idx": None, "level": 1},
            {"headline": "sample sentence in paragraph_3", "start_idx": 10, "level": 0},
            {"headline": "trick the test", "start_idx": 179, "level": 1},
        ],
    ]

    document = Document(content=TEXT, meta={"headlines": HEADLINES})
    preprocessor = PreProcessor(
        split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)

    for doc, expected in zip(documents, expected_headlines):
        assert doc.meta["headlines"] == expected
def test_headline_processing_split_by_passage_overlap():
    expected_headlines = [
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": 11, "level": 0},
            {"headline": "paragraph_1", "start_idx": 198, "level": 1},
            {"headline": "sample sentence in paragraph_2", "start_idx": 223, "level": 0},
            {"headline": "in paragraph_2", "start_idx": 365, "level": 1},
        ],
        [
            {"headline": "sample sentence in paragraph_1", "start_idx": None, "level": 0},
            {"headline": "paragraph_1", "start_idx": None, "level": 1},
            {"headline": "sample sentence in paragraph_2", "start_idx": 10, "level": 0},
            {"headline": "in paragraph_2", "start_idx": 152, "level": 1},
            {"headline": "sample sentence in paragraph_3", "start_idx": 221, "level": 0},
            {"headline": "trick the test", "start_idx": 390, "level": 1},
        ],
    ]

    document = Document(content=TEXT, meta={"headlines": HEADLINES})
    preprocessor = PreProcessor(
        split_length=2, split_overlap=1, split_by="passage", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)

    for doc, expected in zip(documents, expected_headlines):
        assert doc.meta["headlines"] == expected
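Taken together, the expectations in the headline tests above imply a consistent contract for how headline metadata travels through splitting (a summary inferred from the test data, not from the implementation): each output split keeps the most recent preceding headline of every level as context with `start_idx` set to `None`, while headlines that actually occur inside the split get a `start_idx` recomputed relative to that split's own content. A hypothetical example of the resulting meta for a single split:

```python
# Hypothetical values -- only the schema and the None convention follow the tests above.
split_meta = {
    "headlines": [
        # most recent level-0 headline from an earlier split, kept for context
        {"headline": "sample sentence in paragraph_2", "start_idx": None, "level": 0},
        # headline that appears in this split, index relative to its content
        {"headline": "sample sentence in paragraph_3", "start_idx": 53, "level": 0},
    ]
}
```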
@ -53,3 +53,6 @@ The quickest way to see what Haystack offers is to start a [Docker Compose](http
```
# git clone https://github.com/deepset-ai/haystack.git
```

### 2nd level headline for testing purposes
#### 3rd level headline for testing purposes
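The headings appended to the markdown sample in the hunk above exist only to exercise the new heading extraction. A minimal sketch of how that feature would typically be used end to end, assuming the converter stores extracted headings under `meta["headlines"]` with the same `headline` / `start_idx` / `level` schema the tests expect (the file path and split parameters here are illustrative):

```python
from pathlib import Path

from haystack.nodes import MarkdownConverter, PreProcessor

converter = MarkdownConverter()
# Illustrative path -- substitute the markdown sample modified in this diff.
docs = converter.convert(file_path=Path("sample.md"), meta=None)

preprocessor = PreProcessor(split_by="word", split_length=30, split_respect_sentence_boundary=False)
splits = preprocessor.process(docs)

for split in splits:
    # Each split is expected to carry its adjusted headline metadata.
    print(split.meta.get("headlines"))
```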