Add support for custom trained PunktTokenizer in PreProcessor (#2783)

* Add support for model folder into BasePreProcessor

* First draft of custom model on PreProcessor

* Update Documentation & Code Style

* Update tests to support custom models

* Update Documentation & Code Style

* Test for wrong models in custom folder

* Default to ISO names on custom model folder

Use long names only when needed

* Update Documentation & Code Style

* Refactoring language names usage

* Update fallback logic

* Check unpickling error

* Updated tests using parametrize

Co-authored-by:  Sara Zan <sara.zanzottera@deepset.ai>

* Refactored common logic

* Add format control to NLTK load

* Tests improvements

Add a sample for specialized model

* Update Documentation & Code Style

* Minor log text update

* Log model format exception details

* Change pickle protocol version to 4 for 3.7 compat

* Removed unnecessary model folder parameter

Changed logic comparisons

Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>

* Update Documentation & Code Style

* Removed unused import

* Change errors with warnings

* Change to absolute path

* Rename sentence tokenizer method

Co-authored-by: tstadel

* Check document content is a string before process

* Change to log errors and not warnings

* Update Documentation & Code Style

* Improve split sentences method

Co-authored-by:  Sara Zan  <sara.zanzottera@deepset.ai>

* Update Documentation & Code Style

* Empty commit - trigger workflow

* Remove superfluous parameters

Co-authored-by: tstadel

* Explicit None checking

Co-authored-by: tstadel

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in:
Daniel Bichuetti 2022-07-21 04:50:45 -03:00 committed by GitHub
parent f51587b4ad
commit 3948b997b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 168 additions and 18 deletions

View File

@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor)
#### PreProcessor.\_\_init\_\_
```python
def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en", id_hash_keys: Optional[List[str]] = None)
def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None)
```
**Arguments**:
@ -64,6 +64,7 @@ Set the value to 0 to ensure there is no overlap among the documents after split
to True, the individual split will always have complete sentences &
the number of words will be <= split_length.
- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
- `tokenizer_model_folder`: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).

View File

@ -3572,6 +3572,18 @@
"default": true,
"type": "boolean"
},
"tokenizer_model_folder": {
"title": "Tokenizer Model Folder",
"anyOf": [
{
"type": "string"
},
{
"type": "string",
"format": "path"
}
]
},
"language": {
"title": "Language",
"default": "en",

View File

@ -1,6 +1,7 @@
from typing import List, Optional, Union
from abc import abstractmethod
from haystack.nodes.base import BaseComponent
from haystack.schema import Document

View File

@ -5,6 +5,8 @@ from functools import partial, reduce
from itertools import chain
from typing import List, Optional, Generator, Set, Union
import warnings
from pathlib import Path
from pickle import UnpicklingError
import nltk
from more_itertools import windowed
@ -51,6 +53,7 @@ class PreProcessor(BasePreProcessor):
split_length: int = 200,
split_overlap: int = 0,
split_respect_sentence_boundary: bool = True,
tokenizer_model_folder: Optional[Union[str, Path]] = None,
language: str = "en",
id_hash_keys: Optional[List[str]] = None,
):
@ -75,6 +78,7 @@ class PreProcessor(BasePreProcessor):
to True, the individual split will always have complete sentences &
the number of words will be <= split_length.
:param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
:param tokenizer_model_folder: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@ -95,7 +99,8 @@ class PreProcessor(BasePreProcessor):
self.split_length = split_length
self.split_overlap = split_overlap
self.split_respect_sentence_boundary = split_respect_sentence_boundary
self.language = iso639_to_nltk.get(language, language)
self.language = language
self.tokenizer_model_folder = tokenizer_model_folder
self.print_log: Set[str] = set()
self.id_hash_keys = id_hash_keys
@ -229,6 +234,11 @@ class PreProcessor(BasePreProcessor):
# Mainly needed for type checking
if not isinstance(document, Document):
raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
if type(document.content) is not str:
logger.error("Document content is not of type str. Nothing to clean.")
return document
text = document.content
if clean_header_footer:
text = self._find_and_remove_header_footer(
@ -286,11 +296,16 @@ class PreProcessor(BasePreProcessor):
if split_respect_sentence_boundary and split_by != "word":
raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with split_by='word'.")
if type(document.content) is not str:
logger.error("Document content is not of type str. Nothing to split.")
return [document]
text = document.content
if split_respect_sentence_boundary and split_by == "word":
# split by words ensuring no sub sentence splits
sentences = nltk.tokenize.sent_tokenize(text, language=self.language)
sentences = self._split_sentences(text)
word_count = 0
list_splits = []
current_slice: List[str] = []
@ -334,7 +349,7 @@ class PreProcessor(BasePreProcessor):
if split_by == "passage":
elements = text.split("\n\n")
elif split_by == "sentence":
elements = nltk.tokenize.sent_tokenize(text, language=self.language)
elements = self._split_sentences(text)
elif split_by == "word":
elements = text.split(" ")
else:
@ -444,3 +459,50 @@ class PreProcessor(BasePreProcessor):
# no common sequence found
longest = ""
return longest if longest.strip() else None
def _split_sentences(self, text: str) -> List[str]:
    """
    Tokenize text into sentences.

    If `self.tokenizer_model_folder` is set, the tokenizer is loaded from the file
    `<tokenizer_model_folder>/<self.language>.pickle`. If that file cannot be loaded, the
    failure is logged and the default NLTK model is used instead: the one mapped to
    `self.language` in `iso639_to_nltk`, or the English one if there is no such mapping.

    :param text: str, text to tokenize
    :return: list[str], list of sentences
    """
    language_name = iso639_to_nltk.get(self.language)

    # Try to load a custom model from 'tokenizer_model_folder'
    if self.tokenizer_model_folder is not None:
        tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
        try:
            # NOTE: this unpickles the file, so 'tokenizer_model_folder' must only contain trusted models.
            sentence_tokenizer = nltk.data.load(f"file:{tokenizer_model_path}", format="pickle")
            # Return whatever the custom model produces, even an empty list: an empty result
            # (e.g. for empty text) is not a loading failure and must not trigger the fallback.
            return sentence_tokenizer.tokenize(text)
        except LookupError:
            logger.exception(f"PreProcessor couldn't load sentence tokenizer from {tokenizer_model_path}")
        except (UnpicklingError, ValueError):
            logger.exception(
                f"PreProcessor couldn't determine model format of sentence tokenizer at {tokenizer_model_path}."
            )

        # The custom model could not be used, fallback to the default model or to English
        if language_name is not None:
            logger.error(
                f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. Using default {self.language} model."
            )
            return nltk.tokenize.sent_tokenize(text, language=language_name)

        logger.error(
            f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. Using English instead."
        )
        return nltk.tokenize.sent_tokenize(text, language="english")

    # Use a default NLTK model
    if language_name is not None:
        return nltk.tokenize.sent_tokenize(text, language=language_name)

    logger.error(
        f"PreProcessor couldn't find default sentence tokenizer model for {self.language}. Using English instead. "
        "You may train your own model and use the 'tokenizer_model_folder' parameter."
    )
    return nltk.tokenize.sent_tokenize(text, language="english")

View File

@ -1,5 +1,6 @@
import sys
from pathlib import Path
import os
import pytest
@ -9,6 +10,10 @@ from haystack.nodes.preprocessor.preprocessor import PreProcessor
from ..conftest import SAMPLES_PATH
NLTK_TEST_MODELS = SAMPLES_PATH.absolute() / "preprocessor" / "nltk_models"
TEXT = """
This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.
@ -21,20 +26,90 @@ paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test
in the sentence.
"""
LEGAL_TEXT_PT = """
A Lei 9.514/1997, que instituiu a alienação fiduciária de
bens imóveis, é norma especial e posterior ao Código de Defesa do
Consumidor CDC. Em tais circunstâncias, o inadimplemento do
devedor fiduciante enseja a aplicação da regra prevista nos arts. 26 e 27
da lei especial (REsp 1.871.911/SP, rel. Min. Nancy Andrighi, DJe
25/8/2020).
A Emenda Constitucional n. 35 alterou substancialmente esse mecanismo,
ao determinar, na nova redação conferida ao art. 53: § 3º Recebida a
denúncia contra o Senador ou Deputado, por crime ocorrido após a
diplomação, o Supremo Tribunal Federal dará ciência à Casa respectiva, que,
por iniciativa de partido político nela representado e pelo voto da maioria de
seus membros, poderá, até a decisão final, sustar o andamento da ação.
Vale ressaltar, contudo, que existem, antes do encaminhamento ao
Presidente da República, os chamados autógrafos. Os autógrafos ocorrem
com o texto definitivamente aprovado pelo Plenário ou pelas comissões,
quando for o caso. Os autógrafos devem reproduzir com absoluta fidelidade a
redação final aprovada. O projeto aprovado será encaminhado em autógrafos
ao Presidente da República. O tema encontra-se regulamentado pelo art. 200
do RICD e arts. 328 a 331 do RISF.
"""
@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split(split_length_and_results):
split_length, expected_documents_count = split_length_and_results
def test_preprocess_sentence_split():
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
split_length=split_length, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 15
assert len(documents) == expected_documents_count
@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_wrong_file_format(split_length_and_results):
split_length, expected_documents_count = split_length_and_results
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=10, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
split_length=split_length,
split_overlap=0,
split_by="sentence",
split_respect_sentence_boundary=False,
tokenizer_model_folder=NLTK_TEST_MODELS / "wrong",
language="en",
)
documents = preprocessor.process(document)
assert len(documents) == 2
assert len(documents) == expected_documents_count
@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
def test_preprocess_sentence_split_custom_models_non_default_language(split_length_and_results):
    """
    Sentence splitting with language "ca" and no custom model folder yields the expected document count.
    """
    # NOTE(review): presumably "ca" has no default NLTK model, so the English fallback is exercised - confirm.
    sentences_per_doc, expected_documents_count = split_length_and_results

    splitter = PreProcessor(
        language="ca",
        split_by="sentence",
        split_length=sentences_per_doc,
        split_overlap=0,
        split_respect_sentence_boundary=False,
    )
    produced = splitter.process(Document(content=TEXT))

    assert len(produced) == expected_documents_count
@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
def test_preprocess_sentence_split_custom_models(split_length_and_results):
    """
    Sentence splitting of the Portuguese legal sample with language "pt" and the
    test model folder yields the expected document count.
    """
    sentences_per_doc, expected_documents_count = split_length_and_results

    splitter = PreProcessor(
        tokenizer_model_folder=NLTK_TEST_MODELS,
        language="pt",
        split_by="sentence",
        split_length=sentences_per_doc,
        split_overlap=0,
        split_respect_sentence_boundary=False,
    )
    produced = splitter.process(Document(content=LEGAL_TEXT_PT))

    assert len(produced) == expected_documents_count
def test_preprocess_word_split():
@ -64,19 +139,16 @@ def test_preprocess_word_split():
assert len(documents) == 15
def test_preprocess_passage_split():
@pytest.mark.parametrize("split_length_and_results", [(1, 3), (2, 2)])
def test_preprocess_passage_split(split_length_and_results):
split_length, expected_documents_count = split_length_and_results
document = Document(content=TEXT)
preprocessor = PreProcessor(
split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
split_length=split_length, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 3
preprocessor = PreProcessor(
split_length=2, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
)
documents = preprocessor.process(document)
assert len(documents) == 2
assert len(documents) == expected_documents_count
@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")

Binary file not shown.

View File

@ -0,0 +1,2 @@
This is a text file, not a real PunktSentenceTokenizer model.
Loading it should not work on sentence tokenizer.