Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-12-25 05:58:57 +00:00)
Add support for custom trained PunktTokenizer in PreProcessor (#2783)
* Add support for model folder into BasePreProcessor
* First draft of custom model on PreProcessor
* Update Documentation & Code Style
* Update tests to support custom models
* Update Documentation & Code Style
* Test for wrong models in custom folder
* Default to ISO names on custom model folder. Use long names only when needed
* Update Documentation & Code Style
* Refactoring language names usage
* Update fallback logic
* Check unpickling error
* Updated tests using parametrize (Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>)
* Refactored common logic
* Add format control to NLTK load
* Tests improvements. Add a sample for specialized model
* Update Documentation & Code Style
* Minor log text update
* Log model format exception details
* Change pickle protocol version to 4 for 3.7 compat
* Removed unnecessary model folder parameter. Changed logic comparisons (Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>)
* Update Documentation & Code Style
* Removed unused import
* Change errors with warnings
* Change to absolute path
* Rename sentence tokenizer method (Co-authored-by: tstadel)
* Check document content is a string before process
* Change to log errors and not warnings
* Update Documentation & Code Style
* Improve split sentences method (Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>)
* Update Documentation & Code Style
* Empty commit - trigger workflow
* Remove superfluous parameters (Co-authored-by: tstadel)
* Explicit None checking (Co-authored-by: tstadel)

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in: parent f51587b4ad, commit 3948b997b2
@@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor)
 #### PreProcessor.\_\_init\_\_
 
 ```python
-def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en", id_hash_keys: Optional[List[str]] = None)
+def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None)
 ```
 
 **Arguments**:

@@ -64,6 +64,7 @@ Set the value to 0 to ensure there is no overlap among the documents after split
 to True, the individual split will always have complete sentences &
 the number of words will be <= split_length.
 - `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+- `tokenizer_model_folder`: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
 - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
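With the new parameter in place, a custom PunktSentenceTokenizer is selected purely by the ISO language code: the PreProcessor looks for `<language>.pickle` inside `tokenizer_model_folder`. A minimal usage sketch based on the signature above (the folder path and sample text are illustrative, not taken from this commit):

```python
from haystack.nodes import PreProcessor
from haystack.schema import Document

# The folder is expected to contain "<iso_code>.pickle" models,
# e.g. "pt.pickle" when language="pt". Path is hypothetical.
preprocessor = PreProcessor(
    split_by="sentence",
    split_length=1,
    split_overlap=0,
    split_respect_sentence_boundary=False,
    language="pt",
    tokenizer_model_folder="/path/to/nltk_models",
)

docs = preprocessor.process(Document(content="Primeira frase. Segunda frase."))
```

If the folder does not contain a matching model, processing does not fail: as the implementation further below shows, the node falls back to NLTK's default model for that language, and to English as a last resort.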
@@ -3572,6 +3572,18 @@
           "default": true,
           "type": "boolean"
         },
+        "tokenizer_model_folder": {
+          "title": "Tokenizer Model Folder",
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "string",
+              "format": "path"
+            }
+          ]
+        },
         "language": {
           "title": "Language",
           "default": "en",
@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 
 from abc import abstractmethod
 
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document
@@ -5,6 +5,8 @@ from functools import partial, reduce
 from itertools import chain
 from typing import List, Optional, Generator, Set, Union
 import warnings
+from pathlib import Path
+from pickle import UnpicklingError
 
 import nltk
 from more_itertools import windowed

@@ -51,6 +53,7 @@ class PreProcessor(BasePreProcessor):
         split_length: int = 200,
         split_overlap: int = 0,
         split_respect_sentence_boundary: bool = True,
+        tokenizer_model_folder: Optional[Union[str, Path]] = None,
         language: str = "en",
         id_hash_keys: Optional[List[str]] = None,
     ):

@@ -75,6 +78,7 @@ class PreProcessor(BasePreProcessor):
                                                 to True, the individual split will always have complete sentences &
                                                 the number of words will be <= split_length.
         :param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+        :param tokenizer_model_folder: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
                              attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
                              not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).

@@ -95,7 +99,8 @@ class PreProcessor(BasePreProcessor):
         self.split_length = split_length
         self.split_overlap = split_overlap
         self.split_respect_sentence_boundary = split_respect_sentence_boundary
-        self.language = iso639_to_nltk.get(language, language)
+        self.language = language
+        self.tokenizer_model_folder = tokenizer_model_folder
         self.print_log: Set[str] = set()
         self.id_hash_keys = id_hash_keys
@@ -229,6 +234,11 @@ class PreProcessor(BasePreProcessor):
         # Mainly needed for type checking
         if not isinstance(document, Document):
             raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
 
+        if type(document.content) is not str:
+            logger.error("Document content is not of type str. Nothing to clean.")
+            return document
+
         text = document.content
         if clean_header_footer:
             text = self._find_and_remove_header_footer(

@@ -286,11 +296,16 @@ class PreProcessor(BasePreProcessor):
         if split_respect_sentence_boundary and split_by != "word":
             raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with split_by='word'.")
 
+        if type(document.content) is not str:
+            logger.error("Document content is not of type str. Nothing to split.")
+            return [document]
+
         text = document.content
 
         if split_respect_sentence_boundary and split_by == "word":
             # split by words ensuring no sub sentence splits
-            sentences = nltk.tokenize.sent_tokenize(text, language=self.language)
+            sentences = self._split_sentences(text)
 
             word_count = 0
             list_splits = []
             current_slice: List[str] = []

@@ -334,7 +349,7 @@ class PreProcessor(BasePreProcessor):
         if split_by == "passage":
             elements = text.split("\n\n")
         elif split_by == "sentence":
-            elements = nltk.tokenize.sent_tokenize(text, language=self.language)
+            elements = self._split_sentences(text)
         elif split_by == "word":
             elements = text.split(" ")
         else:
@@ -444,3 +459,50 @@ class PreProcessor(BasePreProcessor):
         # no common sequence found
         longest = ""
         return longest if longest.strip() else None
+
+    def _split_sentences(self, text: str) -> List[str]:
+        """
+        Tokenize text into sentences.
+        :param text: str, text to tokenize
+        :return: list[str], list of sentences
+        """
+        sentences = []
+
+        language_name = iso639_to_nltk.get(self.language)
+
+        # Try to load a custom model from 'tokenizer_model_path'
+        if self.tokenizer_model_folder is not None:
+            tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
+            try:
+                sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle")
+                sentences = sentence_tokenizer.tokenize(text)
+            except LookupError:
+                logger.exception(f"PreProcessor couldn't load sentence tokenizer from {str(tokenizer_model_path)}")
+            except (UnpicklingError, ValueError) as e:
+                logger.exception(
+                    f"PreProcessor couldn't determine model format of sentence tokenizer at {str(tokenizer_model_path)}."
+                )
+            if sentences:
+                return sentences
+
+            # NLTK failed to split, fallback to the default model or to English
+            if language_name is not None:
+                logger.error(
+                    f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. Using default {self.language} model."
+                )
+                return nltk.tokenize.sent_tokenize(text, language=language_name)
+
+            logger.error(
+                f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. Using English instead."
+            )
+            return nltk.tokenize.sent_tokenize(text, language="english")
+
+        # Use a default NLTK model
+        if language_name is not None:
+            return nltk.tokenize.sent_tokenize(text, language=language_name)
+
+        logger.error(
+            f"PreProcessor couldn't find default sentence tokenizer model for {self.language}. Using English instead. "
+            "You may train your own model and use the 'tokenizer_model_folder' parameter."
+        )
+        return nltk.tokenize.sent_tokenize(text, language="english")
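`_split_sentences` expects the custom model to be a pickled `PunktSentenceTokenizer` stored as `<iso_code>.pickle` inside `tokenizer_model_folder`. A sketch of how such a model could be trained and saved with NLTK; the corpus file and output folder names are assumptions, and protocol 4 mirrors the "for 3.7 compat" note in the commit message:

```python
import pickle
from pathlib import Path

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

# Hypothetical training corpus: plain text in the target language.
corpus = Path("corpus_pt.txt").read_text(encoding="utf-8")

trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True  # learn collocations/abbreviations more aggressively
trainer.train(corpus)
tokenizer = PunktSentenceTokenizer(trainer.get_params())

# PreProcessor looks for "<iso_code>.pickle" inside tokenizer_model_folder,
# e.g. "pt.pickle" when language="pt".
model_folder = Path("nltk_models")
model_folder.mkdir(parents=True, exist_ok=True)
with open(model_folder / "pt.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=4)
```

The `nltk.data.load(..., format="pickle")` call in the method above simply unpickles this file, so the saved object must be the tokenizer itself (anything that unpickles cleanly and exposes `tokenize()` will work).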
@@ -1,5 +1,6 @@
 import sys
+from pathlib import Path
 import os
 
 import pytest
 
@@ -9,6 +10,10 @@ from haystack.nodes.preprocessor.preprocessor import PreProcessor
 
 from ..conftest import SAMPLES_PATH
 
+
+NLTK_TEST_MODELS = SAMPLES_PATH.absolute() / "preprocessor" / "nltk_models"
+
+
 TEXT = """
 This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in
 paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.
@@ -21,20 +26,90 @@ paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test
 in the sentence.
 """
 
+LEGAL_TEXT_PT = """
+A Lei nº 9.514/1997, que instituiu a alienação fiduciária de
+bens imóveis, é norma especial e posterior ao Código de Defesa do
+Consumidor – CDC. Em tais circunstâncias, o inadimplemento do
+devedor fiduciante enseja a aplicação da regra prevista nos arts. 26 e 27
+da lei especial” (REsp 1.871.911/SP, rel. Min. Nancy Andrighi, DJe
+25/8/2020).
+
+A Emenda Constitucional n. 35 alterou substancialmente esse mecanismo,
+ao determinar, na nova redação conferida ao art. 53: “§ 3º Recebida a
+denúncia contra o Senador ou Deputado, por crime ocorrido após a
+diplomação, o Supremo Tribunal Federal dará ciência à Casa respectiva, que,
+por iniciativa de partido político nela representado e pelo voto da maioria de
+seus membros, poderá, até a decisão final, sustar o andamento da ação”.
+Vale ressaltar, contudo, que existem, antes do encaminhamento ao
+Presidente da República, os chamados autógrafos. Os autógrafos ocorrem já
+com o texto definitivamente aprovado pelo Plenário ou pelas comissões,
+quando for o caso. Os autógrafos devem reproduzir com absoluta fidelidade a
+redação final aprovada. O projeto aprovado será encaminhado em autógrafos
+ao Presidente da República. O tema encontra-se regulamentado pelo art. 200
+do RICD e arts. 328 a 331 do RISF.
+"""
+
+
-def test_preprocess_sentence_split():
+@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
+def test_preprocess_sentence_split(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
     document = Document(content=TEXT)
     preprocessor = PreProcessor(
-        split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
+        split_length=split_length, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
     )
     documents = preprocessor.process(document)
-    assert len(documents) == 15
+    assert len(documents) == expected_documents_count
 
+
+@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
+def test_preprocess_sentence_split_custom_models_wrong_file_format(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
+    document = Document(content=TEXT)
     preprocessor = PreProcessor(
-        split_length=10, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
+        split_length=split_length,
+        split_overlap=0,
+        split_by="sentence",
+        split_respect_sentence_boundary=False,
+        tokenizer_model_folder=NLTK_TEST_MODELS / "wrong",
+        language="en",
     )
     documents = preprocessor.process(document)
-    assert len(documents) == 2
+    assert len(documents) == expected_documents_count
+
+
+@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
+def test_preprocess_sentence_split_custom_models_non_default_language(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
+    document = Document(content=TEXT)
+    preprocessor = PreProcessor(
+        split_length=split_length,
+        split_overlap=0,
+        split_by="sentence",
+        split_respect_sentence_boundary=False,
+        language="ca",
+    )
+    documents = preprocessor.process(document)
+    assert len(documents) == expected_documents_count
+
+
+@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
+def test_preprocess_sentence_split_custom_models(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
+    document = Document(content=LEGAL_TEXT_PT)
+    preprocessor = PreProcessor(
+        split_length=split_length,
+        split_overlap=0,
+        split_by="sentence",
+        split_respect_sentence_boundary=False,
+        language="pt",
+        tokenizer_model_folder=NLTK_TEST_MODELS,
+    )
+    documents = preprocessor.process(document)
+    assert len(documents) == expected_documents_count
 
 
 def test_preprocess_word_split():
@@ -64,19 +139,16 @@ def test_preprocess_word_split():
     assert len(documents) == 15
 
 
-def test_preprocess_passage_split():
+@pytest.mark.parametrize("split_length_and_results", [(1, 3), (2, 2)])
+def test_preprocess_passage_split(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
     document = Document(content=TEXT)
     preprocessor = PreProcessor(
-        split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
+        split_length=split_length, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
     )
     documents = preprocessor.process(document)
-    assert len(documents) == 3
-
-    preprocessor = PreProcessor(
-        split_length=2, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
-    )
-    documents = preprocessor.process(document)
-    assert len(documents) == 2
+    assert len(documents) == expected_documents_count
 
 
 @pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")
BIN  test/samples/preprocessor/nltk_models/pt.pickle (new file; binary file not shown)
2    test/samples/preprocessor/nltk_models/wrong/en.pickle (new file)
@@ -0,0 +1,2 @@
+This is a text file, not a real PunktSentenceTokenizer model.
+Loading it should not work on sentence tokenizer.
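The `wrong/en.pickle` fixture above exists to exercise the unpickling error path. A hypothetical helper (not part of this commit) for sanity-checking a model file with the same load call the PreProcessor uses internally:

```python
from pathlib import Path

import nltk


def is_valid_punkt_model(model_path: Path) -> bool:
    """Return True if the file unpickles into an object that can tokenize text."""
    try:
        tokenizer = nltk.data.load(f"file:{model_path.absolute()}", format="pickle")
    except Exception:  # e.g. UnpicklingError or ValueError for non-pickle files
        return False
    return hasattr(tokenizer, "tokenize")


# Against the fixtures above (paths relative to the repository root):
# is_valid_punkt_model(Path("test/samples/preprocessor/nltk_models/pt.pickle"))       -> True
# is_valid_punkt_model(Path("test/samples/preprocessor/nltk_models/wrong/en.pickle")) -> False
```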