Add support for custom trained PunktTokenizer in PreProcessor (#2783)

* Add support for model folder into BasePreProcessor * First draft of custom model on PreProcessor * Update Documentation & Code Style * Update tests to support custom models * Update Documentation & Code Style * Test for wrong models in custom folder * Default to ISO names on custom model folder Use long names only when needed * Update Documentation & Code Style * Refactoring language names usage * Update fallback logic * Check unpickling error * Updated tests using parametrize Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> * Refactored common logic * Add format control to NLTK load * Tests improvements Add a sample for specialized model * Update Documentation & Code Style * Minor log text update * Log model format exception details * Change pickle protocol version to 4 for 3.7 compat * Removed unnecessary model folder parameter Changed logic comparisons Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> * Update Documentation & Code Style * Removed unused import * Change errors with warnings * Change to absolute path * Rename sentence tokenizer method Co-authored-by: tstadel * Check document content is a string before process * Change to log errors and not warnings * Update Documentation & Code Style * Improve split sentences method Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai> * Update Documentation & Code Style * Empty commit - trigger workflow * Remove superfluous parameters Co-authored-by: tstadel * Explicit None checking Co-authored-by: tstadel Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
2025-12-25 22:18:39 +00:00 · 2022-07-21 04:50:45 -03:00 · 2022-07-21 04:50:45 -03:00 · 3948b997b2
commit 3948b997b2
parent f51587b4ad
7 changed files with 168 additions and 18 deletions
--- a/docs/_src/api/api/preprocessor.md
+++ b/docs/_src/api/api/preprocessor.md
@ -39,7 +39,7 @@ class PreProcessor(BasePreProcessor)
 #### PreProcessor.\_\_init\_\_

 ```python
-def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en", id_hash_keys: Optional[List[str]] = None)
+def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None)
 ```

 **Arguments**:
@ -64,6 +64,7 @@ Set the value to 0 to ensure there is no overlap among the documents after split
 to True, the individual split will always have complete sentences &
 the number of words will be <= split_length.
 - `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+- `tokenizer_model_folder`: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
 - `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
 attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
--- a/haystack/json-schemas/haystack-pipeline-master.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-master.schema.json
@ -3572,6 +3572,18 @@
              "default": true,
              "type": "boolean"
            },
+            "tokenizer_model_folder": {
+              "title": "Tokenizer Model Folder",
+              "anyOf": [
+                {
+                  "type": "string"
+                },
+                {
+                  "type": "string",
+                  "format": "path"
+                }
+              ]
+            },
            "language": {
              "title": "Language",
              "default": "en",
--- a/haystack/nodes/preprocessor/base.py
+++ b/haystack/nodes/preprocessor/base.py
@ -1,6 +1,7 @@
 from typing import List, Optional, Union

 from abc import abstractmethod
+
 from haystack.nodes.base import BaseComponent
 from haystack.schema import Document

--- a/haystack/nodes/preprocessor/preprocessor.py
+++ b/haystack/nodes/preprocessor/preprocessor.py
@ -5,6 +5,8 @@ from functools import partial, reduce
 from itertools import chain
 from typing import List, Optional, Generator, Set, Union
 import warnings
+from pathlib import Path
+from pickle import UnpicklingError

 import nltk
 from more_itertools import windowed
@ -51,6 +53,7 @@ class PreProcessor(BasePreProcessor):
        split_length: int = 200,
        split_overlap: int = 0,
        split_respect_sentence_boundary: bool = True,
+        tokenizer_model_folder: Optional[Union[str, Path]] = None,
        language: str = "en",
        id_hash_keys: Optional[List[str]] = None,
    ):
@ -75,6 +78,7 @@ class PreProcessor(BasePreProcessor):
                                                to True, the individual split will always have complete sentences &
                                                the number of words will be <= split_length.
        :param language: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+        :param tokenizer_model_folder: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
@ -95,7 +99,8 @@ class PreProcessor(BasePreProcessor):
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_respect_sentence_boundary = split_respect_sentence_boundary
-        self.language = iso639_to_nltk.get(language, language)
+        self.language = language
+        self.tokenizer_model_folder = tokenizer_model_folder
        self.print_log: Set[str] = set()
        self.id_hash_keys = id_hash_keys

@ -229,6 +234,11 @@ class PreProcessor(BasePreProcessor):
        # Mainly needed for type checking
        if not isinstance(document, Document):
            raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
+
+        if type(document.content) is not str:
+            logger.error("Document content is not of type str. Nothing to clean.")
+            return document
+
        text = document.content
        if clean_header_footer:
            text = self._find_and_remove_header_footer(
@ -286,11 +296,16 @@ class PreProcessor(BasePreProcessor):
        if split_respect_sentence_boundary and split_by != "word":
            raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with split_by='word'.")

+        if type(document.content) is not str:
+            logger.error("Document content is not of type str. Nothing to split.")
+            return [document]
+
        text = document.content

        if split_respect_sentence_boundary and split_by == "word":
            # split by words ensuring no sub sentence splits
-            sentences = nltk.tokenize.sent_tokenize(text, language=self.language)
+            sentences = self._split_sentences(text)
+
            word_count = 0
            list_splits = []
            current_slice: List[str] = []
@ -334,7 +349,7 @@ class PreProcessor(BasePreProcessor):
            if split_by == "passage":
                elements = text.split("\n\n")
            elif split_by == "sentence":
-                elements = nltk.tokenize.sent_tokenize(text, language=self.language)
+                elements = self._split_sentences(text)
            elif split_by == "word":
                elements = text.split(" ")
            else:
@ -444,3 +459,50 @@ class PreProcessor(BasePreProcessor):
            # no common sequence found
            longest = ""
        return longest if longest.strip() else None
+
+    def _split_sentences(self, text: str) -> List[str]:
+        """
+        Tokenize text into sentences.
+        :param text: str, text to tokenize
+        :return: list[str], list of sentences
+        """
+        sentences = []
+
+        language_name = iso639_to_nltk.get(self.language)
+
+        # Try to load a custom model from 'tokenizer_model_path'
+        if self.tokenizer_model_folder is not None:
+            tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
+            try:
+                sentence_tokenizer = nltk.data.load(f"file:{str(tokenizer_model_path)}", format="pickle")
+                sentences = sentence_tokenizer.tokenize(text)
+            except LookupError:
+                logger.exception(f"PreProcessor couldn't load sentence tokenizer from {str(tokenizer_model_path)}")
+            except (UnpicklingError, ValueError) as e:
+                logger.exception(
+                    f"PreProcessor couldn't determine model format of sentence tokenizer at {str(tokenizer_model_path)}."
+                )
+            if sentences:
+                return sentences
+
+            # NLTK failed to split; fall back to the default model or to English
+            if language_name is not None:
+                logger.error(
+                    f"PreProcessor couldn't find custom sentence tokenizer model for {self.language}. Using default {self.language} model."
+                )
+                return nltk.tokenize.sent_tokenize(text, language=language_name)
+
+            logger.error(
+                f"PreProcessor couldn't find default or custom sentence tokenizer model for {self.language}. Using English instead."
+            )
+            return nltk.tokenize.sent_tokenize(text, language="english")
+
+        # Use a default NLTK model
+        if language_name is not None:
+            return nltk.tokenize.sent_tokenize(text, language=language_name)
+
+        logger.error(
+            f"PreProcessor couldn't find default sentence tokenizer model for {self.language}. Using English instead. "
+            "You may train your own model and use the 'tokenizer_model_folder' parameter."
+        )
+        return nltk.tokenize.sent_tokenize(text, language="english")
--- a/test/nodes/test_preprocessor.py
+++ b/test/nodes/test_preprocessor.py
@ -1,5 +1,6 @@
 import sys
 from pathlib import Path
+import os

 import pytest

@ -9,6 +10,10 @@ from haystack.nodes.preprocessor.preprocessor import PreProcessor

 from ..conftest import SAMPLES_PATH

+
+NLTK_TEST_MODELS = SAMPLES_PATH.absolute() / "preprocessor" / "nltk_models"
+
+
 TEXT = """
 This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in 
 paragraph_1. This is a sample sentence in paragraph_1. This is a sample sentence in paragraph_1.
@ -21,20 +26,90 @@ paragraph_3. This is a sample sentence in paragraph_3. This is to trick the test
 in the sentence. 
 """

+LEGAL_TEXT_PT = """
+A Lei nº 9.514/1997, que instituiu a alienação fiduciária de
+bens imóveis, é norma especial e posterior ao Código de Defesa do
+Consumidor – CDC. Em tais circunstâncias, o inadimplemento do
+devedor fiduciante enseja a aplicação da regra prevista nos arts. 26 e 27
+da lei especial” (REsp 1.871.911/SP, rel. Min. Nancy Andrighi, DJe
+25/8/2020).
+
+A Emenda Constitucional n. 35 alterou substancialmente esse mecanismo,
+ao determinar, na nova redação conferida ao art. 53: “§ 3º Recebida a
+denúncia contra o Senador ou Deputado, por crime ocorrido após a
+diplomação, o Supremo Tribunal Federal dará ciência à Casa respectiva, que,
+por iniciativa de partido político nela representado e pelo voto da maioria de
+seus membros, poderá, até a decisão final, sustar o andamento da ação”.
+Vale ressaltar, contudo, que existem, antes do encaminhamento ao
+Presidente da República, os chamados autógrafos. Os autógrafos ocorrem já
+com o texto definitivamente aprovado pelo Plenário ou pelas comissões,
+quando for o caso. Os autógrafos devem reproduzir com absoluta fidelidade a
+redação final aprovada. O projeto aprovado será encaminhado em autógrafos
+ao Presidente da República. O tema encontra-se regulamentado pelo art. 200
+do RICD e arts. 328 a 331 do RISF.
+"""
+
+
+@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
+def test_preprocess_sentence_split(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results

-def test_preprocess_sentence_split():
    document = Document(content=TEXT)
    preprocessor = PreProcessor(
-        split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
+        split_length=split_length, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
-    assert len(documents) == 15
+    assert len(documents) == expected_documents_count

+
+@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
+def test_preprocess_sentence_split_custom_models_wrong_file_format(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
+    document = Document(content=TEXT)
    preprocessor = PreProcessor(
-        split_length=10, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False
+        split_length=split_length,
+        split_overlap=0,
+        split_by="sentence",
+        split_respect_sentence_boundary=False,
+        tokenizer_model_folder=NLTK_TEST_MODELS / "wrong",
+        language="en",
    )
    documents = preprocessor.process(document)
-    assert len(documents) == 2
+    assert len(documents) == expected_documents_count
+
+
+@pytest.mark.parametrize("split_length_and_results", [(1, 15), (10, 2)])
+def test_preprocess_sentence_split_custom_models_non_default_language(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
+    document = Document(content=TEXT)
+    preprocessor = PreProcessor(
+        split_length=split_length,
+        split_overlap=0,
+        split_by="sentence",
+        split_respect_sentence_boundary=False,
+        language="ca",
+    )
+    documents = preprocessor.process(document)
+    assert len(documents) == expected_documents_count
+
+
+@pytest.mark.parametrize("split_length_and_results", [(1, 8), (8, 1)])
+def test_preprocess_sentence_split_custom_models(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
+    document = Document(content=LEGAL_TEXT_PT)
+    preprocessor = PreProcessor(
+        split_length=split_length,
+        split_overlap=0,
+        split_by="sentence",
+        split_respect_sentence_boundary=False,
+        language="pt",
+        tokenizer_model_folder=NLTK_TEST_MODELS,
+    )
+    documents = preprocessor.process(document)
+    assert len(documents) == expected_documents_count


 def test_preprocess_word_split():
@ -64,19 +139,16 @@ def test_preprocess_word_split():
    assert len(documents) == 15


-def test_preprocess_passage_split():
+@pytest.mark.parametrize("split_length_and_results", [(1, 3), (2, 2)])
+def test_preprocess_passage_split(split_length_and_results):
+    split_length, expected_documents_count = split_length_and_results
+
    document = Document(content=TEXT)
    preprocessor = PreProcessor(
-        split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
+        split_length=split_length, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
    )
    documents = preprocessor.process(document)
-    assert len(documents) == 3
-
-    preprocessor = PreProcessor(
-        split_length=2, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False
-    )
-    documents = preprocessor.process(document)
-    assert len(documents) == 2
+    assert len(documents) == expected_documents_count


@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="FIXME Footer not detected correctly on Windows")
--- a/test/samples/preprocessor/nltk_models/pt.pickle
+++ b/test/samples/preprocessor/nltk_models/pt.pickle
--- a/test/samples/preprocessor/nltk_models/wrong/en.pickle
+++ b/test/samples/preprocessor/nltk_models/wrong/en.pickle
@ -0,0 +1,2 @@
+This is a text file, not a real PunktSentenceTokenizer model.
+Loading it should not work on sentence tokenizer.