fix: Move potential nltk download to warm_up (#8646)

* Move potential nltk download to warm_up

* Update tests

* Add release notes

* Fix tests

* Uncomment

* Make mypy happy

* Add RuntimeError message

* Update release notes

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Sebastian Husch Lee 2024-12-20 10:41:44 +01:00 committed by GitHub
parent f4d9c2bb91
commit 286061f005
5 changed files with 90 additions and 17 deletions


@@ -107,15 +107,10 @@ class DocumentSplitter:
             splitting_function=splitting_function,
             respect_sentence_boundary=respect_sentence_boundary,
         )
-        if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"):
+        self._use_sentence_splitter = split_by == "sentence" or (respect_sentence_boundary and split_by == "word")
+        if self._use_sentence_splitter:
             nltk_imports.check()
-            self.sentence_splitter = SentenceSplitter(
-                language=language,
-                use_split_rules=use_split_rules,
-                extend_abbreviations=extend_abbreviations,
-                keep_white_spaces=True,
-            )
+            self.sentence_splitter = None

         if split_by == "sentence":
             # ToDo: remove this warning in the next major release

@@ -164,6 +159,18 @@ class DocumentSplitter:
             )
             self.respect_sentence_boundary = False

+    def warm_up(self):
+        """
+        Warm up the DocumentSplitter by loading the sentence tokenizer.
+        """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """

@@ -182,6 +189,10 @@ class DocumentSplitter:
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")


@@ -77,16 +77,23 @@ class NLTKDocumentSplitter(DocumentSplitter):
         self.respect_sentence_boundary = respect_sentence_boundary
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
-        self.sentence_splitter = SentenceSplitter(
-            language=language,
-            use_split_rules=use_split_rules,
-            extend_abbreviations=extend_abbreviations,
-            keep_white_spaces=True,
-        )
+        self.sentence_splitter = None
         self.language = language

+    def warm_up(self):
+        """
+        Warm up the NLTKDocumentSplitter by loading the sentence tokenizer.
+        """
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     def _split_into_units(
-        self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
+        self, text: str, split_by: Literal["function", "page", "passage", "period", "sentence", "word", "line"]
     ) -> List[str]:
         """
         Splits the text into units based on the specified split_by parameter.

@@ -106,6 +113,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
             # whitespace is preserved while splitting text into sentences when using keep_white_spaces=True
             # so split_at is set to an empty string
             self.split_at = ""
+            assert self.sentence_splitter is not None
             result = self.sentence_splitter.split_sentences(text)
             units = [sentence["sentence"] for sentence in result]
         elif split_by == "word":

@@ -142,6 +150,11 @@ class NLTKDocumentSplitter(DocumentSplitter):
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component NLTKDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")

@@ -221,8 +234,9 @@ class NLTKDocumentSplitter(DocumentSplitter):
                 break
         return num_sentences_to_keep

+    @staticmethod
     def _concatenate_sentences_based_on_word_amount(
-        self, sentences: List[str], split_length: int, split_overlap: int
+        sentences: List[str], split_length: int, split_overlap: int
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.

@@ -258,7 +272,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
             split_start_indices.append(chunk_start_idx)

             # Get the number of sentences that overlap with the next chunk
-            num_sentences_to_keep = self._number_of_sentences_to_keep(
+            num_sentences_to_keep = NLTKDocumentSplitter._number_of_sentences_to_keep(
                 sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
             )

             # Set up information for the new chunk
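
For pipeline users nothing changes, because Pipeline.run() warms up its components before executing them; the NLTK download simply moves from component construction to the first pipeline run. A hedged sketch, assuming the NLTKDocumentSplitter export under haystack.components.preprocessors:

    from haystack import Document, Pipeline
    from haystack.components.preprocessors import NLTKDocumentSplitter

    # No explicit warm_up() needed here: the pipeline warms up its components
    # itself, so the tokenizer is loaded on the first run, not at construction.
    pipeline = Pipeline()
    pipeline.add_component("splitter", NLTKDocumentSplitter(split_by="sentence", split_length=2))
    result = pipeline.run(data={"splitter": {"documents": [Document(content="One. Two. Three.")]}})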


@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Moved the NLTK download in DocumentSplitter and NLTKDocumentSplitter to warm_up(). This prevents a call to an external API during instantiation. If a DocumentSplitter or NLTKDocumentSplitter is used for sentence splitting outside of a pipeline, warm_up() now needs to be called before running the component.
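
Concretely, forgetting the warm-up outside a pipeline now fails fast with the RuntimeError added above instead of silently reaching for the network at construction time — for example (illustrative document content):

    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    splitter = DocumentSplitter(split_by="sentence")
    splitter.run(documents=[Document(content="One. Two.")])
    # RuntimeError: The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'.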


@@ -44,16 +44,19 @@ class TestSplittingByFunctionOrCharacterRegex:
             ValueError, match="DocumentSplitter only works with text documents but content for document ID"
         ):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=[Document()])
         assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

     def test_single_doc(self):
         with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=Document())

     def test_empty_list(self):
         splitter = DocumentSplitter()
+        splitter.warm_up()
         res = splitter.run(documents=[])
         assert res == {"documents": []}

@@ -76,6 +79,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -88,6 +92,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word_with_threshold(self):
         splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
+        splitter.warm_up()
         result = splitter.run(
             documents=[
                 Document(

@@ -105,6 +110,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
         text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
         docs = result["documents"]
         assert len(docs) == 5

@@ -132,6 +138,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_period(self):
         splitter = DocumentSplitter(split_by="period", split_length=1)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -148,6 +155,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_passage(self):
         splitter = DocumentSplitter(split_by="passage", split_length=1)
         text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -164,6 +172,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_page(self):
         splitter = DocumentSplitter(split_by="page", split_length=1)
         text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -183,6 +192,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_function(self):
         splitting_function = lambda s: s.split(".")
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
+        splitter.warm_up()
         text = "This.Is.A.Test"
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]

@@ -200,6 +210,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitting_function = lambda s: re.split(r"[\s]{2,}", s)
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
         text = "This Is\n A Test"
+        splitter.warm_up()
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]
         assert len(docs) == 4

@@ -215,6 +226,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word_with_overlap(self):
         splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -234,6 +246,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_line(self):
         splitter = DocumentSplitter(split_by="line", split_length=1)
         text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]

@@ -252,6 +265,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         doc1 = Document(content="This is a text with some words.")
         doc2 = Document(content="This is a different text with some words.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
         assert result["documents"][0].meta["source_id"] == doc1.id
         assert result["documents"][1].meta["source_id"] == doc2.id

@@ -262,6 +276,7 @@ class TestSplittingByFunctionOrCharacterRegex:
             Document(content="Text.", meta={"name": "doc 0"}),
             Document(content="Text.", meta={"name": "doc 1"}),
         ]
+        splitter.warm_up()
         result = splitter.run(documents=documents)
         assert len(result["documents"]) == 2
         assert result["documents"][0].id != result["documents"][1].id

@@ -273,6 +288,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=2)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]

@@ -283,6 +299,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=1)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 1]

@@ -294,6 +311,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2, 2]

@@ -305,6 +323,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 3]
         for doc, p in zip(result["documents"], expected_pages):

@@ -314,6 +333,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 3]

@@ -324,6 +344,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
         doc1 = Document(content="This is some text. And\f this text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]

@@ -334,6 +355,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1)
         doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
         doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 1, 1]

@@ -345,6 +367,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2]

@@ -356,6 +379,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 3]

@@ -366,6 +390,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
         text = "This is a text with some words. There is a second sentence. And a third sentence."
         doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
+        splitter.warm_up()
         docs = splitter.run(documents=[doc])["documents"]

         # check split_overlap is added to all the documents

@@ -487,6 +512,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content="")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"] == []

@@ -496,6 +522,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content=" ")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"][0].content == " "

@@ -543,6 +570,7 @@ class TestSplittingNLTKSentenceSplitter:
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
             "The moon was full."
         )
+        document_splitter.warm_up()
        documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 2

@@ -568,6 +596,7 @@ class TestSplittingNLTKSentenceSplitter:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 4

@@ -601,6 +630,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -633,6 +663,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -660,6 +691,7 @@ class TestSplittingNLTKSentenceSplitter:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"

@@ -692,6 +724,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "

@@ -717,6 +750,7 @@ class TestSplittingNLTKSentenceSplitter:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"


@@ -42,6 +42,7 @@ class TestNLTKDocumentSplitterSplitIntoUnits:
         document_splitter = NLTKDocumentSplitter(
             split_by="sentence", split_length=2, split_overlap=0, split_threshold=0, language="en"
         )
+        document_splitter.warm_up()
         text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night."
         units = document_splitter._split_into_units(text=text, split_by="sentence")

@@ -121,11 +122,13 @@ class TestNLTKDocumentSplitterRun:
     def test_run_type_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(TypeError):
+            document_splitter.warm_up()
             document_splitter.run(documents=Document(content="Moonlight shimmered softly."))  # type: ignore

     def test_run_value_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(ValueError):
+            document_splitter.warm_up()
             document_splitter.run(documents=[Document(content=None)])

     def test_run_split_by_sentence_1(self) -> None:

@@ -138,6 +141,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "

@@ -168,6 +172,7 @@ class TestNLTKDocumentSplitterRun:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 4

@@ -201,6 +206,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -233,6 +239,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -262,6 +269,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"

@@ -294,6 +302,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "

@@ -319,6 +328,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"