fix: Move potential nltk download to warm_up (#8646)

* Move potential nltk download to warm_up

* Update tests

* Add release notes

* Fix tests

* Uncomment

* Make mypy happy

* Add RuntimeError message

* Update release notes

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Sebastian Husch Lee 2024-12-20 10:41:44 +01:00 committed by GitHub
parent f4d9c2bb91
commit 286061f005
5 changed files with 90 additions and 17 deletions


@@ -107,15 +107,10 @@ class DocumentSplitter:
             splitting_function=splitting_function,
             respect_sentence_boundary=respect_sentence_boundary,
         )
-        if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"):
+        self._use_sentence_splitter = split_by == "sentence" or (respect_sentence_boundary and split_by == "word")
+        if self._use_sentence_splitter:
             nltk_imports.check()
-            self.sentence_splitter = SentenceSplitter(
-                language=language,
-                use_split_rules=use_split_rules,
-                extend_abbreviations=extend_abbreviations,
-                keep_white_spaces=True,
-            )
+            self.sentence_splitter = None

         if split_by == "sentence":
             # ToDo: remove this warning in the next major release

@@ -164,6 +159,18 @@ class DocumentSplitter:
             )
             self.respect_sentence_boundary = False

+    def warm_up(self):
+        """
+        Warm up the DocumentSplitter by loading the sentence tokenizer.
+        """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """

@@ -182,6 +189,10 @@ class DocumentSplitter:
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")


@@ -77,16 +77,23 @@ class NLTKDocumentSplitter(DocumentSplitter):
         self.respect_sentence_boundary = respect_sentence_boundary
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
-        self.sentence_splitter = SentenceSplitter(
-            language=language,
-            use_split_rules=use_split_rules,
-            extend_abbreviations=extend_abbreviations,
-            keep_white_spaces=True,
-        )
+        self.sentence_splitter = None
         self.language = language

+    def warm_up(self):
+        """
+        Warm up the NLTKDocumentSplitter by loading the sentence tokenizer.
+        """
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     def _split_into_units(
-        self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
+        self, text: str, split_by: Literal["function", "page", "passage", "period", "sentence", "word", "line"]
     ) -> List[str]:
         """
         Splits the text into units based on the specified split_by parameter.

@@ -106,6 +113,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
             # whitespace is preserved while splitting text into sentences when using keep_white_spaces=True
             # so split_at is set to an empty string
             self.split_at = ""
+            assert self.sentence_splitter is not None
             result = self.sentence_splitter.split_sentences(text)
             units = [sentence["sentence"] for sentence in result]
         elif split_by == "word":

@@ -142,6 +150,11 @@ class NLTKDocumentSplitter(DocumentSplitter):
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component NLTKDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")

@@ -221,8 +234,9 @@ class NLTKDocumentSplitter(DocumentSplitter):
                 break
         return num_sentences_to_keep

+    @staticmethod
     def _concatenate_sentences_based_on_word_amount(
-        self, sentences: List[str], split_length: int, split_overlap: int
+        sentences: List[str], split_length: int, split_overlap: int
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.

@@ -258,7 +272,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
                 split_start_indices.append(chunk_start_idx)

                 # Get the number of sentences that overlap with the next chunk
-                num_sentences_to_keep = self._number_of_sentences_to_keep(
+                num_sentences_to_keep = NLTKDocumentSplitter._number_of_sentences_to_keep(
                     sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
                 )
                 # Set up information for the new chunk
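NLTKDocumentSplitter gets the same lazy initialization: the assert before split_sentences() narrows the Optional type for mypy, and _concatenate_sentences_based_on_word_amount becomes a @staticmethod, so its nested call to _number_of_sentences_to_keep switches from self to the class name. Inside a pipeline the extra warm-up step is handled for you rather than by user code; a hedged sketch (the component name "splitter" and the sample document are illustrative):

    from haystack import Document, Pipeline
    from haystack.components.preprocessors import DocumentSplitter

    pipeline = Pipeline()
    pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
    # No explicit warm_up() here: Pipeline.run() is expected to warm up
    # components that define warm_up() before executing them.
    result = pipeline.run({"splitter": {"documents": [Document(content="One. Two. Three.")]}})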


@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Moved the NLTK download in DocumentSplitter and NLTKDocumentSplitter to warm_up(). This prevents a call to an external API during instantiation. If a DocumentSplitter or NLTKDocumentSplitter is used for sentence splitting outside of a pipeline, warm_up() now needs to be called before running the component.
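The release-note contract applies to both components; for NLTKDocumentSplitter a minimal sketch looks the same (the import path is an assumption based on the component living alongside DocumentSplitter in haystack.components.preprocessors):

    from haystack import Document
    from haystack.components.preprocessors import NLTKDocumentSplitter

    splitter = NLTKDocumentSplitter(split_by="sentence", split_length=1)
    splitter.warm_up()  # loads the tokenizer; may trigger the NLTK download
    docs = splitter.run(documents=[Document(content="Sentence one. Sentence two.")])["documents"]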


@@ -44,16 +44,19 @@ class TestSplittingByFunctionOrCharacterRegex:
             ValueError, match="DocumentSplitter only works with text documents but content for document ID"
         ):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=[Document()])
             assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

     def test_single_doc(self):
         with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=Document())

     def test_empty_list(self):
         splitter = DocumentSplitter()
+        splitter.warm_up()
         res = splitter.run(documents=[])
         assert res == {"documents": []}

@@ -76,6 +79,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -88,6 +92,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word_with_threshold(self):
         splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
+        splitter.warm_up()
         result = splitter.run(
             documents=[
                 Document(

@@ -105,6 +110,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
         text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
         docs = result["documents"]
         assert len(docs) == 5

@@ -132,6 +138,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_period(self):
         splitter = DocumentSplitter(split_by="period", split_length=1)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -148,6 +155,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_passage(self):
         splitter = DocumentSplitter(split_by="passage", split_length=1)
         text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -164,6 +172,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_page(self):
         splitter = DocumentSplitter(split_by="page", split_length=1)
         text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -183,6 +192,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_function(self):
         splitting_function = lambda s: s.split(".")
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
+        splitter.warm_up()
         text = "This.Is.A.Test"
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]

@@ -200,6 +210,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitting_function = lambda s: re.split(r"[\s]{2,}", s)
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
         text = "This Is\n A Test"
+        splitter.warm_up()
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]
         assert len(docs) == 4

@@ -215,6 +226,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word_with_overlap(self):
         splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -234,6 +246,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_line(self):
         splitter = DocumentSplitter(split_by="line", split_length=1)
         text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]

@@ -252,6 +265,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         doc1 = Document(content="This is a text with some words.")
         doc2 = Document(content="This is a different text with some words.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
         assert result["documents"][0].meta["source_id"] == doc1.id
         assert result["documents"][1].meta["source_id"] == doc2.id

@@ -262,6 +276,7 @@ class TestSplittingByFunctionOrCharacterRegex:
             Document(content="Text.", meta={"name": "doc 0"}),
             Document(content="Text.", meta={"name": "doc 1"}),
         ]
+        splitter.warm_up()
         result = splitter.run(documents=documents)
         assert len(result["documents"]) == 2
         assert result["documents"][0].id != result["documents"][1].id

@@ -273,6 +288,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=2)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]

@@ -283,6 +299,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=1)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 1]

@@ -294,6 +311,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2, 2]

@@ -305,6 +323,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 3]
         for doc, p in zip(result["documents"], expected_pages):

@@ -314,6 +333,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 3]

@@ -324,6 +344,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
         doc1 = Document(content="This is some text. And\f this text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]

@@ -334,6 +355,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1)
         doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
         doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 1, 1]

@@ -345,6 +367,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2]

@@ -356,6 +379,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 3]

@@ -366,6 +390,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
         text = "This is a text with some words. There is a second sentence. And a third sentence."
         doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
+        splitter.warm_up()
         docs = splitter.run(documents=[doc])["documents"]

         # check split_overlap is added to all the documents

@@ -487,6 +512,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content="")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"] == []

@@ -496,6 +522,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content=" ")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"][0].content == " "

@@ -543,6 +570,7 @@ class TestSplittingNLTKSentenceSplitter:
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
             "The moon was full."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

         assert len(documents) == 2

@@ -568,6 +596,7 @@ class TestSplittingNLTKSentenceSplitter:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 4

@@ -601,6 +630,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -633,6 +663,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -660,6 +691,7 @@ class TestSplittingNLTKSentenceSplitter:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"

@@ -692,6 +724,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "

@@ -717,6 +750,7 @@ class TestSplittingNLTKSentenceSplitter:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"


@@ -42,6 +42,7 @@ class TestNLTKDocumentSplitterSplitIntoUnits:
         document_splitter = NLTKDocumentSplitter(
             split_by="sentence", split_length=2, split_overlap=0, split_threshold=0, language="en"
         )
+        document_splitter.warm_up()
         text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night."

         units = document_splitter._split_into_units(text=text, split_by="sentence")

@@ -121,11 +122,13 @@ class TestNLTKDocumentSplitterRun:
     def test_run_type_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(TypeError):
+            document_splitter.warm_up()
             document_splitter.run(documents=Document(content="Moonlight shimmered softly."))  # type: ignore

     def test_run_value_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(ValueError):
+            document_splitter.warm_up()
             document_splitter.run(documents=[Document(content=None)])

     def test_run_split_by_sentence_1(self) -> None:

@@ -138,6 +141,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "

@@ -168,6 +172,7 @@ class TestNLTKDocumentSplitterRun:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 4

@@ -201,6 +206,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -233,6 +239,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -262,6 +269,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"

@@ -294,6 +302,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "

@@ -319,6 +328,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"