fix: Move potential nltk download to warm_up (#8646)

* Move potential nltk download to warm_up

* Update tests

* Add release notes

* Fix tests

* Uncomment

* Make mypy happy

* Add RuntimeError message

* Update release notes

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Sebastian Husch Lee 2024-12-20 10:41:44 +01:00 committed by GitHub
parent f4d9c2bb91
commit 286061f005
5 changed files with 90 additions and 17 deletions


@@ -107,15 +107,10 @@ class DocumentSplitter:
             splitting_function=splitting_function,
             respect_sentence_boundary=respect_sentence_boundary,
         )
-        if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"):
+        self._use_sentence_splitter = split_by == "sentence" or (respect_sentence_boundary and split_by == "word")
+        if self._use_sentence_splitter:
             nltk_imports.check()
-            self.sentence_splitter = SentenceSplitter(
-                language=language,
-                use_split_rules=use_split_rules,
-                extend_abbreviations=extend_abbreviations,
-                keep_white_spaces=True,
-            )
+            self.sentence_splitter = None

         if split_by == "sentence":
             # ToDo: remove this warning in the next major release

@@ -164,6 +159,18 @@ class DocumentSplitter:
             )
             self.respect_sentence_boundary = False

+    def warm_up(self):
+        """
+        Warm up the DocumentSplitter by loading the sentence tokenizer.
+        """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """

@@ -182,6 +189,10 @@ class DocumentSplitter:
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")


@@ -77,16 +77,23 @@ class NLTKDocumentSplitter(DocumentSplitter):
         self.respect_sentence_boundary = respect_sentence_boundary
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
-        self.sentence_splitter = SentenceSplitter(
-            language=language,
-            use_split_rules=use_split_rules,
-            extend_abbreviations=extend_abbreviations,
-            keep_white_spaces=True,
-        )
+        self.sentence_splitter = None
         self.language = language

+    def warm_up(self):
+        """
+        Warm up the NLTKDocumentSplitter by loading the sentence tokenizer.
+        """
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     def _split_into_units(
-        self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
+        self, text: str, split_by: Literal["function", "page", "passage", "period", "sentence", "word", "line"]
     ) -> List[str]:
         """
         Splits the text into units based on the specified split_by parameter.

@@ -106,6 +113,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
             # whitespace is preserved while splitting text into sentences when using keep_white_spaces=True
             # so split_at is set to an empty string
             self.split_at = ""
+            assert self.sentence_splitter is not None
             result = self.sentence_splitter.split_sentences(text)
             units = [sentence["sentence"] for sentence in result]
         elif split_by == "word":

@@ -142,6 +150,11 @@ class NLTKDocumentSplitter(DocumentSplitter):
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component NLTKDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")

@@ -221,8 +234,9 @@ class NLTKDocumentSplitter(DocumentSplitter):
                 break
         return num_sentences_to_keep

+    @staticmethod
     def _concatenate_sentences_based_on_word_amount(
-        self, sentences: List[str], split_length: int, split_overlap: int
+        sentences: List[str], split_length: int, split_overlap: int
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.

@@ -258,7 +272,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
             split_start_indices.append(chunk_start_idx)

             # Get the number of sentences that overlap with the next chunk
-            num_sentences_to_keep = self._number_of_sentences_to_keep(
+            num_sentences_to_keep = NLTKDocumentSplitter._number_of_sentences_to_keep(
                 sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
             )

             # Set up information for the new chunk
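
For pipeline users nothing changes, because Pipeline.run() warms up its components before executing them; the NLTK download simply moves from component construction to the first pipeline run. A hedged sketch, assuming the NLTKDocumentSplitter export under haystack.components.preprocessors:

    from haystack import Document, Pipeline
    from haystack.components.preprocessors import NLTKDocumentSplitter

    # No explicit warm_up() needed here: the pipeline warms up its components
    # itself, so the tokenizer is loaded on the first run, not at construction.
    pipeline = Pipeline()
    pipeline.add_component("splitter", NLTKDocumentSplitter(split_by="sentence", split_length=2))
    result = pipeline.run(data={"splitter": {"documents": [Document(content="One. Two. Three.")]}})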


@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Moved the NLTK download in DocumentSplitter and NLTKDocumentSplitter to warm_up(). This prevents a call to an external API during instantiation. If a DocumentSplitter or NLTKDocumentSplitter is used for sentence splitting outside of a pipeline, warm_up() now needs to be called before running the component.
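
Concretely, forgetting the warm-up outside a pipeline now fails fast with the RuntimeError added above instead of silently reaching for the network at construction time — for example (illustrative document content):

    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    splitter = DocumentSplitter(split_by="sentence")
    splitter.run(documents=[Document(content="One. Two.")])
    # RuntimeError: The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'.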


@@ -44,16 +44,19 @@ class TestSplittingByFunctionOrCharacterRegex:
             ValueError, match="DocumentSplitter only works with text documents but content for document ID"
         ):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=[Document()])
         assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

     def test_single_doc(self):
         with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=Document())

     def test_empty_list(self):
         splitter = DocumentSplitter()
+        splitter.warm_up()
         res = splitter.run(documents=[])
         assert res == {"documents": []}

@@ -76,6 +79,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -88,6 +92,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word_with_threshold(self):
         splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
+        splitter.warm_up()
         result = splitter.run(
             documents=[
                 Document(

@@ -105,6 +110,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
         text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
         docs = result["documents"]
         assert len(docs) == 5

@@ -132,6 +138,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_period(self):
         splitter = DocumentSplitter(split_by="period", split_length=1)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -148,6 +155,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_passage(self):
         splitter = DocumentSplitter(split_by="passage", split_length=1)
         text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -164,6 +172,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_page(self):
         splitter = DocumentSplitter(split_by="page", split_length=1)
         text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3

@@ -183,6 +192,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_function(self):
         splitting_function = lambda s: s.split(".")
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
+        splitter.warm_up()
         text = "This.Is.A.Test"
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]

@@ -200,6 +210,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitting_function = lambda s: re.split(r"[\s]{2,}", s)
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
         text = "This Is\n A Test"
+        splitter.warm_up()
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]
         assert len(docs) == 4

@@ -215,6 +226,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word_with_overlap(self):
         splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2

@@ -234,6 +246,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_line(self):
         splitter = DocumentSplitter(split_by="line", split_length=1)
         text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]

@@ -252,6 +265,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         doc1 = Document(content="This is a text with some words.")
         doc2 = Document(content="This is a different text with some words.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
         assert result["documents"][0].meta["source_id"] == doc1.id
         assert result["documents"][1].meta["source_id"] == doc2.id

@@ -262,6 +276,7 @@ class TestSplittingByFunctionOrCharacterRegex:
             Document(content="Text.", meta={"name": "doc 0"}),
             Document(content="Text.", meta={"name": "doc 1"}),
         ]
+        splitter.warm_up()
         result = splitter.run(documents=documents)
         assert len(result["documents"]) == 2
         assert result["documents"][0].id != result["documents"][1].id

@@ -273,6 +288,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=2)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]

@@ -283,6 +299,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=1)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 1]

@@ -294,6 +311,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2, 2]

@@ -305,6 +323,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 3]
         for doc, p in zip(result["documents"], expected_pages):

@@ -314,6 +333,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 3]

@@ -324,6 +344,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
         doc1 = Document(content="This is some text. And\f this text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]

@@ -334,6 +355,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1)
         doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
         doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])

         expected_pages = [1, 1, 1, 2, 1, 1]

@@ -345,6 +367,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 2]

@@ -356,6 +379,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])

         expected_pages = [1, 2, 3]

@@ -366,6 +390,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
         text = "This is a text with some words. There is a second sentence. And a third sentence."
         doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
+        splitter.warm_up()
         docs = splitter.run(documents=[doc])["documents"]

         # check split_overlap is added to all the documents

@@ -487,6 +512,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content="")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"] == []

@@ -496,6 +522,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content=" ")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"][0].content == " "

@@ -543,6 +570,7 @@ class TestSplittingNLTKSentenceSplitter:
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
             "The moon was full."
         )
+        document_splitter.warm_up()
        documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 2

@@ -568,6 +596,7 @@ class TestSplittingNLTKSentenceSplitter:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 4

@@ -601,6 +630,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -633,6 +663,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -660,6 +691,7 @@ class TestSplittingNLTKSentenceSplitter:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"

@@ -692,6 +724,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "

@@ -717,6 +750,7 @@ class TestSplittingNLTKSentenceSplitter:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"


@@ -42,6 +42,7 @@ class TestNLTKDocumentSplitterSplitIntoUnits:
         document_splitter = NLTKDocumentSplitter(
             split_by="sentence", split_length=2, split_overlap=0, split_threshold=0, language="en"
         )
+        document_splitter.warm_up()
         text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night."
         units = document_splitter._split_into_units(text=text, split_by="sentence")

@@ -121,11 +122,13 @@ class TestNLTKDocumentSplitterRun:
     def test_run_type_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(TypeError):
+            document_splitter.warm_up()
             document_splitter.run(documents=Document(content="Moonlight shimmered softly."))  # type: ignore

     def test_run_value_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(ValueError):
+            document_splitter.warm_up()
             document_splitter.run(documents=[Document(content=None)])

     def test_run_split_by_sentence_1(self) -> None:

@@ -138,6 +141,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "

@@ -168,6 +172,7 @@ class TestNLTKDocumentSplitterRun:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
         assert len(documents) == 4

@@ -201,6 +206,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -233,6 +239,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]

@@ -262,6 +269,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"

@@ -294,6 +302,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "

@@ -319,6 +328,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"