fix: Move potential nltk download to warm_up (#8646)
* Move potential nltk download to warm_up
* Update tests
* Add release notes
* Fix tests
* Uncomment
* Make mypy happy
* Add RuntimeError message
* Update release notes

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Parent: f4d9c2bb91
Commit: 286061f005
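
In short: the sentence tokenizer (and any NLTK data download it triggers) is no longer set up in __init__ but in warm_up(), and run() now fails fast with a RuntimeError if the component needs sentence splitting and was never warmed up. A minimal sketch of the new contract, based on the diff below (Haystack 2.x import paths assumed):

    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    # Instantiation no longer reaches out to an external API.
    splitter = DocumentSplitter(split_by="sentence", split_length=1)

    try:
        # Running before warm_up() now raises instead of using a missing tokenizer.
        splitter.run(documents=[Document(content="One sentence. Another one.")])
    except RuntimeError as err:
        print(err)  # The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'.

    splitter.warm_up()  # loads (and, if needed, downloads) the NLTK sentence tokenizer
    result = splitter.run(documents=[Document(content="One sentence. Another one.")])
    print([doc.content for doc in result["documents"]])
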
@@ -107,15 +107,10 @@ class DocumentSplitter:
             splitting_function=splitting_function,
             respect_sentence_boundary=respect_sentence_boundary,
         )
 
-        if split_by == "sentence" or (respect_sentence_boundary and split_by == "word"):
+        self._use_sentence_splitter = split_by == "sentence" or (respect_sentence_boundary and split_by == "word")
+        if self._use_sentence_splitter:
             nltk_imports.check()
-            self.sentence_splitter = SentenceSplitter(
-                language=language,
-                use_split_rules=use_split_rules,
-                extend_abbreviations=extend_abbreviations,
-                keep_white_spaces=True,
-            )
+            self.sentence_splitter = None
 
         if split_by == "sentence":
             # ToDo: remove this warning in the next major release
@@ -164,6 +159,18 @@ class DocumentSplitter:
             )
             self.respect_sentence_boundary = False
 
+    def warm_up(self):
+        """
+        Warm up the DocumentSplitter by loading the sentence tokenizer.
+        """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     @component.output_types(documents=List[Document])
     def run(self, documents: List[Document]):
         """
@@ -182,6 +189,10 @@ class DocumentSplitter:
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self._use_sentence_splitter and self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component DocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")
 
@@ -77,16 +77,23 @@ class NLTKDocumentSplitter(DocumentSplitter):
         self.respect_sentence_boundary = respect_sentence_boundary
         self.use_split_rules = use_split_rules
         self.extend_abbreviations = extend_abbreviations
-        self.sentence_splitter = SentenceSplitter(
-            language=language,
-            use_split_rules=use_split_rules,
-            extend_abbreviations=extend_abbreviations,
-            keep_white_spaces=True,
-        )
+        self.sentence_splitter = None
         self.language = language
 
+    def warm_up(self):
+        """
+        Warm up the NLTKDocumentSplitter by loading the sentence tokenizer.
+        """
+        if self.sentence_splitter is None:
+            self.sentence_splitter = SentenceSplitter(
+                language=self.language,
+                use_split_rules=self.use_split_rules,
+                extend_abbreviations=self.extend_abbreviations,
+                keep_white_spaces=True,
+            )
+
     def _split_into_units(
-        self, text: str, split_by: Literal["function", "page", "passage", "sentence", "word", "line"]
+        self, text: str, split_by: Literal["function", "page", "passage", "period", "sentence", "word", "line"]
     ) -> List[str]:
         """
         Splits the text into units based on the specified split_by parameter.
@@ -106,6 +113,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
             # whitespace is preserved while splitting text into sentences when using keep_white_spaces=True
             # so split_at is set to an empty string
             self.split_at = ""
+            assert self.sentence_splitter is not None
             result = self.sentence_splitter.split_sentences(text)
             units = [sentence["sentence"] for sentence in result]
         elif split_by == "word":
@@ -142,6 +150,11 @@ class NLTKDocumentSplitter(DocumentSplitter):
         :raises TypeError: if the input is not a list of Documents.
         :raises ValueError: if the content of a document is None.
         """
+        if self.sentence_splitter is None:
+            raise RuntimeError(
+                "The component NLTKDocumentSplitter wasn't warmed up. Run 'warm_up()' before calling 'run()'."
+            )
+
         if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
             raise TypeError("DocumentSplitter expects a List of Documents as input.")
 
@@ -221,8 +234,9 @@ class NLTKDocumentSplitter(DocumentSplitter):
                 break
         return num_sentences_to_keep
 
+    @staticmethod
     def _concatenate_sentences_based_on_word_amount(
-        self, sentences: List[str], split_length: int, split_overlap: int
+        sentences: List[str], split_length: int, split_overlap: int
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -258,7 +272,7 @@ class NLTKDocumentSplitter(DocumentSplitter):
             split_start_indices.append(chunk_start_idx)
 
             # Get the number of sentences that overlap with the next chunk
-            num_sentences_to_keep = self._number_of_sentences_to_keep(
+            num_sentences_to_keep = NLTKDocumentSplitter._number_of_sentences_to_keep(
                 sentences=current_chunk, split_length=split_length, split_overlap=split_overlap
             )
             # Set up information for the new chunk
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    Moved the NLTK download of DocumentSplitter and NLTKDocumentSplitter to warm_up(). This prevents a call to an external API during instantiation. If a DocumentSplitter or NLTKDocumentSplitter is used for sentence splitting outside of a pipeline, warm_up() now needs to be called before running the component.
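
Outside a pipeline (which typically warms components up for you when it runs), the call order implied by the note above looks like this; a minimal sketch assuming Haystack 2.x import paths and the page_number meta field exercised in the tests:

    from haystack import Document
    from haystack.components.preprocessors import NLTKDocumentSplitter

    splitter = NLTKDocumentSplitter(split_by="sentence", split_length=2, language="en")
    splitter.warm_up()  # required since this change; may download NLTK data on first use

    docs = splitter.run(documents=[Document(content="Page one.\fPage two.")])["documents"]
    for doc in docs:
        print(doc.meta.get("page_number"), repr(doc.content))
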
@@ -44,16 +44,19 @@ class TestSplittingByFunctionOrCharacterRegex:
             ValueError, match="DocumentSplitter only works with text documents but content for document ID"
         ):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=[Document()])
             assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text
 
     def test_single_doc(self):
         with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
             splitter = DocumentSplitter()
+            splitter.warm_up()
             splitter.run(documents=Document())
 
     def test_empty_list(self):
         splitter = DocumentSplitter()
+        splitter.warm_up()
         res = splitter.run(documents=[])
         assert res == {"documents": []}
 
@@ -76,6 +79,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word(self):
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2
@@ -88,6 +92,7 @@ class TestSplittingByFunctionOrCharacterRegex:
 
     def test_split_by_word_with_threshold(self):
         splitter = DocumentSplitter(split_by="word", split_length=15, split_threshold=10)
+        splitter.warm_up()
         result = splitter.run(
             documents=[
                 Document(
@@ -105,6 +110,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         text1 = "This is a text with some words. There is a second sentence. And there is a third sentence."
         text2 = "This is a different text with some words. There is a second sentence. And there is a third sentence. And there is a fourth sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text1), Document(content=text2)])
         docs = result["documents"]
         assert len(docs) == 5
@@ -132,6 +138,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_period(self):
         splitter = DocumentSplitter(split_by="period", split_length=1)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3
@@ -148,6 +155,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_passage(self):
         splitter = DocumentSplitter(split_by="passage", split_length=1)
         text = "This is a text with some words. There is a second sentence.\n\nAnd there is a third sentence.\n\n And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3
@@ -164,6 +172,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_page(self):
         splitter = DocumentSplitter(split_by="page", split_length=1)
         text = "This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 3
@@ -183,6 +192,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_function(self):
         splitting_function = lambda s: s.split(".")
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
+        splitter.warm_up()
         text = "This.Is.A.Test"
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]
@@ -200,6 +210,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitting_function = lambda s: re.split(r"[\s]{2,}", s)
         splitter = DocumentSplitter(split_by="function", splitting_function=splitting_function)
         text = "This Is\n A Test"
+        splitter.warm_up()
         result = splitter.run(documents=[Document(id="1", content=text, meta={"key": "value"})])
         docs = result["documents"]
         assert len(docs) == 4
@@ -215,6 +226,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_word_with_overlap(self):
         splitter = DocumentSplitter(split_by="word", split_length=10, split_overlap=2)
         text = "This is a text with some words. There is a second sentence. And there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
         assert len(docs) == 2
@@ -234,6 +246,7 @@ class TestSplittingByFunctionOrCharacterRegex:
     def test_split_by_line(self):
         splitter = DocumentSplitter(split_by="line", split_length=1)
         text = "This is a text with some words.\nThere is a second sentence.\nAnd there is a third sentence."
+        splitter.warm_up()
         result = splitter.run(documents=[Document(content=text)])
         docs = result["documents"]
 
@@ -252,6 +265,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=10)
         doc1 = Document(content="This is a text with some words.")
         doc2 = Document(content="This is a different text with some words.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
         assert result["documents"][0].meta["source_id"] == doc1.id
         assert result["documents"][1].meta["source_id"] == doc2.id
@@ -262,6 +276,7 @@ class TestSplittingByFunctionOrCharacterRegex:
             Document(content="Text.", meta={"name": "doc 0"}),
             Document(content="Text.", meta={"name": "doc 1"}),
         ]
+        splitter.warm_up()
         result = splitter.run(documents=documents)
         assert len(result["documents"]) == 2
         assert result["documents"][0].id != result["documents"][1].id
@@ -273,6 +288,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=2)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
 
         expected_pages = [1, 1, 2, 2, 2, 1, 1, 3]
@@ -283,6 +299,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=1)
         doc1 = Document(content="This is some text.\f This text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
 
         expected_pages = [1, 1, 1, 1]
@@ -294,6 +311,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
 
         expected_pages = [1, 2, 2, 2]
@@ -305,6 +323,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
         expected_pages = [1, 2, 3]
         for doc, p in zip(result["documents"], expected_pages):
@@ -314,6 +333,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
         expected_pages = [1, 3]
 
@@ -324,6 +344,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
         doc1 = Document(content="This is some text. And\f this text is on another page.")
         doc2 = Document(content="This content has two.\f\f page brakes.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
 
         expected_pages = [1, 1, 1, 2, 2, 1, 1, 3]
@@ -334,6 +355,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_by="period", split_length=2, split_overlap=1)
         doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
         doc2 = Document(content="This content has two.\f\f page brakes. More text.")
+        splitter.warm_up()
         result = splitter.run(documents=[doc1, doc2])
 
         expected_pages = [1, 1, 1, 2, 1, 1]
@@ -345,6 +367,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
 
         expected_pages = [1, 2, 2]
@@ -356,6 +379,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         doc1 = Document(
             content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
         )
+        splitter.warm_up()
         result = splitter.run(documents=[doc1])
         expected_pages = [1, 2, 3]
 
@@ -366,6 +390,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         splitter = DocumentSplitter(split_length=10, split_overlap=5, split_by="word")
         text = "This is a text with some words. There is a second sentence. And a third sentence."
         doc = Document(content="This is a text with some words. There is a second sentence. And a third sentence.")
+        splitter.warm_up()
         docs = splitter.run(documents=[doc])["documents"]
 
         # check split_overlap is added to all the documents
@@ -487,6 +512,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content="")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"] == []
 
@@ -496,6 +522,7 @@ class TestSplittingByFunctionOrCharacterRegex:
         """
         splitter = DocumentSplitter()
         doc = Document(content=" ")
+        splitter.warm_up()
         results = splitter.run([doc])
         assert results["documents"][0].content == " "
 
@@ -543,6 +570,7 @@ class TestSplittingNLTKSentenceSplitter:
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
             "The moon was full."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
 
         assert len(documents) == 2
@@ -568,6 +596,7 @@ class TestSplittingNLTKSentenceSplitter:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
 
         assert len(documents) == 4
@@ -601,6 +630,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
            extend_abbreviations=True,
         )
+        document_splitter.warm_up()
 
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -633,6 +663,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
 
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -660,6 +691,7 @@ class TestSplittingNLTKSentenceSplitter:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
 
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"
@@ -692,6 +724,7 @@ class TestSplittingNLTKSentenceSplitter:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "
@@ -717,6 +750,7 @@ class TestSplittingNLTKSentenceSplitter:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
 
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
 
@@ -42,6 +42,7 @@ class TestNLTKDocumentSplitterSplitIntoUnits:
         document_splitter = NLTKDocumentSplitter(
             split_by="sentence", split_length=2, split_overlap=0, split_threshold=0, language="en"
         )
+        document_splitter.warm_up()
 
         text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night."
         units = document_splitter._split_into_units(text=text, split_by="sentence")
@@ -121,11 +122,13 @@ class TestNLTKDocumentSplitterRun:
     def test_run_type_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(TypeError):
+            document_splitter.warm_up()
             document_splitter.run(documents=Document(content="Moonlight shimmered softly."))  # type: ignore
 
     def test_run_value_error(self) -> None:
         document_splitter = NLTKDocumentSplitter()
         with pytest.raises(ValueError):
+            document_splitter.warm_up()
             document_splitter.run(documents=[Document(content=None)])
 
     def test_run_split_by_sentence_1(self) -> None:
@@ -138,6 +141,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
 
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... "
@@ -168,6 +172,7 @@ class TestNLTKDocumentSplitterRun:
             "This is another test sentence. (This is a third test sentence.) "
             "This is the last test sentence."
         )
+        document_splitter.warm_up()
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
 
         assert len(documents) == 4
@@ -201,6 +206,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
 
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -233,6 +239,7 @@ class TestNLTKDocumentSplitterRun:
             use_split_rules=True,
             extend_abbreviations=True,
         )
+        document_splitter.warm_up()
 
         text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5."
         documents = document_splitter.run(documents=[Document(content=text)])["documents"]
@@ -262,6 +269,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             language="en",
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
 
         text = (
             "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f"
@@ -294,6 +302,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             use_split_rules=False,
             extend_abbreviations=False,
         )
+        document_splitter.warm_up()
         text = (
             "This is a test sentence with many many words that exceeds the split length and should not be repeated. "
             "This is another test sentence. (This is a third test sentence.) "
@@ -319,6 +328,7 @@ class TestNLTKDocumentSplitterRespectSentenceBoundary:
             extend_abbreviations=True,
             respect_sentence_boundary=True,
         )
+        document_splitter.warm_up()
 
         text = (
             "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"