from typing import List import pytest from haystack import Document from pytest import LogCaptureFixture from haystack.components.preprocessors.nltk_document_splitter import NLTKDocumentSplitter, SentenceSplitter def test_init_warning_message(caplog: LogCaptureFixture) -> None: _ = NLTKDocumentSplitter(split_by="page", respect_sentence_boundary=True) assert "The 'respect_sentence_boundary' option is only supported for" in caplog.text class TestNLTKDocumentSplitterSplitIntoUnits: def test_document_splitter_split_into_units_word(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="word", split_length=3, split_overlap=0, split_threshold=0, language="en" ) text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything." units = document_splitter._split_into_units(text=text, split_by="word") assert units == [ "Moonlight ", "shimmered ", "softly, ", "wolves ", "howled ", "nearby, ", "night ", "enveloped ", "everything.", ] def test_document_splitter_split_into_units_sentence(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="sentence", split_length=2, split_overlap=0, split_threshold=0, language="en" ) text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night." units = document_splitter._split_into_units(text=text, split_by="sentence") assert units == [ "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. ", "It was a dark night.", ] def test_document_splitter_split_into_units_passage(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="passage", split_length=2, split_overlap=0, split_threshold=0, language="en" ) text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything.\n\nIt was a dark night." units = document_splitter._split_into_units(text=text, split_by="passage") assert units == [ "Moonlight shimmered softly, wolves howled nearby, night enveloped everything.\n\n", "It was a dark night.", ] def test_document_splitter_split_into_units_page(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="page", split_length=2, split_overlap=0, split_threshold=0, language="en" ) text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything.\fIt was a dark night." units = document_splitter._split_into_units(text=text, split_by="page") assert units == [ "Moonlight shimmered softly, wolves howled nearby, night enveloped everything.\f", "It was a dark night.", ] def test_document_splitter_split_into_units_raise_error(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="word", split_length=3, split_overlap=0, split_threshold=0, language="en" ) text = "Moonlight shimmered softly, wolves howled nearby, night enveloped everything." with pytest.raises(NotImplementedError): document_splitter._split_into_units(text=text, split_by="invalid") # type: ignore class TestNLTKDocumentSplitterNumberOfSentencesToKeep: @pytest.mark.parametrize( "sentences, expected_num_sentences", [ (["Moonlight shimmered softly, wolves howled nearby, night enveloped everything."], 0), ([" It was a dark night ..."], 0), ([" The moon was full."], 1), ], ) def test_number_of_sentences_to_keep(self, sentences: List[str], expected_num_sentences: int) -> None: num_sentences = NLTKDocumentSplitter._number_of_sentences_to_keep( sentences=sentences, split_length=5, split_overlap=2 ) assert num_sentences == expected_num_sentences def test_number_of_sentences_to_keep_split_overlap_zero(self) -> None: sentences = [ "Moonlight shimmered softly, wolves howled nearby, night enveloped everything.", " It was a dark night ...", " The moon was full.", ] num_sentences = NLTKDocumentSplitter._number_of_sentences_to_keep( sentences=sentences, split_length=5, split_overlap=0 ) assert num_sentences == 0 class TestNLTKDocumentSplitterRun: def test_run_type_error(self) -> None: document_splitter = NLTKDocumentSplitter() with pytest.raises(TypeError): document_splitter.run(documents=Document(content="Moonlight shimmered softly.")) # type: ignore def test_run_value_error(self) -> None: document_splitter = NLTKDocumentSplitter() with pytest.raises(ValueError): document_splitter.run(documents=[Document(content=None)]) def test_run_split_by_sentence_1(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="sentence", split_length=2, split_overlap=0, split_threshold=0, language="en", use_split_rules=True, extend_abbreviations=True, ) text = ( "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night ... " "The moon was full." ) documents = document_splitter.run(documents=[Document(content=text)])["documents"] assert len(documents) == 2 assert ( documents[0].content == "Moonlight shimmered softly, wolves howled nearby, night enveloped " "everything. It was a dark night ... " ) assert documents[1].content == "The moon was full." def test_run_split_by_sentence_2(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="sentence", split_length=1, split_overlap=0, split_threshold=0, language="en", use_split_rules=False, extend_abbreviations=True, ) text = ( "This is a test sentence with many many words that exceeds the split length and should not be repeated. " "This is another test sentence. (This is a third test sentence.) " "This is the last test sentence." ) documents = document_splitter.run(documents=[Document(content=text)])["documents"] assert len(documents) == 4 assert ( documents[0].content == "This is a test sentence with many many words that exceeds the split length and should not be repeated. " ) assert documents[0].meta["page_number"] == 1 assert documents[0].meta["split_id"] == 0 assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) assert documents[1].content == "This is another test sentence. " assert documents[1].meta["page_number"] == 1 assert documents[1].meta["split_id"] == 1 assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) assert documents[2].content == "(This is a third test sentence.) " assert documents[2].meta["page_number"] == 1 assert documents[2].meta["split_id"] == 2 assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) assert documents[3].content == "This is the last test sentence." assert documents[3].meta["page_number"] == 1 assert documents[3].meta["split_id"] == 3 assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) def test_run_split_by_sentence_3(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="sentence", split_length=1, split_overlap=0, split_threshold=0, language="en", use_split_rules=True, extend_abbreviations=True, ) text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5." documents = document_splitter.run(documents=[Document(content=text)])["documents"] assert len(documents) == 4 assert documents[0].content == "Sentence on page 1.\f" assert documents[0].meta["page_number"] == 1 assert documents[0].meta["split_id"] == 0 assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) assert documents[1].content == "Sentence on page 2. \f" assert documents[1].meta["page_number"] == 2 assert documents[1].meta["split_id"] == 1 assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) assert documents[2].content == "Sentence on page 3. \f\f " assert documents[2].meta["page_number"] == 3 assert documents[2].meta["split_id"] == 2 assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) assert documents[3].content == "Sentence on page 5." assert documents[3].meta["page_number"] == 5 assert documents[3].meta["split_id"] == 3 assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) def test_run_split_by_sentence_4(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="sentence", split_length=2, split_overlap=1, split_threshold=0, language="en", use_split_rules=True, extend_abbreviations=True, ) text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5." documents = document_splitter.run(documents=[Document(content=text)])["documents"] assert len(documents) == 3 assert documents[0].content == "Sentence on page 1.\fSentence on page 2. \f" assert documents[0].meta["page_number"] == 1 assert documents[0].meta["split_id"] == 0 assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) assert documents[1].content == "Sentence on page 2. \fSentence on page 3. \f\f " assert documents[1].meta["page_number"] == 2 assert documents[1].meta["split_id"] == 1 assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) assert documents[2].content == "Sentence on page 3. \f\f Sentence on page 5." assert documents[2].meta["page_number"] == 3 assert documents[2].meta["split_id"] == 2 assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) class TestNLTKDocumentSplitterRespectSentenceBoundary: def test_run_split_by_word_respect_sentence_boundary(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="word", split_length=3, split_overlap=0, split_threshold=0, language="en", respect_sentence_boundary=True, ) text = ( "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. It was a dark night.\f" "The moon was full." ) documents = document_splitter.run(documents=[Document(content=text)])["documents"] assert len(documents) == 3 assert documents[0].content == "Moonlight shimmered softly, wolves howled nearby, night enveloped everything. " assert documents[0].meta["page_number"] == 1 assert documents[0].meta["split_id"] == 0 assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) assert documents[1].content == "It was a dark night.\f" assert documents[1].meta["page_number"] == 1 assert documents[1].meta["split_id"] == 1 assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) assert documents[2].content == "The moon was full." assert documents[2].meta["page_number"] == 2 assert documents[2].meta["split_id"] == 2 assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) def test_run_split_by_word_respect_sentence_boundary_no_repeats(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="word", split_length=13, split_overlap=3, split_threshold=0, language="en", respect_sentence_boundary=True, use_split_rules=False, extend_abbreviations=False, ) text = ( "This is a test sentence with many many words that exceeds the split length and should not be repeated. " "This is another test sentence. (This is a third test sentence.) " "This is the last test sentence." ) documents = document_splitter.run([Document(content=text)])["documents"] assert len(documents) == 3 assert ( documents[0].content == "This is a test sentence with many many words that exceeds the split length and should not be repeated. " ) assert "This is a test sentence with many many words" not in documents[1].content assert "This is a test sentence with many many words" not in documents[2].content def test_run_split_by_word_respect_sentence_boundary_with_split_overlap_and_page_breaks(self) -> None: document_splitter = NLTKDocumentSplitter( split_by="word", split_length=5, split_overlap=1, split_threshold=0, language="en", use_split_rules=True, extend_abbreviations=True, respect_sentence_boundary=True, ) text = "Sentence on page 1.\fSentence on page 2. \fSentence on page 3. \f\f Sentence on page 5." documents = document_splitter.run(documents=[Document(content=text)])["documents"] assert len(documents) == 4 assert documents[0].content == "Sentence on page 1.\f" assert documents[0].meta["page_number"] == 1 assert documents[0].meta["split_id"] == 0 assert documents[0].meta["split_idx_start"] == text.index(documents[0].content) assert documents[1].content == "Sentence on page 1.\fSentence on page 2. \f" assert documents[1].meta["page_number"] == 1 assert documents[1].meta["split_id"] == 1 assert documents[1].meta["split_idx_start"] == text.index(documents[1].content) assert documents[2].content == "Sentence on page 2. \fSentence on page 3. \f\f " assert documents[2].meta["page_number"] == 2 assert documents[2].meta["split_id"] == 2 assert documents[2].meta["split_idx_start"] == text.index(documents[2].content) assert documents[3].content == "Sentence on page 3. \f\f Sentence on page 5." assert documents[3].meta["page_number"] == 3 assert documents[3].meta["split_id"] == 3 assert documents[3].meta["split_idx_start"] == text.index(documents[3].content) class TestSentenceSplitter: def test_apply_split_rules_second_while_loop(self) -> None: text = "This is a test. (With a parenthetical statement.) And another sentence." spans = [(0, 15), (16, 50), (51, 74)] result = SentenceSplitter._apply_split_rules(text, spans) assert len(result) == 2 assert result == [(0, 50), (51, 74)] def test_apply_split_rules_no_join(self) -> None: text = "This is a test. This is another test. And a third test." spans = [(0, 15), (16, 36), (37, 54)] result = SentenceSplitter._apply_split_rules(text, spans) assert len(result) == 3 assert result == [(0, 15), (16, 36), (37, 54)] @pytest.mark.parametrize( "text,span,next_span,quote_spans,expected", [ # triggers sentence boundary is inside a quote ('He said, "Hello World." Then left.', (0, 15), (16, 23), [(9, 23)], True) ], ) def test_needs_join_cases(self, text, span, next_span, quote_spans, expected): result = SentenceSplitter._needs_join(text, span, next_span, quote_spans) assert result == expected, f"Expected {expected} for input: {text}, {span}, {next_span}, {quote_spans}"