import time
from pathlib import Path
from unittest.mock import patch

import pytest
from pytest import LogCaptureFixture

from haystack.components.preprocessors.sentence_tokenizer import QUOTE_SPANS_RE, SentenceSplitter


def test_apply_split_rules_no_join() -> None:
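    """Spans that need no joining are returned unchanged."""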
    text = "This is a test. This is another test. And a third test."
    spans = [(0, 15), (16, 36), (37, 54)]
    result = SentenceSplitter._apply_split_rules(text, spans)
    assert len(result) == 3
    assert result == [(0, 15), (16, 36), (37, 54)]


def test_apply_split_rules_join_case_1():
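    """Spans that would split a quoted passage are merged so the whole quote stays in a single span."""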
    text = 'He said "This is sentence one. This is sentence two." Then he left.'
    result = SentenceSplitter._apply_split_rules(text, [(0, 30), (31, 53), (54, 67)])
    assert len(result) == 2
    assert result == [(0, 53), (54, 67)]


def test_apply_split_rules_join_case_3():
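    """Spans covering the items of a numbered list are merged into a single span."""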
    splitter = SentenceSplitter(language="en", use_split_rules=True)
    text = """
    1. First item
    2. Second item
    3. Third item."""
    spans = [(0, 7), (8, 25), (26, 44), (45, 56)]
    result = splitter._apply_split_rules(text, spans)
    assert len(result) == 1
    assert result == [(0, 56)]


def test_apply_split_rules_join_case_4() -> None:
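    """A sentence followed by a parenthetical statement is merged with that parenthetical into one span."""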
    text = "This is a test. (With a parenthetical statement.) And another sentence."
    spans = [(0, 15), (16, 50), (51, 74)]
    result = SentenceSplitter._apply_split_rules(text, spans)
    assert len(result) == 2
    assert result == [(0, 50), (51, 74)]


@pytest.fixture
def mock_file_content():
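    """Content of a mock abbreviations file: one abbreviation per line."""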
    return "Mr.\nDr.\nProf."


def test_read_abbreviations_existing_file(tmp_path, mock_file_content):
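    """Abbreviations are read from the language-specific file when it exists."""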
    abbrev_dir = tmp_path / "data" / "abbreviations"
    abbrev_dir.mkdir(parents=True)
    abbrev_file = abbrev_dir / "en.txt"
    abbrev_file.write_text(mock_file_content)

    with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path:
        mock_path.return_value.parent.parent.parent = tmp_path
        result = SentenceSplitter._read_abbreviations("en")
        assert result == ["Mr.", "Dr.", "Prof."]


def test_read_abbreviations_missing_file(caplog: LogCaptureFixture):
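    """A missing abbreviations file results in an empty list and a logged fallback message."""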
    with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path:
        mock_path.return_value.parent.parent = Path("/nonexistent")
        result = SentenceSplitter._read_abbreviations("pt")
        assert result == []
        assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text


def test_quote_spans_regex():
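    """QUOTE_SPANS_RE finds quoted spans (double or single quotes), including quotes spanning multiple lines."""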
    # double quotes
    text1 = 'He said "Hello world" and left.'
    matches1 = list(QUOTE_SPANS_RE.finditer(text1))
    assert len(matches1) == 1
    assert matches1[0].group() == '"Hello world"'

    # single quotes
    text2 = "She replied 'Goodbye world' and smiled."
    matches2 = list(QUOTE_SPANS_RE.finditer(text2))
    assert len(matches2) == 1
    assert matches2[0].group() == "'Goodbye world'"

    # multiple quotes
    text3 = 'First "quote" and second "quote" in same text.'
    matches3 = list(QUOTE_SPANS_RE.finditer(text3))
    assert len(matches3) == 2
    assert matches3[0].group() == '"quote"'
    assert matches3[1].group() == '"quote"'

    # quotes containing newlines
    text4 = 'Text with "quote\nspanning\nmultiple\nlines"'
    matches4 = list(QUOTE_SPANS_RE.finditer(text4))
    assert len(matches4) == 1
    assert matches4[0].group() == '"quote\nspanning\nmultiple\nlines"'

    # no quotes
    text5 = "This text has no quotes."
    matches5 = list(QUOTE_SPANS_RE.finditer(text5))
    assert len(matches5) == 0


def test_split_sentences_performance() -> None:
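    """split_sentences stays fast on a huge adversarial input, guarding against ReDoS."""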
    # make sure our regex is not vulnerable to Regex Denial of Service (ReDoS)
    # https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
    # this is a very long string, roughly 50 MB, but it should not take more than 2 seconds to process
    splitter = SentenceSplitter()
    text = " " + '"' * 20 + "A" * 50000000 + "B"
    start = time.time()
    _ = splitter.split_sentences(text)
    end = time.time()

    assert end - start < 2, f"Execution time exceeded 2 seconds: {end - start:.2f} seconds"