diff --git a/haystack/components/preprocessors/sentence_tokenizer.py b/haystack/components/preprocessors/sentence_tokenizer.py index 2cb77347d..6d38d14ec 100644 --- a/haystack/components/preprocessors/sentence_tokenizer.py +++ b/haystack/components/preprocessors/sentence_tokenizer.py @@ -40,7 +40,7 @@ ISO639_TO_NLTK = { "ml": "malayalam", } -QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1") +QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'') if nltk_imports.is_successful(): diff --git a/releasenotes/notes/SentenceSplitter-ReDoS-fix-ca91b020cab50bf6.yaml b/releasenotes/notes/SentenceSplitter-ReDoS-fix-ca91b020cab50bf6.yaml new file mode 100644 index 000000000..d66f68c97 --- /dev/null +++ b/releasenotes/notes/SentenceSplitter-ReDoS-fix-ca91b020cab50bf6.yaml @@ -0,0 +1,7 @@ +--- +security: + - | + Made QUOTE_SPANS_RE regex ReDoS-safe. This prevents potential catastrophic backtracking on malicious inputs +fixes: + - | + Fixed a potential ReDoS issue in QUOTE_SPANS_RE regex used inside the SentenceSplitter component. diff --git a/test/components/preprocessors/test_sentence_tokenizer.py b/test/components/preprocessors/test_sentence_tokenizer.py index f8aaf68fa..a3b7f8c80 100644 --- a/test/components/preprocessors/test_sentence_tokenizer.py +++ b/test/components/preprocessors/test_sentence_tokenizer.py @@ -1,8 +1,10 @@ +import time import pytest from unittest.mock import patch from pathlib import Path from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter +from haystack.components.preprocessors.sentence_tokenizer import QUOTE_SPANS_RE from pytest import LogCaptureFixture @@ -65,3 +67,48 @@ def test_read_abbreviations_missing_file(caplog: LogCaptureFixture): result = SentenceSplitter._read_abbreviations("pt") assert result == [] assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text + + +def test_quote_spans_regex(): + # double quotes + text1 = 'He said "Hello world" and left.' + matches1 = list(QUOTE_SPANS_RE.finditer(text1)) + assert len(matches1) == 1 + assert matches1[0].group() == '"Hello world"' + + # single quotes + text2 = "She replied 'Goodbye world' and smiled." + matches2 = list(QUOTE_SPANS_RE.finditer(text2)) + assert len(matches2) == 1 + assert matches2[0].group() == "'Goodbye world'" + + # multiple quotes + text3 = 'First "quote" and second "quote" in same text.' + matches3 = list(QUOTE_SPANS_RE.finditer(text3)) + assert len(matches3) == 2 + assert matches3[0].group() == '"quote"' + assert matches3[1].group() == '"quote"' + + # quotes containing newlines + text4 = 'Text with "quote\nspanning\nmultiple\nlines"' + matches4 = list(QUOTE_SPANS_RE.finditer(text4)) + assert len(matches4) == 1 + assert matches4[0].group() == '"quote\nspanning\nmultiple\nlines"' + + # no quotes + text5 = "This text has no quotes." + matches5 = list(QUOTE_SPANS_RE.finditer(text5)) + assert len(matches5) == 0 + + +def test_split_sentences_performance() -> None: + # make sure our regex is not vulnerable to Regex Denial of Service (ReDoS) + # https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS + # this is a very long string, roughly 50 MB, but it should not take more than 2 seconds to process + splitter = SentenceSplitter() + text = " " + '"' * 20 + "A" * 50000000 + "B" + start = time.time() + _ = splitter.split_sentences(text) + end = time.time() + + assert end - start < 2, f"Execution time exceeded 2 seconds: {end - start:.2f} seconds"