fix: make SentenceSplitter QUOTE_SPANS_RE regex ReDoS-safe (#9338)

* fix: make QUOTE_SPANS_RE regex ReDoS-safe

* Removing the capture of the leading non-word character before double quotes, allowing quotes to span new lines, adding tests

* cleaning

* fixing release notes

* changing import

* adding test for Regex Denial of Service (ReDoS)

* reducing the size/time of tests

* Update test/components/preprocessors/test_sentence_tokenizer.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update test/components/preprocessors/test_sentence_tokenizer.py

---------

Co-authored-by: Waivey <waivey@proton.me>
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
David S. Batista 2025-05-02 16:40:17 +01:00 committed by GitHub
parent e5255d9061
commit 0f00c1882e
3 changed files with 55 additions and 1 deletion

haystack/components/preprocessors/sentence_tokenizer.py

@@ -40,7 +40,7 @@ ISO639_TO_NLTK = {
"ml": "malayalam",
}
QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")
QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'')
if nltk_imports.is_successful():
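
Not part of the diff: a minimal sketch comparing the old and new patterns on an adversarial input shaped like the one used in the new performance test below. The OLD_/NEW_ names, the input size, and the timing loop are illustrative assumptions, not code from this commit; the point is that the old pattern re-scans the remaining input once per candidate run of quote characters, while the new one matches in a single left-to-right pass.

import re
import time

# Old pattern: the lazy ".*?" plus the backreference can force the engine to
# re-scan the rest of the input once per candidate run of quote characters.
OLD_QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")

# New pattern: each alternative consumes an opening quote, any non-quote
# characters (including newlines), and a closing quote -- one linear pass.
NEW_QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'')

# Same shape as the input in test_split_sentences_performance, but much smaller
# than 50 MB so the old pattern still finishes on a typical machine.
text = " " + '"' * 20 + "A" * 500_000 + "B"

for name, pattern in (("old", OLD_QUOTE_SPANS_RE), ("new", NEW_QUOTE_SPANS_RE)):
    start = time.time()
    matches = list(pattern.finditer(text))
    print(f"{name}: {len(matches)} match(es) in {time.time() - start:.3f}s")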

Release note (new file)

@@ -0,0 +1,7 @@
---
security:
- |
Made the QUOTE_SPANS_RE regex ReDoS-safe. This prevents potential catastrophic backtracking on malicious inputs.
fixes:
- |
Fixed a potential ReDoS issue in QUOTE_SPANS_RE regex used inside the SentenceSplitter component.

test/components/preprocessors/test_sentence_tokenizer.py

@@ -1,8 +1,10 @@
import time
import pytest
from unittest.mock import patch
from pathlib import Path
from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter
from haystack.components.preprocessors.sentence_tokenizer import QUOTE_SPANS_RE
from pytest import LogCaptureFixture
@@ -65,3 +67,48 @@ def test_read_abbreviations_missing_file(caplog: LogCaptureFixture):
result = SentenceSplitter._read_abbreviations("pt")
assert result == []
assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text

def test_quote_spans_regex():
# double quotes
text1 = 'He said "Hello world" and left.'
matches1 = list(QUOTE_SPANS_RE.finditer(text1))
assert len(matches1) == 1
assert matches1[0].group() == '"Hello world"'
# single quotes
text2 = "She replied 'Goodbye world' and smiled."
matches2 = list(QUOTE_SPANS_RE.finditer(text2))
assert len(matches2) == 1
assert matches2[0].group() == "'Goodbye world'"
# multiple quotes
text3 = 'First "quote" and second "quote" in same text.'
matches3 = list(QUOTE_SPANS_RE.finditer(text3))
assert len(matches3) == 2
assert matches3[0].group() == '"quote"'
assert matches3[1].group() == '"quote"'
# quotes containing newlines
text4 = 'Text with "quote\nspanning\nmultiple\nlines"'
matches4 = list(QUOTE_SPANS_RE.finditer(text4))
assert len(matches4) == 1
assert matches4[0].group() == '"quote\nspanning\nmultiple\nlines"'
# no quotes
text5 = "This text has no quotes."
matches5 = list(QUOTE_SPANS_RE.finditer(text5))
assert len(matches5) == 0

def test_split_sentences_performance() -> None:
# make sure our regex is not vulnerable to Regex Denial of Service (ReDoS)
# https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
# this is a very long string, roughly 50 MB, but it should not take more than 2 seconds to process
splitter = SentenceSplitter()
text = " " + '"' * 20 + "A" * 50000000 + "B"
start = time.time()
_ = splitter.split_sentences(text)
end = time.time()
assert end - start < 2, f"Execution time exceeded 2 seconds: {end - start:.2f} seconds"
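
The behavioral change mentioned in the commit message (no longer requiring a non-word character before the quote, and letting quotes span new lines) can be seen with a small sketch like the following; the OLD_/NEW_ names, the spans helper, and the example strings are illustrative, not part of this commit.

import re

OLD_QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")   # pattern before this commit
NEW_QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'')  # pattern after this commit

def spans(pattern, text):
    return [m.group() for m in pattern.finditer(text)]

# A quote at the very start of the string: the old pattern needed a preceding
# non-word character, so it found nothing; the new one matches the span.
print(spans(OLD_QUOTE_SPANS_RE, '"Hello world" she said.'))  # []
print(spans(NEW_QUOTE_SPANS_RE, '"Hello world" she said.'))  # ['"Hello world"']

# A quote spanning a newline: "." does not match "\n", so the old pattern
# found nothing; "[^"]*" does match newlines, so the new one succeeds.
print(spans(OLD_QUOTE_SPANS_RE, 'He said "first\nsecond" and left.'))  # []
print(spans(NEW_QUOTE_SPANS_RE, 'He said "first\nsecond" and left.'))  # ['"first\nsecond"']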