fix: make SentenceSplitter QUOTE_SPANS_RE regex ReDoS-safe (#9338)

* fix: make QUOTE_SPANS_RE regex ReDoS-safe

* Removing the capture of the leading non-word character before double quotes, allowing quotes to span new lines, adding tests

* cleaning

* fixing release notes

* changing import

* adding test for Regex Denial of Service (ReDoS)

* reducing the size/time of tests

* Update test/components/preprocessors/test_sentence_tokenizer.py

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>

* Update test/components/preprocessors/test_sentence_tokenizer.py

---------

Co-authored-by: Waivey <waivey@proton.me>
Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
David S. Batista 2025-05-02 16:40:17 +01:00 committed by GitHub
parent e5255d9061
commit 0f00c1882e
3 changed files with 55 additions and 1 deletion

haystack/components/preprocessors/sentence_tokenizer.py

@@ -40,7 +40,7 @@ ISO639_TO_NLTK = {
"ml": "malayalam",
}
QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")
QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'')
if nltk_imports.is_successful():
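
Not part of the diff: a minimal sketch comparing the old and new patterns on an adversarial input shaped like the one used in the new performance test below. The OLD_/NEW_ names, the input size, and the timing loop are illustrative assumptions, not code from this commit; the point is that the old pattern re-scans the remaining input once per candidate run of quote characters, while the new one matches in a single left-to-right pass.

import re
import time

# Old pattern: the lazy ".*?" plus the backreference can force the engine to
# re-scan the rest of the input once per candidate run of quote characters.
OLD_QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")

# New pattern: each alternative consumes an opening quote, any non-quote
# characters (including newlines), and a closing quote -- one linear pass.
NEW_QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'')

# Same shape as the input in test_split_sentences_performance, but much smaller
# than 50 MB so the old pattern still finishes on a typical machine.
text = " " + '"' * 20 + "A" * 500_000 + "B"

for name, pattern in (("old", OLD_QUOTE_SPANS_RE), ("new", NEW_QUOTE_SPANS_RE)):
    start = time.time()
    matches = list(pattern.finditer(text))
    print(f"{name}: {len(matches)} match(es) in {time.time() - start:.3f}s")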

Release note (new file)

@@ -0,0 +1,7 @@
---
security:
- |
Made the QUOTE_SPANS_RE regex ReDoS-safe. This prevents potential catastrophic backtracking on malicious inputs.
fixes:
- |
Fixed a potential ReDoS issue in QUOTE_SPANS_RE regex used inside the SentenceSplitter component.

test/components/preprocessors/test_sentence_tokenizer.py

@@ -1,8 +1,10 @@
import time
import pytest
from unittest.mock import patch
from pathlib import Path
from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter
from haystack.components.preprocessors.sentence_tokenizer import QUOTE_SPANS_RE
from pytest import LogCaptureFixture
@@ -65,3 +67,48 @@ def test_read_abbreviations_missing_file(caplog: LogCaptureFixture):
result = SentenceSplitter._read_abbreviations("pt")
assert result == []
assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text

def test_quote_spans_regex():
# double quotes
text1 = 'He said "Hello world" and left.'
matches1 = list(QUOTE_SPANS_RE.finditer(text1))
assert len(matches1) == 1
assert matches1[0].group() == '"Hello world"'
# single quotes
text2 = "She replied 'Goodbye world' and smiled."
matches2 = list(QUOTE_SPANS_RE.finditer(text2))
assert len(matches2) == 1
assert matches2[0].group() == "'Goodbye world'"
# multiple quotes
text3 = 'First "quote" and second "quote" in same text.'
matches3 = list(QUOTE_SPANS_RE.finditer(text3))
assert len(matches3) == 2
assert matches3[0].group() == '"quote"'
assert matches3[1].group() == '"quote"'
# quotes containing newlines
text4 = 'Text with "quote\nspanning\nmultiple\nlines"'
matches4 = list(QUOTE_SPANS_RE.finditer(text4))
assert len(matches4) == 1
assert matches4[0].group() == '"quote\nspanning\nmultiple\nlines"'
# no quotes
text5 = "This text has no quotes."
matches5 = list(QUOTE_SPANS_RE.finditer(text5))
assert len(matches5) == 0

def test_split_sentences_performance() -> None:
# make sure our regex is not vulnerable to Regex Denial of Service (ReDoS)
# https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
# this is a very long string, roughly 50 MB, but it should not take more than 2 seconds to process
splitter = SentenceSplitter()
text = " " + '"' * 20 + "A" * 50000000 + "B"
start = time.time()
_ = splitter.split_sentences(text)
end = time.time()
assert end - start < 2, f"Execution time exceeded 2 seconds: {end - start:.2f} seconds"
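
The behavioral change mentioned in the commit message (no longer requiring a non-word character before the quote, and letting quotes span new lines) can be seen with a small sketch like the following; the OLD_/NEW_ names, the spans helper, and the example strings are illustrative, not part of this commit.

import re

OLD_QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")   # pattern before this commit
NEW_QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'')  # pattern after this commit

def spans(pattern, text):
    return [m.group() for m in pattern.finditer(text)]

# A quote at the very start of the string: the old pattern needed a preceding
# non-word character, so it found nothing; the new one matches the span.
print(spans(OLD_QUOTE_SPANS_RE, '"Hello world" she said.'))  # []
print(spans(NEW_QUOTE_SPANS_RE, '"Hello world" she said.'))  # ['"Hello world"']

# A quote spanning a newline: "." does not match "\n", so the old pattern
# found nothing; "[^"]*" does match newlines, so the new one succeeds.
print(spans(OLD_QUOTE_SPANS_RE, 'He said "first\nsecond" and left.'))  # []
print(spans(NEW_QUOTE_SPANS_RE, 'He said "first\nsecond" and left.'))  # ['"first\nsecond"']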