mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-27 15:08:43 +00:00
fix: make SentenceSplitter QUOTE_SPANS_RE regex ReDoS-safe (#9338)
* fix: make QUOTE_SPANS_RE regex ReDoS-safe * Removing the capture of leading non-character on double quotes, allowing quote with new lines, adding tests * cleaning * fixing release notes * changing import * adding test for Regex Denial of Service (ReDoS) * reducing the size/time of tests * Update test/components/preprocessors/test_sentence_tokenizer.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * Update test/components/preprocessors/test_sentence_tokenizer.py --------- Co-authored-by: Waivey <waivey@proton.me> Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
This commit is contained in:
parent
e5255d9061
commit
0f00c1882e
@ -40,7 +40,7 @@ ISO639_TO_NLTK = {
|
||||
"ml": "malayalam",
|
||||
}
|
||||
|
||||
QUOTE_SPANS_RE = re.compile(r"\W(\"+|\'+).*?\1")
|
||||
QUOTE_SPANS_RE = re.compile(r'"[^"]*"|\'[^\']*\'')
|
||||
|
||||
if nltk_imports.is_successful():
|
||||
|
||||
|
||||
@ -0,0 +1,7 @@
|
||||
---
|
||||
security:
|
||||
- |
|
||||
Made QUOTE_SPANS_RE regex ReDoS-safe. This prevents potential catastrophic backtracking on malicious inputs
|
||||
fixes:
|
||||
- |
|
||||
Fixed a potential ReDoS issue in QUOTE_SPANS_RE regex used inside the SentenceSplitter component.
|
||||
@ -1,8 +1,10 @@
|
||||
import time
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.components.preprocessors.sentence_tokenizer import SentenceSplitter
|
||||
from haystack.components.preprocessors.sentence_tokenizer import QUOTE_SPANS_RE
|
||||
|
||||
from pytest import LogCaptureFixture
|
||||
|
||||
@ -65,3 +67,48 @@ def test_read_abbreviations_missing_file(caplog: LogCaptureFixture):
|
||||
result = SentenceSplitter._read_abbreviations("pt")
|
||||
assert result == []
|
||||
assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text
|
||||
|
||||
|
||||
def test_quote_spans_regex():
|
||||
# double quotes
|
||||
text1 = 'He said "Hello world" and left.'
|
||||
matches1 = list(QUOTE_SPANS_RE.finditer(text1))
|
||||
assert len(matches1) == 1
|
||||
assert matches1[0].group() == '"Hello world"'
|
||||
|
||||
# single quotes
|
||||
text2 = "She replied 'Goodbye world' and smiled."
|
||||
matches2 = list(QUOTE_SPANS_RE.finditer(text2))
|
||||
assert len(matches2) == 1
|
||||
assert matches2[0].group() == "'Goodbye world'"
|
||||
|
||||
# multiple quotes
|
||||
text3 = 'First "quote" and second "quote" in same text.'
|
||||
matches3 = list(QUOTE_SPANS_RE.finditer(text3))
|
||||
assert len(matches3) == 2
|
||||
assert matches3[0].group() == '"quote"'
|
||||
assert matches3[1].group() == '"quote"'
|
||||
|
||||
# quotes containing newlines
|
||||
text4 = 'Text with "quote\nspanning\nmultiple\nlines"'
|
||||
matches4 = list(QUOTE_SPANS_RE.finditer(text4))
|
||||
assert len(matches4) == 1
|
||||
assert matches4[0].group() == '"quote\nspanning\nmultiple\nlines"'
|
||||
|
||||
# no quotes
|
||||
text5 = "This text has no quotes."
|
||||
matches5 = list(QUOTE_SPANS_RE.finditer(text5))
|
||||
assert len(matches5) == 0
|
||||
|
||||
|
||||
def test_split_sentences_performance() -> None:
|
||||
# make sure our regex is not vulnerable to Regex Denial of Service (ReDoS)
|
||||
# https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
|
||||
# this is a very long string, roughly 50 MB, but it should not take more than 2 seconds to process
|
||||
splitter = SentenceSplitter()
|
||||
text = " " + '"' * 20 + "A" * 50000000 + "B"
|
||||
start = time.time()
|
||||
_ = splitter.split_sentences(text)
|
||||
end = time.time()
|
||||
|
||||
assert end - start < 2, f"Execution time exceeded 2 seconds: {end - start:.2f} seconds"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user