Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-12-28 15:38:36 +00:00
Redone: Fix concatenation of sentences in PreProcessor. Add stride for word-based splits with sentence boundaries (#641)
* Update preprocessor.py

Concatenation of sentences done correctly. Stride functionality enabled for splitting by words while respecting sentence boundaries.

* Simplify code, add test

Co-authored-by: Krak91 <45461739+Krak91@users.noreply.github.com>
This commit is contained in:
parent
8c904d79d6
commit
efc754b166
@@ -98,22 +98,35 @@ class PreProcessor(BasePreProcessor):
             # split by words ensuring no sub sentence splits
             sentences = nltk.tokenize.sent_tokenize(text)
             word_count = 0
-            text_splits = []
-            current_slice = ""
+            list_splits = []
+            current_slice: List[str] = []
             for sen in sentences:
                 current_word_count = len(sen.split(" "))
                 if current_word_count > self.split_length:
                     logger.warning(f"A sentence found with word count higher than the split length.")
                 if word_count + current_word_count > self.split_length:
-                    text_splits.append(current_slice)
-                    current_slice = ""
-                    word_count = 0
-                if len(current_slice) != 0:
-                    sen = " " + sen
-                current_slice += sen
-                word_count += current_word_count
+                    list_splits.append(current_slice)
+                    #Enable split_stride with split_by='word' while respecting sentence boundaries.
+                    if self.split_stride:
+                        overlap = []
+                        w_count = 0
+                        for s in current_slice[::-1]:
+                            sen_len = len(s.split(" "))
+                            if w_count < self.split_stride:
+                                overlap.append(s)
+                                w_count += sen_len
+                            else:
+                                break
+                        current_slice = list(reversed(overlap))
+                        word_count = w_count
+                    else:
+                        current_slice = []
+                        word_count = 0
+                current_slice.append(sen)
+                word_count += len(sen.split(" "))
             if current_slice:
-                text_splits.append(current_slice)
+                list_splits.append(current_slice)
+            text_splits = [' '.join(sl) for sl in list_splits]
         else:
             # create individual "elements" of passage, sentence, or word
             if self.split_by == "passage":
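For readers skimming the diff, here is a minimal standalone sketch of the new splitting logic, assuming NLTK's punkt sentence tokenizer is installed; the function name and signature are illustrative and not part of the commit, and an extra guard against appending an empty first slice is added for safety:

from typing import List
import nltk  # requires: nltk.download("punkt")

def split_words_respecting_sentences(text: str, split_length: int, split_stride: int = 0) -> List[str]:
    # Group whole sentences into splits bounded by a word count.
    sentences = nltk.tokenize.sent_tokenize(text)
    list_splits: List[List[str]] = []
    current_slice: List[str] = []
    word_count = 0
    for sen in sentences:
        sen_words = len(sen.split(" "))
        if word_count + sen_words > split_length and current_slice:
            list_splits.append(current_slice)
            if split_stride:
                # Carry trailing sentences into the next split until the
                # stride's word budget is covered.
                overlap: List[str] = []
                w_count = 0
                for s in reversed(current_slice):
                    if w_count >= split_stride:
                        break
                    overlap.append(s)
                    w_count += len(s.split(" "))
                current_slice = list(reversed(overlap))
                word_count = w_count
            else:
                current_slice = []
                word_count = 0
        current_slice.append(sen)
        word_count += sen_words
    if current_slice:
        list_splits.append(current_slice)
    return [" ".join(sl) for sl in list_splits]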
@@ -47,6 +47,10 @@ def test_preprocess_word_split():
     assert len(doc["text"].split(" ")) <= 15 or doc["text"].startswith("This is to trick")
     assert len(documents) == 8
 
+    preprocessor = PreProcessor(split_length=40, split_stride=10, split_by="word", split_respect_sentence_boundary=True)
+    documents = preprocessor.process(document)
+    assert len(documents) == 5
+
 
 @pytest.mark.tika
 def test_preprocess_passage_split():
@@ -71,5 +75,4 @@ def test_clean_header_footer():
     assert len(documents) == 1
 
     assert "This is a header." not in documents[0]["text"]
-    assert "footer" not in documents[0]["text"]
     assert "footer" not in documents[0]["text"]
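A hedged usage sketch of the new parameter, mirroring the added test (the sample text is made up, and the import path reflects the layout at the time of this commit and may differ in later versions):

from haystack.preprocessor.preprocessor import PreProcessor

document = {"text": "First sentence here. Second sentence follows. A third one closes the sample."}
preprocessor = PreProcessor(split_length=40, split_stride=10, split_by="word",
                            split_respect_sentence_boundary=True)
documents = preprocessor.process(document)
# With split_stride set, consecutive splits share roughly split_stride words
# of trailing context, built from whole sentences rather than a mid-sentence cut.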