Redone: Fix concatenation of sentences in PreProcessor. Add stride for word-based splits with sentence boundaries (#641)

* Update preprocessor.py

Concatenation of sentences is now done correctly. Stride functionality is enabled for splitting by words while respecting sentence boundaries.

* Simplify code, add test

Co-authored-by: Krak91 <45461739+Krak91@users.noreply.github.com>
This commit is contained in:
Timo Moeller 2020-12-09 16:12:36 +01:00 committed by GitHub
parent 8c904d79d6
commit efc754b166
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 28 additions and 12 deletions

View File

@@ -98,22 +98,35 @@ class PreProcessor(BasePreProcessor):
# split by words ensuring no sub sentence splits
sentences = nltk.tokenize.sent_tokenize(text)
word_count = 0
text_splits = []
current_slice = ""
list_splits = []
current_slice: List[str] = []
for sen in sentences:
current_word_count = len(sen.split(" "))
if current_word_count > self.split_length:
logger.warning(f"A sentence found with word count higher than the split length.")
if word_count + current_word_count > self.split_length:
text_splits.append(current_slice)
current_slice = ""
word_count = 0
if len(current_slice) != 0:
sen = " " + sen
current_slice += sen
word_count += current_word_count
list_splits.append(current_slice)
#Enable split_stride with split_by='word' while respecting sentence boundaries.
if self.split_stride:
overlap = []
w_count = 0
for s in current_slice[::-1]:
sen_len = len(s.split(" "))
if w_count < self.split_stride:
overlap.append(s)
w_count += sen_len
else:
break
current_slice = list(reversed(overlap))
word_count = w_count
else:
current_slice = []
word_count = 0
current_slice.append(sen)
word_count += len(sen.split(" "))
if current_slice:
text_splits.append(current_slice)
list_splits.append(current_slice)
text_splits = [' '.join(sl) for sl in list_splits]
else:
# create individual "elements" of passage, sentence, or word
if self.split_by == "passage":

View File

@@ -47,6 +47,10 @@ def test_preprocess_word_split():
assert len(doc["text"].split(" ")) <= 15 or doc["text"].startswith("This is to trick")
assert len(documents) == 8
preprocessor = PreProcessor(split_length=40, split_stride=10, split_by="word", split_respect_sentence_boundary=True)
documents = preprocessor.process(document)
assert len(documents) == 5
@pytest.mark.tika
def test_preprocess_passage_split():
@@ -71,5 +75,4 @@ def test_clean_header_footer():
assert len(documents) == 1
assert "This is a header." not in documents[0]["text"]
assert "footer" not in documents[0]["text"]
assert "footer" not in documents[0]["text"]