Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-09-26 00:24:14 +00:00
Restructure checks in PreProcessor (#504)

* restructure checks
* fix variable name
* Fix test
parent c13abba6d6
commit 956543e239
@@ -80,31 +80,30 @@ class PreProcessor(BasePreProcessor):
         if not self.split_length:
             raise Exception("split_length needs be set when using split_by.")
 
+        if self.split_respect_sentence_boundary and self.split_by not in("word","sentence"):
+            raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with"
+                                      " split_by='word' or split_by='sentence'.")
 
         text = document["text"]
 
-        if self.split_respect_sentence_boundary:  # split by words ensuring no sub sentence splits
-            if self.split_by == "word":
-                sentences = nltk.tokenize.sent_tokenize(text)
-                word_count = 0
-                text_splits = []
-                current_slice = ""
-                for sen in sentences:
-                    current_word_count = len(sen.split(" "))
-                    if current_word_count > self.split_length:
-                        logger.warning(f"A sentence found with word count higher than the split length.")
-                    if word_count + current_word_count > self.split_length:
-                        text_splits.append(current_slice)
-                        current_slice = ""
-                        word_count = 0
-                    current_slice += sen
-                    word_count += len(sen.split(" "))
-                if current_slice:
-                    text_splits.append(current_slice)
-            else:
-                raise NotImplementedError(
-                    "'split_respect_sentence_boundary' parameter is only compatible with " "split_by='word'."
-                )
+        if self.split_respect_sentence_boundary and self.split_by == "word":
+            # split by words ensuring no sub sentence splits
+            sentences = nltk.tokenize.sent_tokenize(text)
+            word_count = 0
+            text_splits = []
+            current_slice = ""
+            for sen in sentences:
+                current_word_count = len(sen.split(" "))
+                if current_word_count > self.split_length:
+                    logger.warning(f"A sentence found with word count higher than the split length.")
+                if word_count + current_word_count > self.split_length:
+                    text_splits.append(current_slice)
+                    current_slice = ""
+                    word_count = 0
+                current_slice += sen
+                word_count += len(sen.split(" "))
+            if current_slice:
+                text_splits.append(current_slice)
         else:
             # create individual "elements" of passage, sentence, or word
             if self.split_by == "passage":
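For context, the word-split branch that this diff dedents can be exercised on its own. Below is a minimal standalone sketch of the same accumulation loop, assuming nltk is installed and its "punkt" tokenizer data has been downloaded (nltk.download("punkt")); the function name split_respecting_sentences is hypothetical and not part of the haystack API.

import nltk

def split_respecting_sentences(text: str, split_length: int) -> list:
    # Greedily pack whole sentences into slices of at most split_length
    # words, flushing the running slice whenever the next sentence would
    # overflow it (mirroring the loop in the diff above).
    sentences = nltk.tokenize.sent_tokenize(text)
    word_count = 0
    text_splits = []
    current_slice = ""
    for sen in sentences:
        current_word_count = len(sen.split(" "))
        if word_count + current_word_count > split_length:
            text_splits.append(current_slice)
            current_slice = ""
            word_count = 0
        current_slice += sen
        word_count += current_word_count
    if current_slice:
        text_splits.append(current_slice)
    return text_splits

For example, split_respecting_sentences("I like pizza. It is great. Everyone agrees.", 6) yields ["I like pizza.It is great.", "Everyone agrees."]: the third sentence would push the slice past six words, so the slice is flushed first. Note that, as in the original, sentences are concatenated without re-inserting the separating whitespace.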
@@ -44,11 +44,11 @@ def test_preprocess_word_split():
 
 
 def test_preprocess_passage_split():
     document = {"text": TEXT}
-    preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="passage")
+    preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="passage", split_respect_sentence_boundary=False)
     documents = preprocessor.process(document)
     assert len(documents) == 3
 
-    preprocessor = PreProcessor(split_length=2, split_stride=0, split_by="passage")
+    preprocessor = PreProcessor(split_length=2, split_stride=0, split_by="passage", split_respect_sentence_boundary=False)
     documents = preprocessor.process(document)
     assert len(documents) == 2
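The test change follows from the guard added in the first hunk: the explicit split_respect_sentence_boundary=False suggests the parameter still defaults to True, and under that default a passage split now trips the new NotImplementedError. A hedged sketch of a companion test, reusing the test module's existing imports and TEXT fixture:

import pytest

def test_passage_split_guard():
    # Hypothetical companion test: with the assumed default
    # split_respect_sentence_boundary=True, the restructured check rejects
    # split_by="passage" when the document is processed.
    preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="passage")
    with pytest.raises(NotImplementedError):
        preprocessor.process({"text": TEXT})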