mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-26 08:33:51 +00:00
Restructure checks in PreProcessor (#504)
* restructure checks * fix variable name * Fix test
This commit is contained in:
parent
c13abba6d6
commit
956543e239
@ -80,10 +80,14 @@ class PreProcessor(BasePreProcessor):
|
||||
if not self.split_length:
|
||||
raise Exception("split_length needs be set when using split_by.")
|
||||
|
||||
if self.split_respect_sentence_boundary and self.split_by not in("word","sentence"):
|
||||
raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with"
|
||||
" split_by='word' or split_by='sentence'.")
|
||||
|
||||
text = document["text"]
|
||||
|
||||
if self.split_respect_sentence_boundary: # split by words ensuring no sub sentence splits
|
||||
if self.split_by == "word":
|
||||
if self.split_respect_sentence_boundary and self.split_by == "word":
|
||||
# split by words ensuring no sub sentence splits
|
||||
sentences = nltk.tokenize.sent_tokenize(text)
|
||||
word_count = 0
|
||||
text_splits = []
|
||||
@ -100,11 +104,6 @@ class PreProcessor(BasePreProcessor):
|
||||
word_count += len(sen.split(" "))
|
||||
if current_slice:
|
||||
text_splits.append(current_slice)
|
||||
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"'split_respect_sentence_boundary' parameter is only compatible with " "split_by='word'."
|
||||
)
|
||||
else:
|
||||
# create individual "elements" of passage, sentence, or word
|
||||
if self.split_by == "passage":
|
||||
|
@ -44,11 +44,11 @@ def test_preprocess_word_split():
|
||||
|
||||
def test_preprocess_passage_split():
|
||||
document = {"text": TEXT}
|
||||
preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="passage")
|
||||
preprocessor = PreProcessor(split_length=1, split_stride=0, split_by="passage", split_respect_sentence_boundary=False)
|
||||
documents = preprocessor.process(document)
|
||||
assert len(documents) == 3
|
||||
|
||||
preprocessor = PreProcessor(split_length=2, split_stride=0, split_by="passage")
|
||||
preprocessor = PreProcessor(split_length=2, split_stride=0, split_by="passage", split_respect_sentence_boundary=False)
|
||||
documents = preprocessor.process(document)
|
||||
assert len(documents) == 2
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user