mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-31 11:56:35 +00:00

* no dpr test seems worth mocking * move distillation tests * pylint * mypy * pylint * move feature_extraction tests as well * move feature_extraction tests as well * merge feature extractor suites * get_language_model tests and adaptive model tests * duplicate test * moving fixtures * mypy * mypy-again * trigger * un-mock integration test * review feedback * feedback * pylint
247 lines
9.0 KiB
Python
247 lines
9.0 KiB
Python
import re
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from tokenizers.pre_tokenizers import WhitespaceSplit
|
|
|
|
from haystack.modeling.model.feature_extraction import FeatureExtractor
|
|
|
|
|
|
# Pretrained checkpoints covering three subword schemes:
# WordPiece (BERT), byte-level BPE (RoBERTa), and SentencePiece (XLNet).
BERT = "bert-base-cased"
ROBERTA = "roberta-base"
XLNET = "xlnet-base-cased"

TOKENIZERS_TO_TEST = [BERT, ROBERTA, XLNET]
# (model, continuation marker) pairs: the special prefix each tokenizer
# attaches to subword tokens, stripped when aligning tokens with raw text.
TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER = [(BERT, "##"), (ROBERTA, "Ġ"), (XLNET, "▁")]

# Edge-case input sentences exercised by the parametrized tests below.
REGULAR_SENTENCE = "This is a sentence"
GERMAN_SENTENCE = "Der entscheidende Pass"
OTHER_ALPHABETS = "力加勝北区ᴵᴺᵀᵃছজটডণত"
GIBBERISH_SENTENCE = "Thiso text is included tolod makelio sure Unicodeel is handled properly:"
SENTENCE_WITH_ELLIPSIS = "This is a sentence..."
SENTENCE_WITH_LINEBREAK_1 = "and another one\n\n\nwithout space"
SENTENCE_WITH_LINEBREAK_2 = """This is a sentence.
With linebreak"""
SENTENCE_WITH_LINEBREAKS = """Sentence
with
multiple
newlines
"""
# NOTE(review): the two literals below are named "multiple spaces"/"multiple tabs"
# but contain single spaces here — whitespace may have been collapsed when this
# file was rendered; verify against the upstream source.
SENTENCE_WITH_EXCESS_WHITESPACE = "This is a sentence with multiple spaces"
SENTENCE_WITH_TABS = "This is a sentence with multiple tabs"
SENTENCE_WITH_CUSTOM_TOKEN = "Let's see all on this text and. !23# neverseenwordspossible"
|
def convert_offset_from_word_reference_to_text_reference(offsets, words, word_spans):
    """
    Shift token offsets from word-relative to sentence-relative coordinates.

    Each (start, end) pair is translated by the start position of the word
    the token belongs to. Not a fixture, just a utility.
    """
    return [
        (begin + word_spans[word_idx][0], stop + word_spans[word_idx][0])
        for (begin, stop), word_idx in zip(offsets, words)
    ]
|
|
|
|
|
|
@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST)
def test_save_load(tmp_path, model_name: str):
    """Saving and reloading a tokenizer (with added tokens) must not change its encodings."""
    sample = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False)
    extractor.feature_extractor.add_tokens(new_tokens=["neverseentokens"])
    encoding_before = extractor.feature_extractor(sample)

    target_dir = tmp_path / "saved_tokenizer"
    extractor.feature_extractor.save_pretrained(target_dir)

    reloaded = FeatureExtractor(pretrained_model_name_or_path=target_dir)
    encoding_after = reloaded.feature_extractor(sample)

    assert encoding_before == encoding_after
|
|
|
|
|
|
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        OTHER_ALPHABETS,
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        SENTENCE_WITH_TABS,
    ],
)
@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST)
def test_tokenization_on_edge_cases_full_sequence_tokenization(model_name: str, edge_case: str):
    """
    Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
    """
    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)

    split_words = [entry[0] for entry in WhitespaceSplit().pre_tokenize_str(edge_case)]

    encoding = extractor.feature_extractor(split_words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    # Collapse all whitespace runs to single spaces before tokenizing the full string
    collapsed = " ".join(edge_case.split())
    reference_tokens = extractor.feature_extractor.tokenize(collapsed)

    assert encoding.tokens == reference_tokens
|
|
|
|
|
|
@pytest.mark.parametrize("edge_case", [SENTENCE_WITH_CUSTOM_TOKEN, GERMAN_SENTENCE])
@pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != ROBERTA])
def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exceptions(model_name: str, edge_case: str):
    """
    Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words".
    These test cases work for all tokenizers under test except for RoBERTa.
    """
    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)

    split_words = [entry[0] for entry in WhitespaceSplit().pre_tokenize_str(edge_case)]

    encoding = extractor.feature_extractor(split_words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    # Collapse all whitespace runs to single spaces before tokenizing the full string
    collapsed = " ".join(edge_case.split())
    reference_tokens = extractor.feature_extractor.tokenize(collapsed)

    assert encoding.tokens == reference_tokens
|
|
|
|
|
|
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        # OTHER_ALPHABETS, # contains [UNK] that are impossible to match back to original text space
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        SENTENCE_WITH_TABS,
    ],
)
@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER)
def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str, marker: str, edge_case: str):
    """Each token's computed text span must match the exact characters of the original sentence."""
    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)

    pieces = WhitespaceSplit().pre_tokenize_str(edge_case)
    split_words = [piece[0] for piece in pieces]
    spans = [piece[1] for piece in pieces]

    encoding = extractor.feature_extractor(split_words, is_split_into_words=True, add_special_tokens=False).encodings[0]

    # subword-tokens have special chars depending on model type. To align with original text we get rid of them
    cleaned_tokens = [tok.replace(marker, "") for tok in encoding.tokens]
    absolute_offsets = convert_offset_from_word_reference_to_text_reference(encoding.offsets, encoding.words, spans)

    for tok, (begin, stop) in zip(cleaned_tokens, absolute_offsets):
        assert tok == edge_case[begin:stop]
|
|
|
|
|
|
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        GERMAN_SENTENCE,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        OTHER_ALPHABETS,
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_CUSTOM_TOKEN,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_TABS,
    ],
)
def test_detokenization_for_bert(edge_case):
    """Gluing BERT tokens back into text and re-encoding must reproduce the same token sequence."""
    extractor = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)

    encoding = extractor(edge_case, add_special_tokens=False).encodings[0]

    # Join tokens with spaces, then remove the "##" continuation marker together with
    # the whitespace preceding it, re-attaching subword pieces to their word.
    rejoined = re.sub(r"(^|\s+)(##)", "", " ".join(encoding.tokens))

    roundtrip_ids = extractor(rejoined, add_special_tokens=False)["input_ids"]
    roundtrip_tokens = [extractor.decode([single_id]).strip() for single_id in roundtrip_ids]

    assert encoding.tokens == roundtrip_tokens
|
|
|
|
|
|
def test_encode_plus_for_bert():
    """
    Check the tokens, character offsets, and start-of-word flags produced when
    encoding a sentence containing unknown words, punctuation, and joined chars.

    Fix: removed a stray debug ``print(words.tolist())`` left in the test body.
    """
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    encoded_batch = tokenizer(text)
    encoded = encoded_batch.encodings[0]

    words = np.array(encoded.words)
    # First and last positions are the special tokens ([CLS]/[SEP]); presumably their
    # word index is None — replace with -1 so the diff below treats them as
    # "not a start of word" and stays numeric. TODO(review): confirm None semantics.
    words[0] = -1
    words[-1] = -1

    tokens = encoded.tokens
    offsets = [x[0] for x in encoded.offsets]
    # A token starts a new word exactly when its word index increases vs. the previous token
    start_of_word = [False] + list(np.ediff1d(words) > 0)

    assert list(zip(tokens, offsets, start_of_word)) == [
        ("[CLS]", 0, False),
        ("Some", 0, True),
        ("Text", 5, True),
        ("with", 10, True),
        ("never", 15, True),
        ("##see", 20, False),
        ("##nto", 23, False),
        ("##ken", 26, False),
        ("##s", 29, False),
        ("plus", 31, True),
        ("!", 36, True),
        ("215", 37, True),
        ("?", 40, True),
        ("#", 41, True),
        (".", 42, True),
        ("and", 44, True),
        ("a", 48, True),
        ("combined", 50, True),
        ("-", 58, True),
        ("token", 59, True),
        ("_", 64, True),
        ("with", 65, True),
        ("/", 69, True),
        ("ch", 70, True),
        ("##ars", 72, False),
        ("[SEP]", 0, False),
    ]
|
|
|
|
|
|
def test_tokenize_custom_vocab_bert():
    """Tokens added to the vocabulary must be kept whole, with correct offsets and word starts."""
    extractor = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    extractor.feature_extractor.add_tokens(new_tokens=["neverseentokens"])

    sample = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    pieces = extractor.feature_extractor.tokenize(sample)
    expected = "Some Text with neverseentokens plus ! 215 ? # . and a combined - token _ with / ch ##ars".split()
    assert pieces == expected

    encoding = extractor.feature_extractor(sample, add_special_tokens=False).encodings[0]
    first_chars = [offset[0] for offset in encoding.offsets]
    # A token starts a word when its word index increases; the very first token always does
    starts = [True] + list(np.ediff1d(encoding.words) > 0)

    assert encoding.tokens == pieces
    assert first_chars == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
    assert starts == [True] * 19 + [False]
|