haystack/e2e/modeling/test_feature_extraction.py
ZanSara b60d9a2cbf
test: move several modeling tests in e2e/ (#4308)
* no dpr test seems worth mocking

* move distillation tests

* pylint

* mypy

* pylint

* move feature_extraction tests as well

* move feature_extraction tests as well

* merge feature extractor suites

* get_language_model tests and adaptive model tests

* duplicate test

* moving fixtures

* mypy

* mypy-again

* trigger

* un-mock integration test

* review feedback

* feedback

* pylint
2023-04-28 17:08:41 +02:00

247 lines
9.0 KiB
Python

import re
import pytest
import numpy as np
from tokenizers.pre_tokenizers import WhitespaceSplit
from haystack.modeling.model.feature_extraction import FeatureExtractor
# Model identifiers for the tokenizers under test.
BERT = "bert-base-cased"
ROBERTA = "roberta-base"
XLNET = "xlnet-base-cased"
# All tokenizers exercised by the parametrized tests below.
TOKENIZERS_TO_TEST = [BERT, ROBERTA, XLNET]
# (model, subword marker) pairs: the marker is the special character each
# tokenizer attaches to subword tokens (BERT: "##", RoBERTa: "Ġ", XLNet: none).
TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER = [(BERT, "##"), (ROBERTA, "Ġ"), (XLNET, "")]
# Edge-case input strings shared by the tokenization tests below.
REGULAR_SENTENCE = "This is a sentence"
GERMAN_SENTENCE = "Der entscheidende Pass"
OTHER_ALPHABETS = "力加勝北区ᴵᴺᵀᵃছজটডণত"
GIBBERISH_SENTENCE = "Thiso text is included tolod makelio sure Unicodeel is handled properly:"
SENTENCE_WITH_ELLIPSIS = "This is a sentence..."
SENTENCE_WITH_LINEBREAK_1 = "and another one\n\n\nwithout space"
SENTENCE_WITH_LINEBREAK_2 = """This is a sentence.
With linebreak"""
SENTENCE_WITH_LINEBREAKS = """Sentence
with
multiple
newlines
"""
SENTENCE_WITH_EXCESS_WHITESPACE = "This is a sentence with multiple spaces"
SENTENCE_WITH_TABS = "This is a sentence with multiple tabs"
SENTENCE_WITH_CUSTOM_TOKEN = "Let's see all on this text and. !23# neverseenwordspossible"
def convert_offset_from_word_reference_to_text_reference(offsets, words, word_spans):
    """
    Token offsets are originally relative to the beginning of the word
    We make them relative to the beginning of the sentence.
    Not a fixture, just a utility.
    """
    # Shift each (start, end) pair by the start position of its word in the text.
    return [
        (begin + word_spans[word_idx][0], stop + word_spans[word_idx][0])
        for (begin, stop), word_idx in zip(offsets, words)
    ]
@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST)
def test_save_load(tmp_path, model_name: str):
    """Round-trip a tokenizer (with a custom added token) through save_pretrained and check encodings match."""
    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False)
    sample = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"
    extractor.feature_extractor.add_tokens(new_tokens=["neverseentokens"])
    encoding_before = extractor.feature_extractor(sample)

    target_dir = tmp_path / "saved_tokenizer"
    extractor.feature_extractor.save_pretrained(target_dir)

    reloaded = FeatureExtractor(pretrained_model_name_or_path=target_dir)
    encoding_after = reloaded.feature_extractor(sample)
    assert encoding_before == encoding_after
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        OTHER_ALPHABETS,
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        SENTENCE_WITH_TABS,
    ],
)
@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST)
def test_tokenization_on_edge_cases_full_sequence_tokenization(model_name: str, edge_case: str):
    """
    Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words"
    """
    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)
    split_words = [word for word, _ in WhitespaceSplit().pre_tokenize_str(edge_case)]
    encoding = extractor.feature_extractor(split_words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    # Collapse every whitespace run to a single space before tokenizing the full string.
    normalized_text = " ".join(edge_case.split())
    assert encoding.tokens == extractor.feature_extractor.tokenize(normalized_text)
@pytest.mark.parametrize("edge_case", [SENTENCE_WITH_CUSTOM_TOKEN, GERMAN_SENTENCE])
@pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != ROBERTA])
def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exceptions(model_name: str, edge_case: str):
    """
    Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words".
    These test cases work for all tokenizers under test except for RoBERTa.
    """
    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)
    split_words = [word for word, _ in WhitespaceSplit().pre_tokenize_str(edge_case)]
    encoding = extractor.feature_extractor(split_words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    # Collapse every whitespace run to a single space before tokenizing the full string.
    normalized_text = " ".join(edge_case.split())
    assert encoding.tokens == extractor.feature_extractor.tokenize(normalized_text)
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        # OTHER_ALPHABETS, # contains [UNK] that are impossible to match back to original text space
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        SENTENCE_WITH_TABS,
    ],
)
@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER)
def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str, marker: str, edge_case: str):
    """Check that each token's text-level offsets point back to the exact substring of the input."""
    extractor = FeatureExtractor(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True)
    pieces = WhitespaceSplit().pre_tokenize_str(edge_case)
    split_words = [piece[0] for piece in pieces]
    spans = [piece[1] for piece in pieces]
    encoding = extractor.feature_extractor(split_words, is_split_into_words=True, add_special_tokens=False).encodings[0]
    # subword-tokens have special chars depending on model type. To align with original text we get rid of them
    cleaned_tokens = [tok.replace(marker, "") for tok in encoding.tokens]
    absolute_offsets = convert_offset_from_word_reference_to_text_reference(encoding.offsets, encoding.words, spans)
    for cleaned, (begin, stop) in zip(cleaned_tokens, absolute_offsets):
        assert cleaned == edge_case[begin:stop]
@pytest.mark.parametrize(
    "edge_case",
    [
        REGULAR_SENTENCE,
        GERMAN_SENTENCE,
        SENTENCE_WITH_EXCESS_WHITESPACE,
        OTHER_ALPHABETS,
        GIBBERISH_SENTENCE,
        SENTENCE_WITH_ELLIPSIS,
        SENTENCE_WITH_CUSTOM_TOKEN,
        SENTENCE_WITH_LINEBREAK_1,
        SENTENCE_WITH_LINEBREAK_2,
        SENTENCE_WITH_LINEBREAKS,
        SENTENCE_WITH_TABS,
    ],
)
def test_detokenization_for_bert(edge_case):
    """Tokenize, stitch the tokens back into text, re-tokenize, and check the token sequence survives."""
    extractor = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    encoding = extractor(edge_case, add_special_tokens=False).encodings[0]
    # Join tokens with spaces, then drop the "##" continuation markers (and the space before them).
    rebuilt_text = re.sub(r"(^|\s+)(##)", "", " ".join(encoding.tokens))
    rebuilt_ids = extractor(rebuilt_text, add_special_tokens=False)["input_ids"]
    rebuilt_tokens = [extractor.decode([token_id]).strip() for token_id in rebuilt_ids]
    assert encoding.tokens == rebuilt_tokens
def test_encode_plus_for_bert():
    """
    Tokenize a sentence containing never-seen words and punctuation with BERT and
    verify that tokens, character offsets, and start-of-word flags all line up.
    """
    tokenizer = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"
    encoded_batch = tokenizer(text)
    encoded = encoded_batch.encodings[0]
    words = np.array(encoded.words)
    # [CLS] and [SEP] carry no word id; mark them with -1 so they never count as word starts.
    words[0] = -1
    words[-1] = -1
    tokens = encoded.tokens
    # Keep only the start character of each token's offset span.
    offsets = [x[0] for x in encoded.offsets]
    # A token starts a new word exactly when its word id increases over the previous token's.
    start_of_word = [False] + list(np.ediff1d(words) > 0)
    assert list(zip(tokens, offsets, start_of_word)) == [
        ("[CLS]", 0, False),
        ("Some", 0, True),
        ("Text", 5, True),
        ("with", 10, True),
        ("never", 15, True),
        ("##see", 20, False),
        ("##nto", 23, False),
        ("##ken", 26, False),
        ("##s", 29, False),
        ("plus", 31, True),
        ("!", 36, True),
        ("215", 37, True),
        ("?", 40, True),
        ("#", 41, True),
        (".", 42, True),
        ("and", 44, True),
        ("a", 48, True),
        ("combined", 50, True),
        ("-", 58, True),
        ("token", 59, True),
        ("_", 64, True),
        ("with", 65, True),
        ("/", 69, True),
        ("ch", 70, True),
        ("##ars", 72, False),
        ("[SEP]", 0, False),
    ]
def test_tokenize_custom_vocab_bert():
    """After registering a custom token, both tokenize() and __call__ must treat it as one unit with correct offsets."""
    extractor = FeatureExtractor(pretrained_model_name_or_path=BERT, do_lower_case=False)
    extractor.feature_extractor.add_tokens(new_tokens=["neverseentokens"])
    sample = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars"

    token_list = extractor.feature_extractor.tokenize(sample)
    expected = "Some Text with neverseentokens plus ! 215 ? # . and a combined - token _ with / ch ##ars".split()
    assert token_list == expected

    encoding = extractor.feature_extractor(sample, add_special_tokens=False).encodings[0]
    first_chars = [span[0] for span in encoding.offsets]
    # A token starts a new word exactly when its word id increases over the previous token's.
    word_starts = [True] + list(np.ediff1d(encoding.words) > 0)
    assert encoding.tokens == token_list
    assert first_chars == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72]
    assert word_starts == [True] * 19 + [False]