import copy
import logging

import pytest
from transformers import AutoTokenizer

from haystack.modeling.data_handler.processor import SquadProcessor


# during inference (parameter return_baskets = True) we do not convert labels
def test_dataset_from_dicts_qa_inference(samples_path, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]

    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids, baskets = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=True
            )
            assert tensor_names == [
                "input_ids",
                "padding_mask",
                "segment_ids",
                "passage_start_t",
                "start_of_word",
                "labels",
                "id",
                "seq_2_start_t",
                "span_mask",
            ], f"Processing for {model} has changed."
            assert len(problematic_sample_ids) == 0, f"Processing for {model} has changed."
            assert baskets[0].id_external == "5ad3d560604f3c001a3ff2c8", f"Processing for {model} has changed."
            assert baskets[0].id_internal == "1-0", f"Processing for {model} has changed."

            # roberta
            if model == "deepset/roberta-base-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 6
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
                        0,
                        6179,
                        171,
                        82,
                        697,
                        11,
                        2201,
                        116,
                        2,
                        2,
                        26795,
                        2614,
                        34,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
                        0,
                        6179,
                        171,
                        82,
                        697,
                        11,
                        5459,
                        116,
                        2,
                        2,
                        26795,
                        2614,
                        34,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."

            # bert
            if model == "deepset/bert-base-cased-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        1731,
                        1242,
                        1234,
                        1686,
                        1107,
                        2123,
                        136,
                        102,
                        3206,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        1731,
                        1242,
                        1234,
                        1686,
                        1107,
                        3206,
                        136,
                        102,
                        3206,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."

            # xlm-roberta
            if model == "deepset/xlm-roberta-large-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 7
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
                        0,
                        11249,
                        5941,
                        3395,
                        6867,
                        23,
                        7270,
                        32,
                        2,
                        2,
                        10271,
                        1556,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
                        0,
                        11249,
                        5941,
                        3395,
                        6867,
                        23,
                        10271,
                        32,
                        2,
                        2,
                        10271,
                        1556,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."

            # minilm and electra have same vocab + tokenizer
            if model == "deepset/minilm-uncased-squad2" or model == "deepset/electra-base-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        2129,
                        2116,
                        2111,
                        2444,
                        1999,
                        3000,
                        1029,
                        102,
                        4068,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        2129,
                        2116,
                        2111,
                        2444,
                        1999,
                        4068,
                        1029,
                        102,
                        4068,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."


def test_batch_encoding_flatten_rename():
    from haystack.modeling.data_handler.dataset import flatten_rename

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
    encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)

    keys = ["input_ids", "token_type_ids", "attention_mask"]
    rename_keys = ["input_ids", "segment_ids", "padding_mask"]
    features_flat = flatten_rename(encoded_inputs, keys, rename_keys)
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in rename_keys:
            assert k in e, f"feature dict list item {e} in a list should have a key {k}"

    # no keys/rename keys provided
    features_flat = flatten_rename(encoded_inputs)
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in keys:
            assert k in e, f"feature dict list item {e} in a list should have a key {k}"

    # empty input keys
    flatten_rename(encoded_inputs, [])

    # empty keys and rename keys
    flatten_rename(encoded_inputs, [], [])

    # no encoding_batch provided
    flatten_rename(None, [], [])

    # keys and renamed_keys have different sizes
    with pytest.raises(AssertionError):
        flatten_rename(encoded_inputs, [], ["blah"])


def test_dataset_from_dicts_qa_label_conversion(samples_path, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]

    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=6)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=False
            )
            if sample_type == "answer-wrong" or sample_type == "answer-offset-wrong":
                assert len(problematic_sample_ids) == 1, f"Processing labels for {model} has changed."

            if sample_type == "noanswer":
                assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                    0,
                    0,
                ], f"Processing labels for {model} has changed."
                assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
                    -1,
                    -1,
                ], f"Processing labels for {model} has changed."

            if sample_type == "vanilla":
                # roberta
                if model == "deepset/roberta-base-squad2":
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                        13,
                        13,
                    ], f"Processing labels for {model} has changed."
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
                        13,
                        14,
                    ], f"Processing labels for {model} has changed."
                # bert, minilm, electra
                if (
                    model == "deepset/bert-base-cased-squad2"
                    or model == "deepset/minilm-uncased-squad2"
                    or model == "deepset/electra-base-squad2"
                ):
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                        11,
                        11,
                    ], f"Processing labels for {model} has changed."
                # xlm-roberta
                if model == "deepset/xlm-roberta-large-squad2":
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                        12,
                        12,
                    ], f"Processing labels for {model} has changed."


@pytest.mark.integration
def test_dataset_from_dicts_auto_determine_max_answers(samples_path, caplog=None):
    """
    SquadProcessor should determine the number of answers for the pytorch dataset based on the maximum
    number of answers for each question. vanilla.json has one question with two answers, so the number
    of answers should be two.
    """
    model = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

    dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
    dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
    assert len(dataset[0][tensor_names.index("labels")]) == 2

    # check that max_answers is adjusted when processing a different dataset with the same SquadProcessor
    dicts_more_answers = copy.deepcopy(dicts)
    dicts_more_answers[0]["qas"][0]["answers"] = dicts_more_answers[0]["qas"][0]["answers"] * 3
    dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts_more_answers, indices=[1])
    assert len(dataset[0][tensor_names.index("labels")]) == 6


@pytest.mark.integration
def test_dataset_from_dicts_truncate_max_answers(samples_path, caplog=None):
    """
    Test that it is possible to manually set the number of answers, truncating the answers in the data.
    """
    model = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=1)

    dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
    dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
    assert len(dataset[0][tensor_names.index("labels")]) == 1
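

# NOTE (assumption): the tests above rely on a `samples_path` fixture, typically provided by the test
# suite's conftest.py, that points at the directory containing the qa/*.json sample files. The commented
# sketch below shows one way such a fixture could be defined; the actual location of the sample files is
# project-specific and is not defined in this module.
#
#     from pathlib import Path
#
#     @pytest.fixture
#     def samples_path() -> Path:
#         return Path(__file__).parent / "samples"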