import contextlib
import copy
import logging
from pathlib import Path

import pytest
from transformers import AutoTokenizer

from haystack.modeling.data_handler.processor import SquadProcessor, _is_json


# during inference (parameter return_baskets = True) we do not convert labels
def test_dataset_from_dicts_qa_inference(samples_path, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]

    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids, baskets = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=True
            )
            assert tensor_names == [
                "input_ids", "padding_mask", "segment_ids", "passage_start_t", "start_of_word",
                "labels", "id", "seq_2_start_t", "span_mask",
            ], f"Processing for {model} has changed."
            assert len(problematic_sample_ids) == 0, f"Processing for {model} has changed."
            assert baskets[0].id_external == "5ad3d560604f3c001a3ff2c8", f"Processing for {model} has changed."
            assert baskets[0].id_internal == "1-0", f"Processing for {model} has changed."

            # roberta
            if model == "deepset/roberta-base-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 6
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
                        0, 6179, 171, 82, 697, 11, 2201, 116, 2, 2, 26795, 2614, 34
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
                        0, 6179, 171, 82, 697, 11, 5459, 116, 2, 2, 26795, 2614, 34
                    ], f"Processing for {model} and {sample_type}-testsample has changed."

            # bert
            if model == "deepset/bert-base-cased-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101, 1731, 1242, 1234, 1686, 1107, 2123, 136, 102, 3206
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101, 1731, 1242, 1234, 1686, 1107, 3206, 136, 102, 3206
                    ], f"Processing for {model} and {sample_type}-testsample has changed."

            # xlm-roberta
            if model == "deepset/xlm-roberta-large-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 7
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
                        0, 11249, 5941, 3395, 6867, 23, 7270, 32, 2, 2, 10271, 1556
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
                        0, 11249, 5941, 3395, 6867, 23, 10271, 32, 2, 2, 10271, 1556
                    ], f"Processing for {model} and {sample_type}-testsample has changed."

            # minilm and electra have the same vocab + tokenizer
            if model == "deepset/minilm-uncased-squad2" or model == "deepset/electra-base-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101, 2129, 2116, 2111, 2444, 1999, 3000, 1029, 102, 4068
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101, 2129, 2116, 2111, 2444, 1999, 4068, 1029, 102, 4068
                    ], f"Processing for {model} and {sample_type}-testsample has changed."


def test_batch_encoding_flatten_rename():
    from haystack.modeling.data_handler.dataset import flatten_rename

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
    encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)

    keys = ["input_ids", "token_type_ids", "attention_mask"]
    rename_keys = ["input_ids", "segment_ids", "padding_mask"]
    features_flat = flatten_rename(encoded_inputs, keys, rename_keys)
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in rename_keys:
            assert k in e, f"feature dict list item {e} in a list should have a key {k}"

    # no keys/rename_keys provided: original keys should be kept
    features_flat = flatten_rename(encoded_inputs)
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in keys:
            assert k in e, f"feature dict list item {e} in a list should have a key {k}"

    # empty input keys
    flatten_rename(encoded_inputs, [])

    # empty keys and rename keys
    flatten_rename(encoded_inputs, [], [])

    # no encoding_batch provided
    flatten_rename(None, [], [])

    # keys and rename_keys of different sizes should raise an AssertionError
    with contextlib.suppress(AssertionError):
        flatten_rename(encoded_inputs, [], ["blah"])


def test_dataset_from_dicts_qa_label_conversion(samples_path, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]

    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=6)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=False
            )
            if sample_type == "answer-wrong" or sample_type == "answer-offset-wrong":
                assert len(problematic_sample_ids) == 1, f"Processing labels for {model} has changed."

            if sample_type == "noanswer":
                assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                    0, 0
                ], f"Processing labels for {model} has changed."
                assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
                    -1, -1
                ], f"Processing labels for {model} has changed."

            if sample_type == "vanilla":
                # roberta
                if model == "deepset/roberta-base-squad2":
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                        13, 13
                    ], f"Processing labels for {model} has changed."
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
                        13, 14
                    ], f"Processing labels for {model} has changed."
                # bert, minilm, electra
                if (
                    model == "deepset/bert-base-cased-squad2"
                    or model == "deepset/minilm-uncased-squad2"
                    or model == "deepset/electra-base-squad2"
                ):
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                        11, 11
                    ], f"Processing labels for {model} has changed."
                # xlm-roberta
                if model == "deepset/xlm-roberta-large-squad2":
                    assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
                        12, 12
                    ], f"Processing labels for {model} has changed."


@pytest.mark.unit
def test_is_json_identifies_json_objects():
    """Test that _is_json correctly identifies JSON files and JSON-serializable objects"""
    # Paths to json files should be considered json
    assert _is_json(Path("processor_config.json"))
    # dicts should be considered json
    assert _is_json({"a": 1})
    # non-serializable objects should not be considered json
    assert not _is_json(AutoTokenizer)


@pytest.mark.integration
def test_dataset_from_dicts_auto_determine_max_answers(samples_path, caplog=None):
    """
    SquadProcessor should determine the number of answers for the PyTorch dataset based on the maximum
    number of answers per question. vanilla.json has one question with two answers, so the number of
    answers should be two.
    """
    model = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

    dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
    dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
    assert len(dataset[0][tensor_names.index("labels")]) == 2

    # check that max_answers is adjusted when processing a different dataset with the same SquadProcessor
    dicts_more_answers = copy.deepcopy(dicts)
    dicts_more_answers[0]["qas"][0]["answers"] = dicts_more_answers[0]["qas"][0]["answers"] * 3
    dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts_more_answers, indices=[1])
    assert len(dataset[0][tensor_names.index("labels")]) == 6


@pytest.mark.integration
def test_dataset_from_dicts_truncate_max_answers(samples_path, caplog=None):
    """
    Test that it is possible to manually set the number of answers, truncating the answers in the data.
    """
    model = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=1)

    dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
    dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
    assert len(dataset[0][tensor_names.index("labels")]) == 1
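

# For reference, the tests above index into the dicts returned by file_to_dicts
# (e.g. dicts_more_answers[0]["qas"][0]["answers"]). A minimal sketch of that SQuAD-style paragraph
# layout is given below; the context, question, and answer values are illustrative placeholders and
# are not taken from the sample files under samples_path / "qa".
_EXAMPLE_SQUAD_PARAGRAPH = {
    "context": "Berlin has about 3.7 million inhabitants.",  # passage the question refers to
    "qas": [
        {
            "id": "5ad3d560604f3c001a3ff2c8",  # external question id (compare baskets[0].id_external above)
            "question": "How many people live in Berlin?",
            "is_impossible": False,
            "answers": [{"text": "3.7 million", "answer_start": 17}],  # character offset into the context
        }
    ],
}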