import copy
import logging
from pathlib import Path
import pytest
from transformers import AutoTokenizer
from haystack.modeling.data_handler.processor import SquadProcessor, _is_json
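
# The tests below feed SQuAD-style dicts through SquadProcessor. For reference, each dict
# returned by file_to_dicts roughly follows the SQuAD paragraph format (a sketch inferred
# from the fixture files and the tests below, not an exact copy of them):
#
#     {
#         "context": "Berlin has a population of ...",
#         "qas": [
#             {
#                 "id": "...",
#                 "question": "How many people live in ...?",
#                 "answers": [{"text": "...", "answer_start": 0}],
#             }
#         ],
#     }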


# during inference (parameter return_baskets=False) we do not convert labels
def test_dataset_from_dicts_qa_inference(samples_path, caplog=None):
if caplog:
caplog.set_level(logging.CRITICAL)
models = [
"deepset/roberta-base-squad2",
"deepset/bert-base-cased-squad2",
"deepset/xlm-roberta-large-squad2",
"deepset/minilm-uncased-squad2",
"deepset/electra-base-squad2",
]
sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]
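    # Meaning of the sample types (inferred from the file names and the label-conversion
    # test below): "answer-wrong" and "answer-offset-wrong" carry broken labels, "noanswer"
    # is an unanswerable question, and "vanilla" is a well-formed sample.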
for model in models:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)
for sample_type in sample_types:
dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
dataset, tensor_names, problematic_sample_ids, baskets = processor.dataset_from_dicts(
dicts, indices=[1], return_baskets=True
)
assert tensor_names == [
"input_ids",
"padding_mask",
"segment_ids",
"passage_start_t",
"start_of_word",
"labels",
"id",
"seq_2_start_t",
"span_mask",
], f"Processing for {model} has changed."
assert len(problematic_sample_ids) == 0, f"Processing for {model} has changed."
assert baskets[0].id_external == "5ad3d560604f3c001a3ff2c8", f"Processing for {model} has changed."
assert baskets[0].id_internal == "1-0", f"Processing for {model} has changed."
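            # The hard-coded id lists below pin the exact tokenization per model. The
            # samples apparently share one question over a Berlin passage, with the
            # noanswer variant asking about a city that is absent from the passage,
            # which is why only a single token id differs between the two branches.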
# roberta
if model == "deepset/roberta-base-squad2":
assert (
len(baskets[0].samples[0].tokenized["passage_tokens"]) == 6
), f"Processing for {model} has changed."
assert (
len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
), f"Processing for {model} has changed."
if sample_type == "noanswer":
assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
0,
6179,
171,
82,
697,
11,
2201,
116,
2,
2,
26795,
2614,
34,
], f"Processing for {model} and {sample_type}-testsample has changed."
else:
assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
0,
6179,
171,
82,
697,
11,
5459,
116,
2,
2,
26795,
2614,
34,
], f"Processing for {model} and {sample_type}-testsample has changed."
# bert
if model == "deepset/bert-base-cased-squad2":
assert (
len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
), f"Processing for {model} has changed."
assert (
len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
), f"Processing for {model} has changed."
if sample_type == "noanswer":
assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
101,
1731,
1242,
1234,
1686,
1107,
2123,
136,
102,
3206,
], f"Processing for {model} and {sample_type}-testsample has changed."
else:
assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
101,
1731,
1242,
1234,
1686,
1107,
3206,
136,
102,
3206,
], f"Processing for {model} and {sample_type}-testsample has changed."
# xlm-roberta
if model == "deepset/xlm-roberta-large-squad2":
assert (
len(baskets[0].samples[0].tokenized["passage_tokens"]) == 7
), f"Processing for {model} has changed."
assert (
len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
), f"Processing for {model} has changed."
if sample_type == "noanswer":
assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
0,
11249,
5941,
3395,
6867,
23,
7270,
32,
2,
2,
10271,
1556,
], f"Processing for {model} and {sample_type}-testsample has changed."
else:
assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
0,
11249,
5941,
3395,
6867,
23,
10271,
32,
2,
2,
10271,
1556,
], f"Processing for {model} and {sample_type}-testsample has changed."
            # minilm and electra share the same vocab and tokenizer
if model == "deepset/minilm-uncased-squad2" or model == "deepset/electra-base-squad2":
assert (
len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
), f"Processing for {model} has changed."
assert (
len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
), f"Processing for {model} has changed."
if sample_type == "noanswer":
assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
101,
2129,
2116,
2111,
2444,
1999,
3000,
1029,
102,
4068,
], f"Processing for {model} and {sample_type}-testsample has changed."
else:
assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
101,
2129,
2116,
2111,
2444,
1999,
4068,
1029,
102,
4068,
], f"Processing for {model} and {sample_type}-testsample has changed."


def test_batch_encoding_flatten_rename():
from haystack.modeling.data_handler.dataset import flatten_rename
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)
keys = ["input_ids", "token_type_ids", "attention_mask"]
rename_keys = ["input_ids", "segment_ids", "padding_mask"]
features_flat = flatten_rename(encoded_inputs, keys, rename_keys)
assert len(features_flat) == 3, "should have three elements in the feature dict list"
for e in features_flat:
for k in rename_keys:
            assert k in e, f"feature dict {e} should contain the key {k}"
# rename no keys/rename keys
features_flat = flatten_rename(encoded_inputs)
assert len(features_flat) == 3, "should have three elements in the feature dict list"
for e in features_flat:
for k in keys:
            assert k in e, f"feature dict {e} should contain the key {k}"
# empty input keys
flatten_rename(encoded_inputs, [])
# empty keys and rename keys
flatten_rename(encoded_inputs, [], [])
# no encoding_batch provided
flatten_rename(None, [], [])
    # keys and renamed_keys of different lengths should raise an AssertionError;
    # pytest.raises also fails the test if no error is raised at all
    with pytest.raises(AssertionError):
        flatten_rename(encoded_inputs, [], ["blah"])
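
# For reference, flatten_rename turns a BatchEncoding (a dict of per-key batch lists) into
# a list of per-sample feature dicts with renamed keys. A conceptual sketch of the expected
# transformation (illustrative only, not the implementation in
# haystack.modeling.data_handler.dataset):
#
#     encoded = {"input_ids": [[1, 2], [3, 4]], "attention_mask": [[1, 1], [1, 0]]}
#     flatten_rename(encoded, ["input_ids", "attention_mask"], ["input_ids", "padding_mask"])
#     # -> [{"input_ids": [1, 2], "padding_mask": [1, 1]},
#     #     {"input_ids": [3, 4], "padding_mask": [1, 0]}]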


def test_dataset_from_dicts_qa_label_conversion(samples_path, caplog=None):
if caplog:
caplog.set_level(logging.CRITICAL)
models = [
"deepset/roberta-base-squad2",
"deepset/bert-base-cased-squad2",
"deepset/xlm-roberta-large-squad2",
"deepset/minilm-uncased-squad2",
"deepset/electra-base-squad2",
]
sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]
for model in models:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=6)
for sample_type in sample_types:
dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(
dicts, indices=[1], return_baskets=False
)
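            # "labels" is a [batch, max_answers, 2] tensor of (start, end) token indices
            # per answer: unused answer slots are padded with (-1, -1), and no-answer is
            # encoded as (0, 0), i.e. both indices point at the leading special token
            # (conventions assumed from the assertions below).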
if sample_type == "answer-wrong" or sample_type == "answer-offset-wrong":
assert len(problematic_sample_ids) == 1, f"Processing labels for {model} has changed."
if sample_type == "noanswer":
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
0,
0,
], f"Processing labels for {model} has changed."
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
-1,
-1,
], f"Processing labels for {model} has changed."
if sample_type == "vanilla":
# roberta
if model == "deepset/roberta-base-squad2":
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
13,
13,
], f"Processing labels for {model} has changed."
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
13,
14,
], f"Processing labels for {model} has changed."
# bert, minilm, electra
if (
model == "deepset/bert-base-cased-squad2"
or model == "deepset/minilm-uncased-squad2"
or model == "deepset/electra-base-squad2"
):
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
11,
11,
], f"Processing labels for {model} has changed."
# xlm-roberta
if model == "deepset/xlm-roberta-large-squad2":
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
12,
12,
], f"Processing labels for {model} has changed."


@pytest.mark.unit
def test_is_json_identifies_json_objects():
"""Test that _is_json correctly identifies json objects"""
# Paths to json files should be considered json
assert _is_json(Path("processor_config.json"))
# dicts should be considered json
assert _is_json({"a": 1})
# non-serializable objects should not be considered json
assert not _is_json(AutoTokenizer)


@pytest.mark.integration
def test_dataset_from_dicts_auto_determine_max_answers(samples_path, caplog=None):
"""
SquadProcessor should determine the number of answers for the pytorch dataset based on
the maximum number of answers for each question. Vanilla.json has one question with two answers,
so the number of answers should be two.
"""
model = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)
dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
assert len(dataset[0][tensor_names.index("labels")]) == 2
    # check that max_answers is adjusted when the same SquadProcessor processes a dataset with more answers
dicts_more_answers = copy.deepcopy(dicts)
dicts_more_answers[0]["qas"][0]["answers"] = dicts_more_answers[0]["qas"][0]["answers"] * 3
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts_more_answers, indices=[1])
assert len(dataset[0][tensor_names.index("labels")]) == 6


@pytest.mark.integration
def test_dataset_from_dicts_truncate_max_answers(samples_path, caplog=None):
"""
Test that it is possible to manually set the number of answers, truncating the answers in the data.
"""
model = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=1)
dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
assert len(dataset[0][tensor_names.index("labels")]) == 1