haystack/test/modeling/test_processor.py

import copy
import logging
from pathlib import Path

import pytest
from transformers import AutoTokenizer

from haystack.modeling.data_handler.processor import SquadProcessor, _is_json
import contextlib


# during inference (parameter return_baskets = False) we do not convert labels
def test_dataset_from_dicts_qa_inference(samples_path, caplog=None):
    """
    Compare SquadProcessor output for several QA tokenizers against pinned reference values.

    For each model / sample file combination the test checks the returned tensor names, that no
    sample is reported as problematic, the external and internal id of the first basket, the
    number of passage and question tokens of its first sample and the leading input ids of that
    sample's first feature.

    :param samples_path: Directory containing the ``qa/<sample_type>.json`` sample files.
    :param caplog: Optional log-capture object; if truthy, its level is set to CRITICAL.
        NOTE(review): because this parameter has a default value, pytest presumably does not
        inject the ``caplog`` fixture here - confirm.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    expected_tensor_names = [
        "input_ids",
        "padding_mask",
        "segment_ids",
        "passage_start_t",
        "start_of_word",
        "labels",
        "id",
        "seq_2_start_t",
        "span_mask",
    ]

    # minilm and electra have same vocab + tokenizer, so they share one set of reference values
    minilm_electra_reference = {
        "passage_tokens": 5,
        "question_tokens": 7,
        "noanswer_ids": [101, 2129, 2116, 2111, 2444, 1999, 3000, 1029, 102, 4068],
        "default_ids": [101, 2129, 2116, 2111, 2444, 1999, 4068, 1029, 102, 4068],
    }

    # Reference values per model: token counts plus the leading input ids, once for the
    # "noanswer" sample file and once for every other sample file.
    references = {
        "deepset/roberta-base-squad2": {
            "passage_tokens": 6,
            "question_tokens": 7,
            "noanswer_ids": [0, 6179, 171, 82, 697, 11, 2201, 116, 2, 2, 26795, 2614, 34],
            "default_ids": [0, 6179, 171, 82, 697, 11, 5459, 116, 2, 2, 26795, 2614, 34],
        },
        "deepset/bert-base-cased-squad2": {
            "passage_tokens": 5,
            "question_tokens": 7,
            "noanswer_ids": [101, 1731, 1242, 1234, 1686, 1107, 2123, 136, 102, 3206],
            "default_ids": [101, 1731, 1242, 1234, 1686, 1107, 3206, 136, 102, 3206],
        },
        "deepset/xlm-roberta-large-squad2": {
            "passage_tokens": 7,
            "question_tokens": 7,
            "noanswer_ids": [0, 11249, 5941, 3395, 6867, 23, 7270, 32, 2, 2, 10271, 1556],
            "default_ids": [0, 11249, 5941, 3395, 6867, 23, 10271, 32, 2, 2, 10271, 1556],
        },
        "deepset/minilm-uncased-squad2": minilm_electra_reference,
        "deepset/electra-base-squad2": minilm_electra_reference,
    }

    for model, reference in references.items():
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)
        model_msg = f"Processing for {model} has changed."

        for sample_type in ("answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"):
            sample_file = samples_path / "qa" / f"{sample_type}.json"
            dicts = processor.file_to_dicts(sample_file)
            dataset, tensor_names, problematic_sample_ids, baskets = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=True
            )

            assert tensor_names == expected_tensor_names, model_msg
            assert len(problematic_sample_ids) == 0, model_msg

            first_basket = baskets[0]
            assert first_basket.id_external == "5ad3d560604f3c001a3ff2c8", model_msg
            assert first_basket.id_internal == "1-0", model_msg

            first_sample = first_basket.samples[0]
            assert len(first_sample.tokenized["passage_tokens"]) == reference["passage_tokens"], model_msg
            assert len(first_sample.tokenized["question_tokens"]) == reference["question_tokens"], model_msg

            # Only the "noanswer" sample file has its own reference ids
            if sample_type == "noanswer":
                reference_ids = reference["noanswer_ids"]
            else:
                reference_ids = reference["default_ids"]
            leading_ids = first_sample.features[0]["input_ids"][: len(reference_ids)]
            assert (
                leading_ids == reference_ids
            ), f"Processing for {model} and {sample_type}-testsample has changed."


def test_batch_encoding_flatten_rename():
    """
    Exercise flatten_rename on a tokenized batch of three sentences.

    Covers renaming keys, the default call without keys, empty key lists, a missing batch
    encoding and key / renamed-key lists of different sizes.
    """
    from haystack.modeling.data_handler.dataset import flatten_rename

    def check_features(features, expected_keys):
        # One feature dict per input sentence, each carrying every expected key
        assert len(features) == 3, "should have three elements in the feature dict list"
        for feature in features:
            for key in expected_keys:
                assert key in feature, f"feature dict list item {feature} in a list should have a key {key}"

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch_sentences = [
        "Hello I'm a single sentence",
        "And another sentence",
        "And the very very last one",
    ]
    encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)

    original_keys = ["input_ids", "token_type_ids", "attention_mask"]
    renamed_keys = ["input_ids", "segment_ids", "padding_mask"]

    # explicit keys together with their new names
    check_features(flatten_rename(encoded_inputs, original_keys, renamed_keys), renamed_keys)

    # neither keys nor rename keys given
    check_features(flatten_rename(encoded_inputs), original_keys)

    # the remaining calls only have to complete; their results are not inspected
    flatten_rename(encoded_inputs, [])  # empty input keys
    flatten_rename(encoded_inputs, [], [])  # empty keys and rename keys
    flatten_rename(None, [], [])  # no encoding_batch provided

    # keys and renamed_keys have different sizes: an AssertionError is tolerated but not required
    with contextlib.suppress(AssertionError):
        flatten_rename(encoded_inputs, [], ["blah"])


def test_dataset_from_dicts_qa_label_conversion(samples_path, caplog=None):
    """
    Check the "labels" tensor SquadProcessor produces for several QA tokenizers.

    The two "wrong answer" sample files must each yield exactly one problematic sample id.
    For the "noanswer" and "vanilla" sample files, selected rows of the labels tensor are
    compared against pinned reference values.

    :param samples_path: Directory containing the ``qa/<sample_type>.json`` sample files.
    :param caplog: Optional log-capture object; if truthy, its level is set to CRITICAL.
        NOTE(review): because this parameter has a default value, pytest presumably does not
        inject the ``caplog`` fixture here - confirm.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    # Reference rows of the labels tensor, keyed by the index along its second axis.
    # "noanswer" uses the same rows for every model; "vanilla" rows depend on the tokenizer.
    noanswer_rows = {0: [0, 0], 1: [-1, -1]}
    vanilla_rows_by_model = {
        "deepset/roberta-base-squad2": {0: [13, 13], 1: [13, 14]},
        "deepset/bert-base-cased-squad2": {0: [11, 11]},
        "deepset/xlm-roberta-large-squad2": {0: [12, 12]},
        "deepset/minilm-uncased-squad2": {0: [11, 11]},
        "deepset/electra-base-squad2": {0: [11, 11]},
    }

    for model, vanilla_rows in vanilla_rows_by_model.items():
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=6)
        failure_msg = f"Processing labels for {model} has changed."

        for sample_type in ("answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"):
            sample_file = samples_path / "qa" / f"{sample_type}.json"
            dicts = processor.file_to_dicts(sample_file)
            dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=False
            )

            if sample_type in ("answer-wrong", "answer-offset-wrong"):
                assert len(problematic_sample_ids) == 1, failure_msg
                rows_to_check = {}
            elif sample_type == "noanswer":
                rows_to_check = noanswer_rows
            else:
                rows_to_check = vanilla_rows

            # The labels tensor is only read when there is a reference row to compare against
            for row_idx, reference in rows_to_check.items():
                labels = dataset.tensors[tensor_names.index("labels")].numpy()
                assert list(labels[0, row_idx, :]) == reference, failure_msg


@pytest.mark.unit
def test_is_json_identifies_json_objects():
    """Check which kinds of objects _is_json accepts and which it rejects"""
    # A Path pointing to a json file should be considered json
    json_file_path = Path("processor_config.json")
    assert _is_json(json_file_path)

    # A plain dict should be considered json
    plain_dict = {"a": 1}
    assert _is_json(plain_dict)

    # A non-serializable object (here: a class) should not be considered json
    non_serializable = AutoTokenizer
    assert not _is_json(non_serializable)


@pytest.mark.integration
def test_dataset_from_dicts_auto_determine_max_answers(samples_path, caplog=None):
    """
    SquadProcessor should determine the number of answers for the pytorch dataset based on
    the maximum number of answers for each question. Vanilla.json has one question with two answers,
    so the number of answers should be two. Tripling the answers of the first question and
    processing again with the same SquadProcessor should raise that number to six.

    :param samples_path: Directory containing the ``qa/vanilla.json`` sample file.
    :param caplog: Accepted but not used by this test.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="deepset/roberta-base-squad2")
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

    def count_label_rows(input_dicts):
        # Length of the "labels" entry of the first dataset item
        dataset, tensor_names, _problematic_ids = processor.dataset_from_dicts(input_dicts, indices=[1])
        labels_position = tensor_names.index("labels")
        return len(dataset[0][labels_position])

    dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
    assert count_label_rows(dicts) == 2

    # check that a max_answers will be adjusted when processing a different dataset with the same SquadProcessor
    dicts_more_answers = copy.deepcopy(dicts)
    first_question = dicts_more_answers[0]["qas"][0]
    first_question["answers"] = first_question["answers"] * 3
    assert count_label_rows(dicts_more_answers) == 6


@pytest.mark.integration
def test_dataset_from_dicts_truncate_max_answers(samples_path, caplog=None):
    """
    Test that the number of answers can be set manually via max_answers, truncating the answers in the data.

    :param samples_path: Directory containing the ``qa/vanilla.json`` sample file.
    :param caplog: Accepted but not used by this test.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="deepset/roberta-base-squad2")
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=1)

    vanilla_file = samples_path / "qa" / "vanilla.json"
    dicts = processor.file_to_dicts(vanilla_file)
    dataset, tensor_names, _problematic_ids = processor.dataset_from_dicts(dicts, indices=[1])

    # With max_answers=1 the "labels" entry of the first item holds a single answer
    labels_position = tensor_names.index("labels")
    first_item_labels = dataset[0][labels_position]
    assert len(first_item_labels) == 1