haystack/test/modeling/test_processor.py


import copy
import logging
from pathlib import Path
import pytest
from transformers import AutoTokenizer
from haystack.modeling.data_handler.processor import SquadProcessor, _is_json
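
# SquadProcessor.dataset_from_dicts is exercised in two modes in these tests:
#   return_baskets=True  -> (dataset, tensor_names, problematic_sample_ids, baskets)
#   return_baskets=False -> (dataset, tensor_names, problematic_sample_ids)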
# during inference (parameter return_baskets = False) we do not convert labels
def test_dataset_from_dicts_qa_inference(samples_path, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]

    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids, baskets = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=True
            )

            assert tensor_names == [
                "input_ids",
                "padding_mask",
                "segment_ids",
                "passage_start_t",
                "start_of_word",
                "labels",
                "id",
                "seq_2_start_t",
                "span_mask",
            ], f"Processing for {model} has changed."
            assert len(problematic_sample_ids) == 0, f"Processing for {model} has changed."
            assert baskets[0].id_external == "5ad3d560604f3c001a3ff2c8", f"Processing for {model} has changed."
            assert baskets[0].id_internal == "1-0", f"Processing for {model} has changed."
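
            # The per-model blocks below pin tokenizer-specific token counts and
            # input_ids prefixes as regression guards against tokenizer changes.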
            # roberta
            if model == "deepset/roberta-base-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 6
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
                        0,
                        6179,
                        171,
                        82,
                        697,
                        11,
                        2201,
                        116,
                        2,
                        2,
                        26795,
                        2614,
                        34,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:13] == [
                        0,
                        6179,
                        171,
                        82,
                        697,
                        11,
                        5459,
                        116,
                        2,
                        2,
                        26795,
                        2614,
                        34,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
            # bert
            if model == "deepset/bert-base-cased-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        1731,
                        1242,
                        1234,
                        1686,
                        1107,
                        2123,
                        136,
                        102,
                        3206,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        1731,
                        1242,
                        1234,
                        1686,
                        1107,
                        3206,
                        136,
                        102,
                        3206,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
            # xlm-roberta
            if model == "deepset/xlm-roberta-large-squad2":
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 7
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
                        0,
                        11249,
                        5941,
                        3395,
                        6867,
                        23,
                        7270,
                        32,
                        2,
                        2,
                        10271,
                        1556,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:12] == [
                        0,
                        11249,
                        5941,
                        3395,
                        6867,
                        23,
                        10271,
                        32,
                        2,
                        2,
                        10271,
                        1556,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
            # minilm and electra share the same vocab + tokenizer
            if model in ("deepset/minilm-uncased-squad2", "deepset/electra-base-squad2"):
                assert (
                    len(baskets[0].samples[0].tokenized["passage_tokens"]) == 5
                ), f"Processing for {model} has changed."
                assert (
                    len(baskets[0].samples[0].tokenized["question_tokens"]) == 7
                ), f"Processing for {model} has changed."
                if sample_type == "noanswer":
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        2129,
                        2116,
                        2111,
                        2444,
                        1999,
                        3000,
                        1029,
                        102,
                        4068,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."
                else:
                    assert baskets[0].samples[0].features[0]["input_ids"][:10] == [
                        101,
                        2129,
                        2116,
                        2111,
                        2444,
                        1999,
                        4068,
                        1029,
                        102,
                        4068,
                    ], f"Processing for {model} and {sample_type}-testsample has changed."


def test_batch_encoding_flatten_rename():
    from haystack.modeling.data_handler.dataset import flatten_rename

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
    encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)
    keys = ["input_ids", "token_type_ids", "attention_mask"]
    rename_keys = ["input_ids", "segment_ids", "padding_mask"]
    features_flat = flatten_rename(encoded_inputs, keys, rename_keys)
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in rename_keys:
            assert k in e, f"feature dict {e} should have key {k}"

    # no keys/rename_keys provided: the original keys are kept
    features_flat = flatten_rename(encoded_inputs)
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in keys:
            assert k in e, f"feature dict {e} should have key {k}"

    # empty input keys
    flatten_rename(encoded_inputs, [])
    # empty keys and rename keys
    flatten_rename(encoded_inputs, [], [])
    # no encoding_batch provided
    flatten_rename(None, [], [])
    # keys and rename_keys of different sizes should raise
    with pytest.raises(AssertionError):
        flatten_rename(encoded_inputs, [], ["blah"])


def test_dataset_from_dicts_qa_label_conversion(samples_path, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    models = [
        "deepset/roberta-base-squad2",
        "deepset/bert-base-cased-squad2",
        "deepset/xlm-roberta-large-squad2",
        "deepset/minilm-uncased-squad2",
        "deepset/electra-base-squad2",
    ]
    sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"]

    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=6)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(samples_path / "qa" / f"{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=False
            )
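
            # The "labels" tensor holds (start, end) token indices per answer slot;
            # unused slots are padded with (-1, -1), and "no answer" is encoded as
            # (0, 0), i.e. both indices pointing at the CLS token.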
if sample_type == "answer-wrong" or sample_type == "answer-offset-wrong":
assert len(problematic_sample_ids) == 1, f"Processing labels for {model} has changed."
if sample_type == "noanswer":
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
0,
0,
], f"Processing labels for {model} has changed."
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
-1,
-1,
], f"Processing labels for {model} has changed."
if sample_type == "vanilla":
# roberta
if model == "deepset/roberta-base-squad2":
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
13,
13,
], f"Processing labels for {model} has changed."
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 1, :]) == [
13,
14,
], f"Processing labels for {model} has changed."
# bert, minilm, electra
if (
model == "deepset/bert-base-cased-squad2"
or model == "deepset/minilm-uncased-squad2"
or model == "deepset/electra-base-squad2"
):
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
11,
11,
], f"Processing labels for {model} has changed."
# xlm-roberta
if model == "deepset/xlm-roberta-large-squad2":
assert list(dataset.tensors[tensor_names.index("labels")].numpy()[0, 0, :]) == [
12,
12,
], f"Processing labels for {model} has changed."


@pytest.mark.unit
def test_is_json_identifies_json_objects():
    """Test that _is_json correctly identifies JSON-serializable objects."""
    # paths to .json files should be considered json
    assert _is_json(Path("processor_config.json"))
    # dicts should be considered json
    assert _is_json({"a": 1})
    # non-serializable objects should not be considered json
    assert not _is_json(AutoTokenizer)


@pytest.mark.integration
def test_dataset_from_dicts_auto_determine_max_answers(samples_path, caplog=None):
    """
    SquadProcessor should determine the number of answers for the pytorch dataset based on
    the maximum number of answers per question. vanilla.json has one question with two answers,
    so the number of answers should be two.
    """
model = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)
dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
assert len(dataset[0][tensor_names.index("labels")]) == 2
# check that a max_answers will be adjusted when processing a different dataset with the same SquadProcessor
dicts_more_answers = copy.deepcopy(dicts)
dicts_more_answers[0]["qas"][0]["answers"] = dicts_more_answers[0]["qas"][0]["answers"] * 3
dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts_more_answers, indices=[1])
assert len(dataset[0][tensor_names.index("labels")]) == 6


@pytest.mark.integration
def test_dataset_from_dicts_truncate_max_answers(samples_path, caplog=None):
    """
    Test that it is possible to manually set the number of answers, truncating the answers in the data.
    """
    model = "deepset/roberta-base-squad2"
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=1)

    dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")
    dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts, indices=[1])
    assert len(dataset[0][tensor_names.index("labels")]) == 1