mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 09:49:48 +00:00 
			
		
		
		
	 bf6d306d68
			
		
	
	
		bf6d306d68
		
			
		
	
	
	
	
		
			
			* ci: Simplify Python code with ruff rules SIM * Revert #5828 * ruff --select=I --fix haystack/modeling/infer.py --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
		
			
				
	
	
		
			345 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			345 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import copy
 | |
| import logging
 | |
| from pathlib import Path
 | |
| 
 | |
| import pytest
 | |
| from transformers import AutoTokenizer
 | |
| 
 | |
| from haystack.modeling.data_handler.processor import SquadProcessor, _is_json
 | |
| import contextlib
 | |
| 
 | |
| 
 | |
| # during inference (parameter return_baskets = True) we do not convert labels
 | |
def test_dataset_from_dicts_qa_inference(samples_path, caplog=None):
    """
    Check what SquadProcessor.dataset_from_dicts produces for the QA sample files when baskets are returned.

    For every model / sample-file pair this verifies the returned tensor names, that no sample is
    reported as problematic, the ids of the first basket, the passage and question token counts of the
    first sample and the leading input ids of its first feature.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    expected_tensor_names = [
        "input_ids",
        "padding_mask",
        "segment_ids",
        "passage_start_t",
        "start_of_word",
        "labels",
        "id",
        "seq_2_start_t",
        "span_mask",
    ]

    # minilm and electra have same vocab + tokenizer, so they share one set of expectations
    uncased_expectation = (
        5,
        7,
        [101, 2129, 2116, 2111, 2444, 1999, 3000, 1029, 102, 4068],
        [101, 2129, 2116, 2111, 2444, 1999, 4068, 1029, 102, 4068],
    )

    # model name -> (passage token count, question token count,
    #                leading input ids for the "noanswer" sample, leading input ids for every other sample)
    expectations = {
        "deepset/roberta-base-squad2": (
            6,
            7,
            [0, 6179, 171, 82, 697, 11, 2201, 116, 2, 2, 26795, 2614, 34],
            [0, 6179, 171, 82, 697, 11, 5459, 116, 2, 2, 26795, 2614, 34],
        ),
        "deepset/bert-base-cased-squad2": (
            5,
            7,
            [101, 1731, 1242, 1234, 1686, 1107, 2123, 136, 102, 3206],
            [101, 1731, 1242, 1234, 1686, 1107, 3206, 136, 102, 3206],
        ),
        "deepset/xlm-roberta-large-squad2": (
            7,
            7,
            [0, 11249, 5941, 3395, 6867, 23, 7270, 32, 2, 2, 10271, 1556],
            [0, 11249, 5941, 3395, 6867, 23, 10271, 32, 2, 2, 10271, 1556],
        ),
        "deepset/minilm-uncased-squad2": uncased_expectation,
        "deepset/electra-base-squad2": uncased_expectation,
    }
    sample_types = ("answer-wrong", "answer-offset-wrong", "noanswer", "vanilla")
    qa_dir = samples_path / "qa"

    for model, (n_passage_tokens, n_question_tokens, noanswer_ids, other_ids) in expectations.items():
        changed_msg = f"Processing for {model} has changed."
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(qa_dir / f"{sample_type}.json")
            _, tensor_names, problematic_sample_ids, baskets = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=True
            )

            # checks that do not depend on the tokenizer
            assert tensor_names == expected_tensor_names, changed_msg
            assert len(problematic_sample_ids) == 0, changed_msg
            first_basket = baskets[0]
            assert first_basket.id_external == "5ad3d560604f3c001a3ff2c8", changed_msg
            assert first_basket.id_internal == "1-0", changed_msg

            # tokenizer-specific checks
            first_sample = first_basket.samples[0]
            assert len(first_sample.tokenized["passage_tokens"]) == n_passage_tokens, changed_msg
            assert len(first_sample.tokenized["question_tokens"]) == n_question_tokens, changed_msg

            # only the "noanswer" sample file has its own expected ids
            expected_ids = noanswer_ids if sample_type == "noanswer" else other_ids
            assert (
                first_sample.features[0]["input_ids"][: len(expected_ids)] == expected_ids
            ), f"Processing for {model} and {sample_type}-testsample has changed."
 | |
| 
 | |
| 
 | |
def test_batch_encoding_flatten_rename():
    """
    Exercise flatten_rename on a padded three-sentence batch encoded with the bert-base-uncased tokenizer.

    Covers renaming keys, calling without keys, empty key lists, a missing batch, and key lists of
    different lengths.
    """
    from haystack.modeling.data_handler.dataset import flatten_rename

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
    encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)

    keys = ["input_ids", "token_type_ids", "attention_mask"]
    rename_keys = ["input_ids", "segment_ids", "padding_mask"]
    features_flat = flatten_rename(encoded_inputs, keys, rename_keys)

    # one feature dict per input sentence, exposing the renamed keys
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in rename_keys:
            assert k in e, f"feature dict list item {e} in a list should have a key {k}"

    # rename no keys/rename keys
    features_flat = flatten_rename(encoded_inputs)
    assert len(features_flat) == 3, "should have three elements in the feature dict list"
    for e in features_flat:
        for k in keys:
            assert k in e, f"feature dict list item {e} in a list should have a key {k}"

    # The next three calls are smoke checks only: they must not raise.
    # empty input keys
    flatten_rename(encoded_inputs, [])

    # empty keys and rename keys
    flatten_rename(encoded_inputs, [], [])

    # no encoding_batch provided
    flatten_rename(None, [], [])

    # keys and renamed_keys have different sizes
    # pytest.raises makes the test fail when no AssertionError is raised; the previous
    # contextlib.suppress(AssertionError) passed either way and therefore verified nothing.
    # NOTE(review): assumes flatten_rename rejects mismatched lengths with AssertionError - confirm.
    with pytest.raises(AssertionError):
        flatten_rename(encoded_inputs, [], ["blah"])
 | |
| 
 | |
| 
 | |
def test_dataset_from_dicts_qa_label_conversion(samples_path, caplog=None):
    """
    Check the "labels" tensor SquadProcessor.dataset_from_dicts produces when baskets are not returned.

    The two broken sample files must each yield exactly one problematic sample id, while the
    "noanswer" and "vanilla" files are compared against fixed label values per model.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    def label_row(dataset, tensor_names, answer_idx):
        # Row `answer_idx` of the first dataset entry's "labels" tensor, as a plain list.
        labels = dataset.tensors[tensor_names.index("labels")].numpy()
        return list(labels[0, answer_idx, :])

    # expected first label row for the "vanilla" sample, per model
    vanilla_first_row = {
        "deepset/roberta-base-squad2": [13, 13],
        "deepset/bert-base-cased-squad2": [11, 11],
        "deepset/xlm-roberta-large-squad2": [12, 12],
        "deepset/minilm-uncased-squad2": [11, 11],
        "deepset/electra-base-squad2": [11, 11],
    }
    sample_types = ("answer-wrong", "answer-offset-wrong", "noanswer", "vanilla")
    qa_dir = samples_path / "qa"

    for model, expected_first_row in vanilla_first_row.items():
        changed_msg = f"Processing labels for {model} has changed."
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
        processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=6)

        for sample_type in sample_types:
            dicts = processor.file_to_dicts(qa_dir / f"{sample_type}.json")
            dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(
                dicts, indices=[1], return_baskets=False
            )

            # The label tensor is only read in the branches that assert on it.
            if sample_type in ("answer-wrong", "answer-offset-wrong"):
                assert len(problematic_sample_ids) == 1, changed_msg
            elif sample_type == "noanswer":
                assert label_row(dataset, tensor_names, 0) == [0, 0], changed_msg
                assert label_row(dataset, tensor_names, 1) == [-1, -1], changed_msg
            elif sample_type == "vanilla":
                assert label_row(dataset, tensor_names, 0) == expected_first_row, changed_msg
                # a second label row is only pinned for roberta
                if model == "deepset/roberta-base-squad2":
                    assert label_row(dataset, tensor_names, 1) == [13, 14], changed_msg
 | |
| 
 | |
| 
 | |
@pytest.mark.unit
def test_is_json_identifies_json_objects():
    """Check which kinds of objects _is_json accepts and which it rejects."""
    # a Path pointing at a .json file counts as json
    json_file = Path("processor_config.json")
    assert _is_json(json_file)

    # so does a plain dict
    plain_dict = {"a": 1}
    assert _is_json(plain_dict)

    # an object that cannot be serialized (here: a class) does not
    assert not _is_json(AutoTokenizer)
 | |
| 
 | |
| 
 | |
@pytest.mark.integration
def test_dataset_from_dicts_auto_determine_max_answers(samples_path, caplog=None):
    """
    Without an explicit max_answers, SquadProcessor should size the labels of the pytorch dataset by
    the largest number of answers any question has. Vanilla.json has one question with two answers,
    so two labels are expected; after tripling that question's answers, six are expected.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="deepset/roberta-base-squad2")
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None)
    dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")

    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts, indices=[1])
    first_entry_labels = dataset[0][tensor_names.index("labels")]
    assert len(first_entry_labels) == 2

    # check that a max_answers will be adjusted when processing a different dataset with the same SquadProcessor
    dicts_more_answers = copy.deepcopy(dicts)
    first_question = dicts_more_answers[0]["qas"][0]
    first_question["answers"] = first_question["answers"] * 3

    dataset, tensor_names, _ = processor.dataset_from_dicts(dicts_more_answers, indices=[1])
    first_entry_labels = dataset[0][tensor_names.index("labels")]
    assert len(first_entry_labels) == 6
 | |
| 
 | |
| 
 | |
@pytest.mark.integration
def test_dataset_from_dicts_truncate_max_answers(samples_path, caplog=None):
    """
    Check that max_answers can be set manually: with max_answers=1 the first dataset entry built from
    vanilla.json carries a single label, i.e. the answers in the data are truncated.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="deepset/roberta-base-squad2")
    processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None, max_answers=1)
    vanilla_dicts = processor.file_to_dicts(samples_path / "qa" / "vanilla.json")

    dataset, tensor_names, _ = processor.dataset_from_dicts(vanilla_dicts, indices=[1])
    first_entry_labels = dataset[0][tensor_names.index("labels")]
    assert len(first_entry_labels) == 1
 |