# pylint: disable=missing-timeout
from typing import Optional, Dict, List, Union, Any, Iterable, Type
import os
import json
import uuid
import inspect
import logging
import random
import tarfile
import tempfile
from pathlib import Path
from inspect import signature
from abc import ABC, abstractmethod
import numpy as np
import requests
from tqdm import tqdm
from torch.utils.data import TensorDataset
import transformers
from transformers import PreTrainedTokenizer, AutoTokenizer
from haystack.modeling.model.feature_extraction import (
tokenize_batch_question_answering,
tokenize_with_metadata,
truncate_sequences,
)
from haystack.modeling.data_handler.dataset import convert_features_to_dataset
from haystack.modeling.data_handler.samples import (
Sample,
SampleBasket,
get_passage_offsets,
offset_to_token_idx_vecorized,
)
from haystack.modeling.data_handler.input_features import sample_to_features_text
from haystack.utils.experiment_tracking import Tracker as tracker
DOWNSTREAM_TASK_MAP = {
"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
"covidqa": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/covidqa.tar.gz",
}
logger = logging.getLogger(__name__)
class Processor(ABC):
"""
Base class for low level data processors to convert input text to PyTorch Datasets.
"""
subclasses: dict = {}
def __init__(
self,
tokenizer,
max_seq_len: int,
train_filename: Optional[Union[Path, str]],
dev_filename: Optional[Union[Path, str]],
test_filename: Optional[Union[Path, str]],
dev_split: float,
data_dir: Optional[Union[Path, str]],
tasks: Dict = {},
proxies: Optional[Dict] = None,
multithreading_rust: Optional[bool] = True,
):
"""
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:param train_filename: The name of the file containing training data.
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
will be a slice of the train set.
:param test_filename: The name of the file containing test data.
:param dev_split: The proportion of the train set that will be sliced off as the dev set. Only works if dev_filename is set to None
:param data_dir: The directory in which the train, test and perhaps dev files can be found.
:param tasks: Tasks for which the processor shall extract labels from the input data.
Usually this includes a single, default task, e.g. text classification.
In a multitask setting this includes multiple tasks, e.g. 2x text classification.
The task name will be used to connect with the related PredictionHead.
:param proxies: proxy configuration to allow downloads of remote datasets.
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
Note: Enabling multithreading in Rust AND multiprocessing in python might cause
deadlocks.
"""
if not multithreading_rust:
os.environ["RAYON_RS_NUM_CPUS"] = "1"
self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
self.tasks = tasks
self.proxies = proxies
# data sets
self.train_filename = train_filename
self.dev_filename = dev_filename
self.test_filename = test_filename
self.dev_split = dev_split
if data_dir:
self.data_dir = Path(data_dir)
else:
self.data_dir = None # type: ignore
self.baskets: List = []
self._log_params()
self.problematic_sample_ids: set = set()
def __init_subclass__(cls, **kwargs):
"""This automatically keeps track of all available subclasses.
Enables generic load() and load_from_dir() for all specific Processor implementation.
"""
super().__init_subclass__(**kwargs)
cls.subclasses[cls.__name__] = cls
@classmethod
def load(
cls,
processor_name: str,
data_dir: str, # TODO revert ignore
tokenizer, # type: ignore
max_seq_len: int,
train_filename: str,
dev_filename: Optional[str],
test_filename: str,
dev_split: float,
**kwargs,
):
"""
Loads the class of processor specified by processor name.
:param processor_name: The class of processor to be loaded.
:param data_dir: Directory where data files are located.
:param tokenizer: A tokenizer object
:param max_seq_len: Sequences longer than this will be truncated.
:param train_filename: The name of the file containing training data.
:param dev_filename: The name of the file containing the dev data.
If None and 0.0 < dev_split < 1.0 the dev set
will be a slice of the train set.
:param test_filename: The name of the file containing test data.
:param dev_split: The proportion of the train set that will be sliced.
Only works if dev_filename is set to None
:param kwargs: placeholder for passing generic parameters
:return: An instance of the specified processor.
"""
sig = signature(cls.subclasses[processor_name])
unused_args = {k: v for k, v in kwargs.items() if k not in sig.parameters}
logger.debug(
"Got more parameters than needed for loading %s: %s. Those won't be used!", processor_name, unused_args
)
processor = cls.subclasses[processor_name](
data_dir=data_dir,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
train_filename=train_filename,
dev_filename=dev_filename,
test_filename=test_filename,
dev_split=dev_split,
**kwargs,
)
return processor
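# Illustrative sketch (not part of the original module): loading a registered
# subclass through the registry above, assuming a fast tokenizer and local
# SQuAD-style files are available:
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
#   processor = Processor.load(
#       processor_name="SquadProcessor",
#       data_dir="data/squad20",
#       tokenizer=tokenizer,
#       max_seq_len=384,
#       train_filename="train-v2.0.json",
#       dev_filename="dev-v2.0.json",
#       test_filename=None,
#       dev_split=0.0,
#       label_list=["start_token", "end_token"],
#       metric="squad",
#   )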
@classmethod
def load_from_dir(cls, load_dir: str):
"""
Infers the specific type of Processor from a config file (e.g. SquadProcessor) and loads an instance of it.
:param load_dir: directory that contains a 'processor_config.json'
:return: An instance of a Processor Subclass (e.g. SquadProcessor)
"""
# read config
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
config["inference"] = True
# init tokenizer
if "lower_case" in config.keys():
logger.warning(
"Loading tokenizer from deprecated config. "
"If you used `custom_vocab` or `never_split_chars`, this won't work anymore."
)
tokenizer = AutoTokenizer.from_pretrained(
load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"]
)
else:
tokenizer = AutoTokenizer.from_pretrained(load_dir, tokenizer_class=config["tokenizer"])
# we have to delete the tokenizer string from config, because we pass it as Object
del config["tokenizer"]
processor = cls.load(tokenizer=tokenizer, processor_name=config["processor"], **config)
for task_name, task in config["tasks"].items():
processor.add_task(
name=task_name,
metric=task["metric"],
label_list=task["label_list"],
label_column_name=task["label_column_name"],
text_column_name=task.get("text_column_name", None),
task_type=task["task_type"],
)
if processor is None:
raise Exception
return processor
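# Illustrative sketch (assumption): load_from_dir() expects a directory that was
# previously written by save(), e.g.
#
#   processor = Processor.load_from_dir("saved_models/my_qa_model")
#
# The concrete subclass (e.g. SquadProcessor) and its tasks are restored from
# the stored processor_config.json.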
@classmethod
def convert_from_transformers(
cls,
tokenizer_name_or_path,
task_type,
max_seq_len,
doc_stride,
revision=None,
tokenizer_class=None,
tokenizer_args=None,
use_fast=True,
**kwargs,
):
tokenizer_args = tokenizer_args or {}
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name_or_path,
tokenizer_class=tokenizer_class,
use_fast=use_fast,
revision=revision,
**tokenizer_args,
**kwargs,
)
# TODO infer task_type automatically from config (if possible)
if task_type == "question_answering":
processor = SquadProcessor(
tokenizer=tokenizer,
max_seq_len=max_seq_len,
label_list=["start_token", "end_token"],
metric="squad",
data_dir="data",
doc_stride=doc_stride,
)
elif task_type == "embeddings":
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
else:
raise ValueError(
f"`task_type` {task_type} is not supported yet. "
f"Valid options for arg `task_type`: 'question_answering', "
f"'embeddings', "
)
return processor
def save(self, save_dir: str):
"""
Saves the vocabulary to file and also creates a json file containing all the
information needed to load the same processor.
:param save_dir: Directory where the files are to be saved
:return: None
"""
os.makedirs(save_dir, exist_ok=True)
config = self.generate_config()
# save tokenizer incl. attributes
config["tokenizer"] = self.tokenizer.__class__.__name__
# Because the fast tokenizers expect a str and not Path
# always convert Path to str here.
self.tokenizer.save_pretrained(str(save_dir))
# save processor
config["processor"] = self.__class__.__name__
output_config_file = Path(save_dir) / "processor_config.json"
with open(output_config_file, "w") as file:
json.dump(config, file)
def generate_config(self):
"""
Generates config file from Class and instance attributes (only for sensible config parameters).
"""
config = {}
# self.__dict__ doesn't give parent class attributes
for key, value in inspect.getmembers(self):
if _is_json(value) and key[0] != "_":
if issubclass(type(value), Path):
value = str(value)
config[key] = value
return config
# TODO potentially remove tasks from code - multitask learning is not supported anyways
def add_task(
self, name, metric, label_list, label_column_name=None, label_name=None, task_type=None, text_column_name=None
):
if type(label_list) is not list:
raise ValueError(f"Argument `label_list` must be of type list. Got: {type(label_list)}")
if label_name is None:
label_name = f"{name}_label"
label_tensor_name = label_name + "_ids"
self.tasks[name] = {
"label_list": label_list,
"metric": metric,
"label_tensor_name": label_tensor_name,
"label_name": label_name,
"label_column_name": label_column_name,
"text_column_name": text_column_name,
"task_type": task_type,
}
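# Illustrative sketch (assumption): registering the default QA task manually on
# an already constructed processor, e.g. to switch the evaluation metric:
#
#   processor.add_task(
#       name="question_answering",
#       metric="top_n_accuracy",
#       label_list=["start_token", "end_token"],
#   )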
@abstractmethod
def file_to_dicts(self, file: str) -> List[dict]:
raise NotImplementedError()
@abstractmethod
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
raise NotImplementedError()
@abstractmethod
def _create_dataset(self, baskets: List[SampleBasket]):
raise NotImplementedError
@staticmethod
def log_problematic(problematic_sample_ids):
if problematic_sample_ids:
n_problematic = len(problematic_sample_ids)
problematic_id_str = ", ".join([str(i) for i in problematic_sample_ids])
logger.error(
"Unable to convert %s samples to features. Their ids are : %s", n_problematic, problematic_id_str
)
@staticmethod
def _check_sample_features(basket: SampleBasket):
"""
Check if all samples in the basket have computed their features.
:param basket: the basket containing the samples
:return: True if all samples in the basket have computed their features, False otherwise
"""
return basket.samples and not any(sample.features is None for sample in basket.samples)
def _log_samples(self, n_samples: int, baskets: List[SampleBasket]):
logger.debug("*** Show %s random examples ***", n_samples)
if len(baskets) == 0:
logger.debug("*** No samples to show because there are no baskets ***")
return
for i in range(n_samples):
random_basket = random.choice(baskets)
random_sample = random.choice(random_basket.samples) # type: ignore
logger.debug(random_sample)
def _log_params(self):
params = {"processor": self.__class__.__name__, "tokenizer": self.tokenizer.__class__.__name__}
names = ["max_seq_len", "dev_split"]
for name in names:
value = getattr(self, name)
params.update({name: str(value)})
tracker.track_params(params)
class SquadProcessor(Processor):
"""
Convert QA data (in SQuAD Format)
"""
def __init__(
self,
tokenizer, # type: ignore
max_seq_len: int,
data_dir: Optional[Union[Path, str]],
label_list: Optional[List[str]] = None,
metric="squad", # type: ignore
train_filename: Optional[Union[Path, str]] = Path("train-v2.0.json"),
dev_filename: Optional[Union[Path, str]] = Path("dev-v2.0.json"),
test_filename: Optional[Union[Path, str]] = None,
dev_split: float = 0,
doc_stride: int = 128,
max_query_length: int = 64,
proxies: Optional[dict] = None,
max_answers: int = 6,
**kwargs,
):
"""
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:param data_dir: The directory in which the train and dev files can be found.
If not available, the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
:param metric: name of metric that shall be used for evaluation, can be "squad" or "top_n_accuracy"
:param train_filename: The name of the file containing training data.
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
will be a slice of the train set.
:param test_filename: The name of the file containing test data. Defaults to None.
:param dev_split: The proportion of the train set that will be sliced. Only works if dev_filename is set to None
:param doc_stride: When the document containing the answer is too long it gets split into parts, strided by doc_stride
:param max_query_length: Maximum length of the question (in number of subword tokens)
:param proxies: proxy configuration to allow downloads of remote datasets.
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:param max_answers: number of answers to be converted. QA dev or train sets can contain multi-way annotations, which are converted to arrays of max_answers length
:param kwargs: placeholder for passing generic parameters
"""
self.ph_output_type = "per_token_squad"
# validate max_seq_len
assert max_seq_len <= tokenizer.model_max_length, (
"max_seq_len cannot be greater than the maximum sequence length handled by the model: "
f"got max_seq_len={max_seq_len}, while the model maximum length is {tokenizer.model_max_length}. "
"Please adjust max_seq_len accordingly or use another model "
)
assert doc_stride < (max_seq_len - max_query_length), (
"doc_stride ({}) is longer than max_seq_len ({}) minus space reserved for query tokens ({}). \nThis means that there will be gaps "
"as the passage windows slide, causing the model to skip over parts of the document.\n"
"Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n "
"Or decrease max_query_length".format(doc_stride, max_seq_len, max_query_length)
)
self.doc_stride = doc_stride
self.max_query_length = max_query_length
self.max_answers = max_answers
super(SquadProcessor, self).__init__(
tokenizer=tokenizer,
max_seq_len=max_seq_len,
train_filename=train_filename,
dev_filename=dev_filename,
test_filename=test_filename,
dev_split=dev_split,
data_dir=data_dir,
tasks={},
proxies=proxies,
)
self._initialize_special_tokens_count()
if metric and label_list:
self.add_task("question_answering", metric, label_list)
else:
logger.info(
"Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
"using the default task or add a custom task later via processor.add_task()"
)
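# Illustrative sketch (assumption): typical construction for extractive QA,
# assuming the referenced model/tokenizer is available:
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
#   processor = SquadProcessor(
#       tokenizer=tokenizer,
#       max_seq_len=384,
#       data_dir="data/squad20",
#       label_list=["start_token", "end_token"],
#       metric="squad",
#       doc_stride=128,
#       max_query_length=64,
#   )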
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for Question Answering.
For this we have an internal representation called "baskets".
Each basket is a question-document pair.
Each stage adds or transforms specific information to our baskets.
:param dicts: list of dicts, input dictionaries with SQuAD-style information
:param indices: list, indices used during multiprocessing so that IDs assigned to our baskets are unique
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
"""
# Convert to standard format
pre_baskets = [self.convert_qa_input_dict(x) for x in dicts] # TODO move to input object conversion
# Tokenize documents and questions
baskets = tokenize_batch_question_answering(pre_baskets, self.tokenizer, indices)
# Split documents into smaller passages to fit max_seq_len
baskets = self._split_docs_into_passages(baskets)
# Convert answers from string to token space, skip this step for inference
if not return_baskets:
baskets = self._convert_answers(baskets)
# Convert internal representation (nested baskets + samples with mixed types) to pytorch features (arrays of numbers)
baskets = self._passages_to_pytorch_features(baskets, return_baskets)
# Convert features into pytorch dataset, this step also removes potential errors during preprocessing
dataset, tensor_names, baskets = self._create_dataset(baskets)
# Logging
if indices:
if 0 in indices:
self._log_samples(n_samples=1, baskets=self.baskets)
# During inference we need to keep the information contained in baskets.
if return_baskets:
return dataset, tensor_names, self.problematic_sample_ids, baskets
else:
return dataset, tensor_names, self.problematic_sample_ids
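# Illustrative sketch (assumption): converting a single QA dict at inference
# time; with return_baskets=True the baskets needed to map predictions back to
# the original text are returned as well:
#
#   qa_dict = {
#       "text": "Berlin is the capital of Germany.",
#       "questions": ["What is the capital of Germany?"],
#   }
#   dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
#       dicts=[qa_dict], indices=[0], return_baskets=True
#   )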
def file_to_dicts(self, file: str) -> List[dict]:
nested_dicts = _read_squad_file(filename=file)
dicts = [y for x in nested_dicts for y in x["paragraphs"]]
return dicts
# TODO use Input Objects instead of this function, remove Natural Questions (NQ) related code
def convert_qa_input_dict(self, infer_dict: dict) -> Dict[str, Any]:
"""Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or
["text", "questions"] (api format). This function converts the latter into the former. It also converts the
is_impossible field to answer_type so that NQ and SQuAD dicts have the same format.
"""
# validate again max_seq_len
assert self.max_seq_len <= self.tokenizer.model_max_length, (
"max_seq_len cannot be greater than the maximum sequence length handled by the model: "
f"got max_seq_len={self.max_seq_len}, while the model maximum length is {self.tokenizer.model_max_length}. "
"Please adjust max_seq_len accordingly or use another model "
)
# check doc_stride vs max_seq_len again, since these parameters can be changed for already initialized models (e.g. in Haystack)
assert self.doc_stride < (self.max_seq_len - self.max_query_length), (
"doc_stride ({}) is longer than max_seq_len ({}) minus space reserved for query tokens ({}). \nThis means that there will be gaps "
"as the passage windows slide, causing the model to skip over parts of the document.\n"
"Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n "
"Or decrease max_query_length".format(self.doc_stride, self.max_seq_len, self.max_query_length)
)
try:
# Check if infer_dict is already in internal json format
if "context" in infer_dict and "qas" in infer_dict:
return infer_dict
# converts dicts from inference mode to data structure used in Haystack
questions = infer_dict["questions"]
text = infer_dict["text"]
uid = infer_dict.get("id", None)
qas = [{"question": q, "id": uid, "answers": [], "answer_type": None} for q in questions]
converted = {"qas": qas, "context": text}
return converted
except KeyError:
raise Exception("Input does not have the expected format")
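# Illustrative sketch (assumption): the API format
#
#   {"text": "Berlin is the capital of Germany.",
#    "questions": ["What is the capital of Germany?"], "id": "42"}
#
# is rewritten into the internal format
#
#   {"context": "Berlin is the capital of Germany.",
#    "qas": [{"question": "What is the capital of Germany?", "id": "42",
#             "answers": [], "answer_type": None}]}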
def _initialize_special_tokens_count(self):
vec = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=["a"], token_ids_1=["b"])
self.sp_toks_start = vec.index("a")
self.sp_toks_mid = vec.index("b") - self.sp_toks_start - 1
self.sp_toks_end = len(vec) - vec.index("b") - 1
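# Worked example (assumption, BERT-style tokenizer): build_inputs_with_special_tokens
# on the placeholder tokens "a" and "b" returns [CLS, "a", SEP, "b", SEP], so
# sp_toks_start = 1, sp_toks_mid = 1 and sp_toks_end = 1. These counts are later
# used to shift answer offsets from passage space into the joined
# question + passage input.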
def _split_docs_into_passages(self, baskets: List[SampleBasket]):
"""
Because of the sequence length limitation of Language Models, the documents need to be divided into smaller
parts that we call passages.
"""
n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True)
for basket in baskets:
samples = []
########## perform some basic checking
# TODO, eventually move checking into input validation functions
# ignore samples with empty context
if basket.raw["document_text"] == "":
logger.warning("Ignoring sample with empty context")
continue
########## end checking
# Calculate the number of tokens that can be reserved for the passage. This is calculated by considering
# the max_seq_len, the number of tokens in the question and the number of special tokens that will be added
# when the question and passage are joined (e.g. [CLS] and [SEP])
passage_len_t = (
self.max_seq_len - len(basket.raw["question_tokens"][: self.max_query_length]) - n_special_tokens
)
# passage_spans is a list of dictionaries where each defines the start and end of each passage
# on both token and character level
try:
passage_spans = get_passage_offsets(
basket.raw["document_offsets"], self.doc_stride, passage_len_t, basket.raw["document_text"]
)
except Exception as e:
logger.warning(
f"Could not devide document into passages. Document: {basket.raw['document_text'][:200]}\n"
f"With error: {e}"
)
passage_spans = []
for passage_span in passage_spans:
# Unpack each variable in the dictionary. The "_t" and "_c" indicate
# whether the index is on the token or character level
passage_start_t = passage_span["passage_start_t"]
passage_end_t = passage_span["passage_end_t"]
passage_start_c = passage_span["passage_start_c"]
passage_end_c = passage_span["passage_end_c"]
passage_start_of_word = basket.raw["document_start_of_word"][passage_start_t:passage_end_t]
passage_tokens = basket.raw["document_tokens"][passage_start_t:passage_end_t]
passage_text = basket.raw["document_text"][passage_start_c:passage_end_c]
clear_text = {
"passage_text": passage_text,
"question_text": basket.raw["question_text"],
"passage_id": passage_span["passage_id"],
}
tokenized = {
"passage_start_t": passage_start_t,
"passage_start_c": passage_start_c,
"passage_tokens": passage_tokens,
"passage_start_of_word": passage_start_of_word,
"question_tokens": basket.raw["question_tokens"][: self.max_query_length],
"question_offsets": basket.raw["question_offsets"][: self.max_query_length],
"question_start_of_word": basket.raw["question_start_of_word"][: self.max_query_length],
}
# The sample ID consists of internal_id and a passage numbering
sample_id = f"{basket.id_internal}-{passage_span['passage_id']}"
samples.append(Sample(id=sample_id, clear_text=clear_text, tokenized=tokenized))
basket.samples = samples
return baskets
def _convert_answers(self, baskets: List[SampleBasket]):
"""
Converts answers that are pure strings into the token-based representation with start and end token offsets.
Can handle multiple answers per question-document pair, as is common for development/test sets
"""
for basket in baskets:
error_in_answer = False
for num, sample in enumerate(basket.samples): # type: ignore
# Dealing with potentially multiple answers (e.g. Squad dev set)
# Initializing a numpy array of shape (max_answers, 2), filled with -1 for missing values
label_idxs = np.full((self.max_answers, 2), fill_value=-1)
if error_in_answer or (len(basket.raw["answers"]) == 0):
# If there are no answers, point the label at the special no_answer token (index 0)
label_idxs[0, :] = 0
else:
# For all other cases we use start and end token indices, that are relative to the passage
for i, answer in enumerate(basket.raw["answers"]):
# Calculate start and end relative to document
answer_len_c = len(answer["text"])
answer_start_c = answer["answer_start"]
answer_end_c = answer_start_c + answer_len_c - 1
# Convert character offsets to token offsets on document level
answer_start_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_start_c)
answer_end_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_end_c)
# Adjust token offsets to be relative to the passage
answer_start_t -= sample.tokenized["passage_start_t"] # type: ignore
answer_end_t -= sample.tokenized["passage_start_t"] # type: ignore
# Initialize some basic variables
question_len_t = len(sample.tokenized["question_tokens"]) # type: ignore
passage_len_t = len(sample.tokenized["passage_tokens"]) # type: ignore
# Check that start and end are contained within this passage
# answer_end_t is 0 if the first token is the answer
# answer_end_t is passage_len_t if the last token is the answer
if passage_len_t > answer_start_t >= 0 and passage_len_t >= answer_end_t >= 0:
# Then adjust the start and end offsets by adding question and special token
label_idxs[i][0] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_start_t
label_idxs[i][1] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_end_t
# If the start or end of the span answer is outside the passage, treat passage as no_answer
else:
label_idxs[i][0] = 0
label_idxs[i][1] = 0
########## answer checking ##############################
# TODO, move this checking into input validation functions and delete wrong examples there
# Cases where the answer is not within the current passage will be turned into no answers by the featurization fn
if answer_start_t < 0 or answer_end_t >= passage_len_t:
pass
else:
doc_text = basket.raw["document_text"]
answer_indices = doc_text[answer_start_c : answer_end_c + 1]
answer_text = answer["text"]
# check if answer string can be found in context
if answer_text not in doc_text:
logger.warning(
f"Answer '{answer['text']}' not contained in context.\n"
f"Example will not be converted for training/evaluation."
)
error_in_answer = True
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
label_idxs[i][1] = -100
break # Break loop around answers, so the error message is not shown multiple times
if answer_indices.strip() != answer_text.strip():
logger.warning(
f"Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n"
f"Example will not be converted for training/evaluation."
)
error_in_answer = True
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
label_idxs[i][1] = -100
break # Break loop around answers, so the error message is not shown multiple times
########## end of checking ####################
sample.tokenized["labels"] = label_idxs # type: ignore
return baskets
def _passages_to_pytorch_features(self, baskets: List[SampleBasket], return_baskets: bool):
"""
Convert internal representation (nested baskets + samples with mixed types) to python features (arrays of numbers).
We first join question and passages into one large vector.
Then we add vectors for: - input_ids (token ids)
- segment_ids (does a token belong to question or document)
- padding_mask
- span_mask (valid answer tokens)
- start_of_word
"""
for basket in baskets:
# Add features to samples
for num, sample in enumerate(basket.samples): # type: ignore
# Initialize some basic variables
if sample.tokenized is not None:
question_tokens = sample.tokenized["question_tokens"]
question_start_of_word = sample.tokenized["question_start_of_word"]
question_len_t = len(question_tokens)
passage_start_t = sample.tokenized["passage_start_t"]
passage_tokens = sample.tokenized["passage_tokens"]
passage_start_of_word = sample.tokenized["passage_start_of_word"]
passage_len_t = len(passage_tokens)
sample_id = [int(x) for x in sample.id.split("-")]
# - Combines question_tokens and passage_tokens into a single vector called input_ids
# - input_ids also contains special tokens (e.g. CLS or SEP tokens).
# - It will have length = question_len_t + passage_len_t + n_special_tokens. This may be less than
# max_seq_len but never greater since truncation was already performed when the document was chunked into passages
question_input_ids = sample.tokenized["question_tokens"]
passage_input_ids = sample.tokenized["passage_tokens"]
input_ids = self.tokenizer.build_inputs_with_special_tokens(
token_ids_0=question_input_ids, token_ids_1=passage_input_ids
)
segment_ids = self.tokenizer.create_token_type_ids_from_sequences(
token_ids_0=question_input_ids, token_ids_1=passage_input_ids
)
# Index of the first passage token within the joined question + passage sequence
seq_2_start_t = self.sp_toks_start + question_len_t + self.sp_toks_mid
start_of_word = (
[0] * self.sp_toks_start
+ question_start_of_word
+ [0] * self.sp_toks_mid
+ passage_start_of_word
+ [0] * self.sp_toks_end
)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
padding_mask = [1] * len(input_ids)
# The span_mask has 1 for tokens that are valid start or end tokens for QA spans.
# 0s are assigned to question tokens, mid special tokens, end special tokens, and padding
# Note that start special tokens are assigned 1 since they can be chosen for a no_answer prediction
span_mask = [1] * self.sp_toks_start
span_mask += [0] * question_len_t
span_mask += [0] * self.sp_toks_mid
span_mask += [1] * passage_len_t
span_mask += [0] * self.sp_toks_end
# Pad up to the sequence length. For certain models, the pad token id is not 0 (e.g. Roberta where it is 1)
pad_idx = self.tokenizer.pad_token_id
padding = [pad_idx] * (self.max_seq_len - len(input_ids))
zero_padding = [0] * (self.max_seq_len - len(input_ids))
input_ids += padding
padding_mask += zero_padding
segment_ids += zero_padding
start_of_word += zero_padding
span_mask += zero_padding
# TODO possibly remove these checks after input validation is in place
len_check = (
len(input_ids) == len(padding_mask) == len(segment_ids) == len(start_of_word) == len(span_mask)
)
id_check = len(sample_id) == 3
label_check = return_baskets or len(sample.tokenized.get("labels", [])) == self.max_answers # type: ignore
# labels are set to -100 when answer cannot be found
label_check2 = return_baskets or np.all(sample.tokenized["labels"] > -99) # type: ignore
if len_check and id_check and label_check and label_check2:
# - The first of the labels will be used in train, and the full array will be used in eval.
# - start_of_word and spec_tok_mask are not actually needed by model.forward() but are needed for
# model.formatted_preds() during inference for creating answer strings
# - passage_start_t is index of passage's first token relative to document
feature_dict = {
"input_ids": input_ids,
"padding_mask": padding_mask,
"segment_ids": segment_ids,
"passage_start_t": passage_start_t,
"start_of_word": start_of_word,
"labels": sample.tokenized.get("labels", []), # type: ignore
"id": sample_id,
"seq_2_start_t": seq_2_start_t,
"span_mask": span_mask,
}
# other processors' features can be lists
sample.features = [feature_dict] # type: ignore
else:
self.problematic_sample_ids.add(sample.id)
sample.features = None
return baskets
def _create_dataset(self, baskets: List[SampleBasket]):
"""
Convert python features into pytorch dataset.
Also removes potential errors during preprocessing.
Flattens nested basket structure to create a flat list of features
"""
features_flat: List[dict] = []
basket_to_remove = []
for basket in baskets:
if self._check_sample_features(basket):
for sample in basket.samples: # type: ignore
features_flat.extend(sample.features) # type: ignore
else:
# remove the entire basket
basket_to_remove.append(basket)
if len(basket_to_remove) > 0:
for basket in basket_to_remove:
# if basket_to_remove is not empty remove the related baskets
baskets.remove(basket)
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
return dataset, tensor_names, baskets
class TextSimilarityProcessor(Processor):
"""
Used to handle the Dense Passage Retrieval (DPR) datasets that come in json format, example: biencoder-nq-train.json, biencoder-nq-dev.json, trivia-train.json, trivia-dev.json
Datasets can be downloaded from the official DPR github repository (https://github.com/facebookresearch/DPR)
dataset format: list of dictionaries with keys: 'dataset', 'question', 'answers', 'positive_ctxs', 'negative_ctxs', 'hard_negative_ctxs'
Each sample is a dictionary of format:
{"dataset": str,
"question": str,
"answers": list of str
"positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
"negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
"hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
}
"""
def __init__(
self,
query_tokenizer, # type: ignore
passage_tokenizer, # type: ignore
max_seq_len_query: int,
max_seq_len_passage: int,
data_dir: str = "",
metric=None, # type: ignore
train_filename: str = "train.json",
dev_filename: Optional[str] = None,
test_filename: Optional[str] = "test.json",
dev_split: float = 0.1,
proxies: Optional[dict] = None,
max_samples: Optional[int] = None,
embed_title: bool = True,
num_positives: int = 1,
num_hard_negatives: int = 1,
shuffle_negatives: bool = True,
shuffle_positives: bool = False,
label_list: Optional[List[str]] = None,
**kwargs,
):
"""
:param query_tokenizer: Used to split a question (str) into tokens
:param passage_tokenizer: Used to split a passage (str) into tokens.
:param max_seq_len_query: Query samples are truncated after this many tokens.
:param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens.
:param data_dir: The directory in which the train and dev files can be found.
If not available, the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
:param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
:param train_filename: The name of the file containing training data.
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
will be a slice of the train set.
:param test_filename: The name of the file containing test data. Defaults to None.
:param dev_split: The proportion of the train set that will be sliced. Only works if dev_filename is set to None
:param proxies: proxy configuration to allow downloads of remote datasets.
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:param max_samples: maximum number of samples to use
:param embed_title: Whether to embed title in passages during tensorization (bool),
:param num_hard_negatives: maximum number of hard negative context passages in a sample
:param num_positives: maximum number of positive context passages in a sample
:param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the num_hard_negative number of passages
:param shuffle_positives: Whether to shuffle all the positive passages before selecting the num_positive number of passages
:param label_list: list of labels to predict. Usually ["hard_negative", "positive"]
:param kwargs: placeholder for passing generic parameters
"""
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs
# Custom processor attributes
self.max_samples = max_samples
self.query_tokenizer = query_tokenizer
self.passage_tokenizer = passage_tokenizer
self.embed_title = embed_title
self.num_hard_negatives = num_hard_negatives
self.num_positives = num_positives
self.shuffle_negatives = shuffle_negatives
self.shuffle_positives = shuffle_positives
self.max_seq_len_query = max_seq_len_query
self.max_seq_len_passage = max_seq_len_passage
super(TextSimilarityProcessor, self).__init__(
tokenizer=None, # type: ignore
max_seq_len=0,
train_filename=train_filename,
dev_filename=dev_filename,
test_filename=test_filename,
dev_split=dev_split,
data_dir=data_dir,
tasks={},
proxies=proxies,
)
if metric:
self.add_task(
name="text_similarity",
metric=metric,
label_list=label_list,
label_name="label",
task_type="text_similarity",
)
else:
logger.info(
"Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
"using the default task or add a custom task later via processor.add_task()"
)
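# Illustrative sketch (assumption): a DPR-style setup with separate query and
# passage tokenizers; the model names and the metric name are assumptions:
#
#   from transformers import AutoTokenizer
#   query_tok = AutoTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
#   passage_tok = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
#   processor = TextSimilarityProcessor(
#       query_tokenizer=query_tok,
#       passage_tokenizer=passage_tok,
#       max_seq_len_query=64,
#       max_seq_len_passage=256,
#       data_dir="data/retriever",
#       train_filename="biencoder-nq-train.json",
#       dev_filename="biencoder-nq-dev.json",
#       metric="text_similarity_metric",
#       label_list=["hard_negative", "positive"],
#       num_positives=1,
#       num_hard_negatives=1,
#   )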
@classmethod
def load_from_dir(cls, load_dir: str):
"""
Overrides the parent class method to **always** load the TextSimilarityProcessor instead of the specific class stored in the config.
:param load_dir: directory that contains a 'processor_config.json'
:return: An instance of a TextSimilarityProcessor
"""
# read config
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizers
query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"])
query_tokenizer = query_tokenizer_class.from_pretrained(
pretrained_model_name_or_path=load_dir, subfolder="query"
)
passage_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["passage_tokenizer"])
passage_tokenizer = passage_tokenizer_class.from_pretrained(
pretrained_model_name_or_path=load_dir, subfolder="passage"
)
# we have to delete the tokenizer string from config, because we pass it as Object
del config["query_tokenizer"]
del config["passage_tokenizer"]
processor = cls.load(
query_tokenizer=query_tokenizer,
passage_tokenizer=passage_tokenizer,
processor_name="TextSimilarityProcessor",
**config,
)
for task_name, task in config["tasks"].items():
processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])
if processor is None:
raise Exception
return processor
def save(self, save_dir: Union[str, Path]):
"""
Saves the vocabulary to file and also creates a json file containing all the
information needed to load the same processor.
:param save_dir: Directory where the files are to be saved
:return: None
"""
if isinstance(save_dir, str):
save_dir = Path(save_dir)
os.makedirs(save_dir, exist_ok=True)
config = self.generate_config()
# save tokenizer incl. attributes
config["query_tokenizer"] = self.query_tokenizer.__class__.__name__
config["passage_tokenizer"] = self.passage_tokenizer.__class__.__name__
# Because the fast tokenizers expect a str and not Path
# always convert Path to str here.
self.query_tokenizer.save_pretrained(str(save_dir / "query"))
self.passage_tokenizer.save_pretrained(str(save_dir / "passage"))
# save processor
config["processor"] = self.__class__.__name__
output_config_file = Path(save_dir) / "processor_config.json"
with open(output_config_file, "w") as file:
json.dump(config, file)
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
For conversion we have an internal representation called "baskets".
Each basket is one query and related text passages (positive passages fitting to the query and negative
passages that do not fit the query)
Each stage adds or transforms specific information to our baskets.
:param dicts: input dictionaries with DPR-style content
{"query": str,
"passages": List[
{'title': str,
'text': str,
'label': 'hard_negative',
'external_id': str},
....
]
}
:param indices: indices used during multiprocessing so that IDs assigned to our baskets are unique
:param return_baskets: whether to return the baskets or not (baskets are needed during inference)
:return: dataset, tensor_names, problematic_ids, [baskets]
"""
# Take the dicts and insert them into our basket structure; this stage also adds internal IDs
baskets = self._fill_baskets(dicts, indices)
# Separate conversion of query
baskets = self._convert_queries(baskets=baskets)
# and context passages. When converting the context the label is also assigned.
baskets = self._convert_contexts(baskets=baskets)
# Convert features into pytorch dataset, this step also removes and logs potential errors during preprocessing
dataset, tensor_names, problematic_ids, baskets = self._create_dataset(baskets)
if problematic_ids:
logger.error(
f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
)
if return_baskets:
return dataset, tensor_names, problematic_ids, baskets
else:
return dataset, tensor_names, problematic_ids
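# Illustrative sketch (assumption): tensorizing one query/passages dict at
# inference time:
#
#   dpr_dict = {
#       "query": "who wrote faust",
#       "passages": [
#           {"title": "Faust", "text": "Faust is a tragic play by Goethe.",
#            "label": "positive", "external_id": "0"},
#       ],
#   }
#   dataset, tensor_names, problematic_ids, baskets = processor.dataset_from_dicts(
#       dicts=[dpr_dict], indices=[0], return_baskets=True
#   )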
def file_to_dicts(self, file: str) -> List[dict]:
"""
Converts a Dense Passage Retrieval (DPR) data file in json format to a list of dictionaries.
:param file: filename of DPR data in json format
Each sample is a dictionary of format:
{"dataset": str,
"question": str,
"answers": list of str
"positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
"negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
"hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
}
Returns:
list of dictionaries: List[dict]
each dictionary:
{"query": str,
"passages": [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
...]}
"""
dicts = _read_dpr_json(
file,
max_samples=self.max_samples,
num_hard_negatives=self.num_hard_negatives,
num_positives=self.num_positives,
shuffle_negatives=self.shuffle_negatives,
shuffle_positives=self.shuffle_positives,
)
# shuffle dicts to make sure that similar positive passages do not end up in one batch
dicts = random.sample(dicts, len(dicts))
return dicts
def _fill_baskets(self, dicts: List[dict], indices: Optional[List[int]]):
baskets = []
if not indices:
indices = list(range(len(dicts)))
for d, id_internal in zip(dicts, indices):
basket = SampleBasket(id_external=None, id_internal=id_internal, raw=d)
baskets.append(basket)
return baskets
def _convert_queries(self, baskets: List[SampleBasket]):
for basket in baskets:
clear_text = {}
tokenized = {}
features = [{}] # type: ignore
# extract query, positive context passages and titles, hard-negative passages and titles
if "query" in basket.raw:
try:
query = self._normalize_question(basket.raw["query"])
# featurize the query
query_inputs = self.query_tokenizer(
query,
max_length=self.max_seq_len_query,
add_special_tokens=True,
truncation=True,
truncation_strategy="longest_first",
padding="max_length",
return_token_type_ids=True,
)
# tokenize query
tokenized_query = self.query_tokenizer.convert_ids_to_tokens(query_inputs["input_ids"])
if len(tokenized_query) == 0:
logger.warning(
f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
)
return None
clear_text["query_text"] = query
tokenized["query_tokens"] = tokenized_query
features[0]["query_input_ids"] = query_inputs["input_ids"]
features[0]["query_segment_ids"] = query_inputs["token_type_ids"]
features[0]["query_attention_mask"] = query_inputs["attention_mask"]
except Exception as e:
features = None # type: ignore
sample = Sample(id="", clear_text=clear_text, tokenized=tokenized, features=features) # type: ignore
basket.samples = [sample]
return baskets
def _convert_contexts(self, baskets: List[SampleBasket]):
for basket in baskets:
if "passages" in basket.raw:
try:
positive_context = list(filter(lambda x: x["label"] == "positive", basket.raw["passages"]))
if self.shuffle_positives:
random.shuffle(positive_context)
positive_context = positive_context[: self.num_positives]
hard_negative_context = list(
filter(lambda x: x["label"] == "hard_negative", basket.raw["passages"])
)
if self.shuffle_negatives:
random.shuffle(hard_negative_context)
hard_negative_context = hard_negative_context[: self.num_hard_negatives]
positive_ctx_titles = [passage.get("title", None) for passage in positive_context]
positive_ctx_texts = [passage["text"] for passage in positive_context]
hard_negative_ctx_titles = [passage.get("title", None) for passage in hard_negative_context]
hard_negative_ctx_texts = [passage["text"] for passage in hard_negative_context]
# all context passages and labels: 1 for positive context and 0 for hard-negative context
ctx_label = [1] * self.num_positives + [0] * self.num_hard_negatives
# featurize context passages
if self.embed_title:
# concatenate title with positive context passages + negative context passages
all_ctx = self._combine_title_context(
positive_ctx_titles, positive_ctx_texts
) + self._combine_title_context(hard_negative_ctx_titles, hard_negative_ctx_texts)
else:
all_ctx = positive_ctx_texts + hard_negative_ctx_texts
# pad with empty string tuples if there are fewer passages than num_positives + num_hard_negatives
all_ctx += [("", "")] * ((self.num_positives + self.num_hard_negatives) - len(all_ctx))
ctx_inputs = self.passage_tokenizer(
all_ctx,
add_special_tokens=True,
truncation=True,
padding="max_length",
max_length=self.max_seq_len_passage,
return_token_type_ids=True,
)
ctx_segment_ids = [[0] * len(ctx_inputs["token_type_ids"][0])] * len(ctx_inputs["token_type_ids"])
# get tokens in string format
tokenized_passage = [
self.passage_tokenizer.convert_ids_to_tokens(ctx) for ctx in ctx_inputs["input_ids"]
]
# for DPR we only have one sample containing query and corresponding (multiple) context features
sample = basket.samples[0] # type: ignore
sample.clear_text["passages"] = positive_context + hard_negative_context
sample.tokenized["passages_tokens"] = tokenized_passage # type: ignore
sample.features[0]["passage_input_ids"] = ctx_inputs["input_ids"] # type: ignore
sample.features[0]["passage_segment_ids"] = ctx_segment_ids # type: ignore
sample.features[0]["passage_attention_mask"] = ctx_inputs["attention_mask"] # type: ignore
sample.features[0]["label_ids"] = ctx_label # type: ignore
except Exception as e:
basket.samples[0].features = None # type: ignore
return baskets
def _create_dataset(self, baskets: List[SampleBasket]):
"""
Convert python features into pytorch dataset.
Also removes potential errors during preprocessing.
Flattens nested basket structure to create a flat list of features
"""
features_flat: List[dict] = []
basket_to_remove = []
problematic_ids: set = set()
for basket in baskets:
if self._check_sample_features(basket):
for sample in basket.samples: # type: ignore
features_flat.extend(sample.features) # type: ignore
else:
# remove the entire basket
basket_to_remove.append(basket)
if len(basket_to_remove) > 0:
for basket in basket_to_remove:
# if basket_to_remove is not empty remove the related baskets
problematic_ids.add(basket.id_internal)
baskets.remove(basket)
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
return dataset, tensor_names, problematic_ids, baskets
@staticmethod
def _normalize_question(question: str) -> str:
"""Removes '?' from queries/questions"""
if question[-1] == "?":
question = question[:-1]
return question
@staticmethod
def _combine_title_context(titles: List[str], texts: List[str]):
res = []
for title, ctx in zip(titles, texts):
if title is None:
title = ""
logger.warning(
f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' "
)
res.append((title, ctx))
return res
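# Illustrative sketch (assumption): the helper returns (title, text) tuples,
# which the passage tokenizer accepts as text pairs, e.g.
#
#   _combine_title_context(["Faust"], ["Faust is a tragic play by Goethe."])
#   -> [("Faust", "Faust is a tragic play by Goethe.")]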
class TableTextSimilarityProcessor(Processor):
"""
Used to handle the Multimodal Retrieval datasets consisting of text passages and tables
that come in json format.
"""
def __init__(
self,
query_tokenizer, # type: ignore
passage_tokenizer, # type: ignore
table_tokenizer, # type: ignore
max_seq_len_query: int,
max_seq_len_passage: int,
max_seq_len_table: int,
data_dir: str = "",
metric: Optional[str] = None,
train_filename: Optional[Union[Path, str]] = "train.json",
dev_filename: Optional[Union[Path, str]] = None,
test_filename: Optional[Union[Path, str]] = "test.json",
dev_split: float = 0.1,
proxies: Optional[Dict] = None,
max_samples: Optional[int] = None,
embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
num_positives: int = 1,
num_hard_negatives: int = 1,
shuffle_negatives: bool = True,
shuffle_positives: bool = False,
label_list: Optional[List[str]] = None,
**kwargs,
):
"""
:param query_tokenizer: Used to split a question (str) into tokens
:param passage_tokenizer: Used to split a text passage (str) into tokens.
:param table_tokenizer: Used to split a table into tokens
:param max_seq_len_query: Query samples are truncated after this many tokens.
:param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens.
:param max_seq_len_table: Table samples are truncated after this many tokens.
:param data_dir: The directory in which the train and dev files can be found.
If not available the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict DOWNSTREAM_TASK_MAP
:param metric: Name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
:param train_filename: The name of the file containing training data.
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
will be a slice of the train set.
:param test_filename: The name of the file containing the test data.
:param dev_split: The proportion of the train set that will be sliced. Only works if dev_filename is set to None.
:param proxies: Proxy configuration to allow downloads of remote datasets.
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:param max_samples: maximum number of samples to use.
:param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization.
:param num_hard_negatives: Maximum number of hard negative context passages in a sample.
:param num_positives: Maximum number of positive context passages in a sample.
:param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the
num_hard_negative number of passages.
:param shuffle_positives: Whether to shuffle all the positive passages before selecting the
num_positive number of passages.
:param label_list: List of labels to predict. Usually ["hard_negative", "positive"].
:param kwargs: Placeholder for passing generic parameters
"""
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs
# Custom processor attributes
self.max_samples = max_samples
self.query_tokenizer = query_tokenizer
self.passage_tokenizer = passage_tokenizer
self.table_tokenizer = table_tokenizer
self.embed_meta_fields = embed_meta_fields
self.num_hard_negatives = num_hard_negatives
self.num_positives = num_positives
self.shuffle_negatives = shuffle_negatives
self.shuffle_positives = shuffle_positives
self.max_seq_len_query = max_seq_len_query
self.max_seq_len_passage = max_seq_len_passage
self.max_seq_len_table = max_seq_len_table
super(TableTextSimilarityProcessor, self).__init__(
tokenizer=self.query_tokenizer,
max_seq_len=0,
train_filename=train_filename,
dev_filename=dev_filename,
test_filename=test_filename,
dev_split=dev_split,
data_dir=data_dir,
tasks={},
proxies=proxies,
)
if metric:
self.add_task(
name="text_similarity",
metric=metric,
label_list=label_list,
label_name="label",
task_type="text_similarity",
)
else:
logger.info(
"Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
"using the default task or add a custom task later via processor.add_task()"
)
@classmethod
def load_from_dir(cls, load_dir: str):
"""
Overrides the parent class method to **always** load the TableTextSimilarityProcessor
instead of the specific class stored in the config.
:param load_dir: Directory that contains a 'processor_config.json'
:return: An instance of a TableTextSimilarityProcessor.
"""
# read config
processor_config_file = Path(load_dir) / "processor_config.json"
config = json.load(open(processor_config_file))
# init tokenizer
query_tokenizer = AutoTokenizer.from_pretrained(
load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query"
)
passage_tokenizer = AutoTokenizer.from_pretrained(
load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage"
)
table_tokenizer = AutoTokenizer.from_pretrained(
load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table"
)
# we have to delete the tokenizer string from config, because we pass it as Object
del config["query_tokenizer"]
del config["passage_tokenizer"]
del config["table_tokenizer"]
processor = cls.load(
query_tokenizer=query_tokenizer,
passage_tokenizer=passage_tokenizer,
table_tokenizer=table_tokenizer,
processor_name="TableTextSimilarityProcessor",
**config,
)
for task_name, task in config["tasks"].items():
processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])
if processor is None:
raise Exception
return processor
def save(self, save_dir: Union[str, Path]):
"""
Saves the vocabulary to file and also creates a json file containing all the
information needed to load the same processor.
:param save_dir: Directory where the files are to be saved.
"""
if isinstance(save_dir, str):
save_dir = Path(save_dir)
os.makedirs(save_dir, exist_ok=True)
config = self.generate_config()
# save tokenizer incl. attributes
config["query_tokenizer"] = self.query_tokenizer.__class__.__name__
config["passage_tokenizer"] = self.passage_tokenizer.__class__.__name__
config["table_tokenizer"] = self.table_tokenizer.__class__.__name__
# Because the fast tokenizers expect a str and not Path
# always convert Path to str here.
self.query_tokenizer.save_pretrained(str(save_dir / "query"))
self.passage_tokenizer.save_pretrained(str(save_dir / "passage"))
self.table_tokenizer.save_pretrained(str(save_dir / "table"))
# save processor
config["processor"] = self.__class__.__name__
output_config_file = Path(save_dir) / "processor_config.json"
with open(output_config_file, "w") as file:
json.dump(config, file)
def file_to_dicts(self, file: str) -> List[Dict]:
"""
Converts a Multimodal Retrieval data file in json format to a list of dictionaries.
:param file: filename of Multimodal Retrieval data in json format
Each sample is a dictionary of format:
{"question": str,
"answers": list of str
"positive_ctxs": list of dictionaries of format
{'title': str, 'text': str, 'passage_id': str, 'type': 'text', 'source': str}
or
{'page_title': str, 'section_title': str, 'caption': str, 'columns': list of str,
'rows': list of list of str, 'type': 'table', 'source': str}
"hard_negative_ctxs": list of dictionaries of format
{'title': str, 'text': str, 'passage_id': str, 'type': 'text', 'source': str}
or
{'page_title': str, 'section_title': str, 'caption': str, 'columns': list of str,
'rows': list of list of str, 'type': 'table', 'source': str}
}
Returns:
List of dictionaries: List[dict]
each dictionary:
{"query": str,
"passages": [
{"title": str, "text": str, "label": "positive" / "hard_negative", "type": "text", "external_id": id}
or
{"page_title": str, "section_title": str, "caption": str, "columns": list of str,
"rows": list of list of str, "label": "positive" / "hard_negative", "type": "table", "external_id": id}
...]}
"""
dicts = self._read_multimodal_dpr_json(file, max_samples=self.max_samples)
return dicts
def _read_multimodal_dpr_json(self, file: str, max_samples: Optional[int] = None) -> List[Dict]:
"""
Reads a Multimodal Retrieval data file in json format and returns a list of dictionaries.
:param file: filename of MMR data in json format
Returns:
list of dictionaries: List[dict]
each dictionary: {
"query": str -> query_text
"passages": List[dictionaries] -> [
{"text": str, "title": str, "label": "positive" / "hard_negative, "external_id": id},
or
{"page_title": str, "section_title": str, "caption": str, "columns": list of str,
"rows": list of lists of str, "label": "positive" / "hard_negative", "type": "table", "external_id": id}
...]
}
"""
dicts = json.load(open(file))
if max_samples:
dicts = random.sample(dicts, min(max_samples, len(dicts)))
# convert DPR dictionary to standard dictionary
query_json_keys = ["question", "questions", "query"]
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
hard_negative_json_keys = [
"hard_negative_contexts",
"hard_negative_ctxs",
"hard_negative_context",
"hard_negative_ctx",
]
standard_dicts = []
for entry in dicts:
sample = {}
docs = []
for key, val in entry.items():
if key in query_json_keys:
sample["query"] = val
elif key in positive_context_json_keys + hard_negative_json_keys:
for doc in val:
if doc["type"] == "table":
docs.append(
{
"meta": [
doc[meta_field] for meta_field in self.embed_meta_fields if meta_field in doc
],
"columns": doc.get("columns"),
"rows": doc.get("rows"),
"label": "positive" if key in positive_context_json_keys else "hard_negative",
"type": "table",
}
)
elif doc["type"] == "text":
docs.append(
{
"meta": [
doc[meta_field] for meta_field in self.embed_meta_fields if meta_field in doc
],
"text": doc["text"],
"label": "positive" if key in positive_context_json_keys else "hard_negative",
"type": "text",
}
)
sample["passages"] = docs
standard_dicts.append(sample)
return standard_dicts
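# Illustrative sketch (assumption): a table entry from "positive_ctxs" such as
#
#   {"page_title": "...", "section_title": "...", "caption": "...",
#    "columns": [...], "rows": [...], "type": "table", "source": "..."}
#
# is standardized into
#
#   {"meta": [<embedded meta fields>], "columns": [...], "rows": [...],
#    "label": "positive", "type": "table"}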
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
"""
Convert input dictionaries into a pytorch dataset for TextSimilarity.
For conversion we have an internal representation called "baskets".
Each basket is one query and related text passages (positive passages fitting to the query and negative
passages that do not fit the query)
Each stage adds or transforms specific information to our baskets.
:param dicts: List of dicts, input dictionary with DPR-style content
{"query": str,
"passages": List[
{'title': str,
'text': str,
'label': 'hard_negative',
'external_id': str},
....
]
}
        :param indices: list, indices used during multiprocessing so that the IDs assigned to our baskets are unique
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
"""
        # Take the dicts and insert them into our basket structure; this stage also adds internal IDs
baskets = self._fill_baskets(dicts, indices)
# Separate conversion of query
baskets = self._convert_queries(baskets=baskets)
# and context passages and tables. When converting the context the label is also assigned.
baskets = self._convert_contexts(baskets=baskets)
# Convert features into pytorch dataset, this step also removes and logs potential errors during preprocessing
dataset, tensor_names, problematic_ids, baskets = self._create_dataset(baskets)
if problematic_ids:
            logger.error(
                "There were %s errors during preprocessing at positions: %s", len(problematic_ids), problematic_ids
            )
if return_baskets:
return dataset, tensor_names, problematic_ids, baskets
else:
return dataset, tensor_names, problematic_ids
def _fill_baskets(self, dicts: List[Dict], indices: Optional[Iterable[int]]):
baskets = []
if not indices:
indices = range(len(dicts))
for d, id_internal in zip(dicts, indices):
basket = SampleBasket(id_external=None, id_internal=id_internal, raw=d)
baskets.append(basket)
return baskets
def _convert_queries(self, baskets: List[SampleBasket]):
for basket in baskets:
clear_text = {}
tokenized = {}
features: List[Dict] = [{}]
# extract query, positive context passages and titles, hard-negative passages and titles
if "query" in basket.raw:
try:
query = self._normalize_question(basket.raw["query"])
# featurize the query
query_inputs = self.query_tokenizer(
query,
max_length=self.max_seq_len_query,
add_special_tokens=True,
truncation=True,
truncation_strategy="longest_first",
padding="max_length",
return_token_type_ids=True,
)
# tokenize query
tokenized_query = self.query_tokenizer.convert_ids_to_tokens(query_inputs["input_ids"])
if len(tokenized_query) == 0:
                        logger.warning(
                            "The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
                        )
return None
clear_text["query_text"] = query
tokenized["query_tokens"] = tokenized_query
features[0]["query_input_ids"] = query_inputs["input_ids"]
features[0]["query_segment_ids"] = query_inputs["token_type_ids"]
features[0]["query_attention_mask"] = query_inputs["attention_mask"]
                except Exception:
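                    # Featurization failed: leaving features as None lets _create_dataset drop this basket
                    # and report its id among the problematic ids.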
features = None # type: ignore
sample = Sample(id="", clear_text=clear_text, tokenized=tokenized, features=features) # type: ignore
basket.samples = [sample]
return baskets
def _convert_contexts(self, baskets: List[SampleBasket]):
# Converts both text passages and tables.
for basket in baskets:
if "passages" in basket.raw:
try:
positive_context = list(filter(lambda x: x["label"] == "positive", basket.raw["passages"]))
if self.shuffle_positives:
random.shuffle(positive_context)
positive_context = positive_context[: self.num_positives]
hard_negative_context = list(
filter(lambda x: x["label"] == "hard_negative", basket.raw["passages"])
)
if self.shuffle_negatives:
random.shuffle(hard_negative_context)
hard_negative_context = hard_negative_context[: self.num_hard_negatives]
positive_ctx_meta = []
positive_ctx_texts = []
hard_negative_ctx_meta = []
hard_negative_ctx_texts = []
is_table = []
for pos_ctx in positive_context:
if pos_ctx["type"] == "text":
positive_ctx_meta.append(" ".join(pos_ctx.get("meta")))
positive_ctx_texts.append(pos_ctx["text"])
is_table.append(0)
elif pos_ctx["type"] == "table":
positive_ctx_meta.append(" ".join(pos_ctx.get("meta")))
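                            # Linearize the table into a single string: all column headers first, then every cell
                            # row by row, e.g. (illustrative values) columns ["name", "age"] and rows [["anna", "30"]]
                            # become "name age anna 30".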
linearized_rows = [cell for row in pos_ctx["rows"] for cell in row]
linearized_table = " ".join(pos_ctx["columns"]) + " " + " ".join(linearized_rows)
positive_ctx_texts.append(linearized_table)
is_table.append(1)
for hn_ctx in hard_negative_context:
if hn_ctx["type"] == "text":
hard_negative_ctx_meta.append(" ".join(hn_ctx.get("meta")))
hard_negative_ctx_texts.append(hn_ctx["text"])
is_table.append(0)
elif hn_ctx["type"] == "table":
hard_negative_ctx_meta.append(" ".join(hn_ctx.get("meta")))
linearized_rows = [cell for row in hn_ctx["rows"] for cell in row]
linearized_table = " ".join(hn_ctx["columns"]) + " " + " ".join(linearized_rows)
hard_negative_ctx_texts.append(linearized_table)
is_table.append(1)
# all context passages and labels: 1 for positive context and 0 for hard-negative context
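                    # e.g. (illustrative) with num_positives=1 and num_hard_negatives=2 this yields ctx_label = [1, 0, 0]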
ctx_label = [1] * self.num_positives + [0] * self.num_hard_negatives
# featurize context passages
if self.embed_meta_fields:
# concatenate title with positive context passages + negative context passages
all_ctx = self._combine_meta_context(
positive_ctx_meta, positive_ctx_texts
) + self._combine_meta_context(hard_negative_ctx_meta, hard_negative_ctx_texts)
else:
all_ctx = positive_ctx_texts + hard_negative_ctx_texts
                    # pad with empty-string tuples if there are fewer passages than num_positives + num_hard_negatives
all_ctx += [("", "")] * ((self.num_positives + self.num_hard_negatives) - len(all_ctx))
inputs = self.passage_tokenizer(
all_ctx,
add_special_tokens=True,
truncation=True,
padding="max_length",
max_length=self.max_seq_len_passage,
return_token_type_ids=True,
)
input_ids = inputs["input_ids"]
passage_segment_ids = inputs["token_type_ids"]
attention_mask = inputs["attention_mask"]
# get tokens in string format
tokenized = [self.passage_tokenizer.convert_ids_to_tokens(ctx) for ctx in input_ids]
# for DPR we only have one sample containing query and corresponding (multiple) context features
sample = basket.samples[0] # type: ignore
sample.clear_text["passages"] = positive_context + hard_negative_context # type: ignore
sample.tokenized["passages_tokens"] = tokenized # type: ignore
sample.features[0]["passage_input_ids"] = input_ids # type: ignore
sample.features[0]["passage_segment_ids"] = passage_segment_ids # type: ignore
sample.features[0]["table_segment_ids"] = passage_segment_ids # type: ignore
sample.features[0]["passage_attention_mask"] = attention_mask # type: ignore
sample.features[0]["label_ids"] = ctx_label # type: ignore
sample.features[0]["is_table"] = is_table # type: ignore
                except Exception:
basket.samples[0].features = None # type: ignore
return baskets
def _create_dataset(self, baskets: List[SampleBasket]):
"""
Convert python features into pytorch dataset.
Also removes potential errors during preprocessing.
Flattens nested basket structure to create a flat list of features
"""
features_flat: List = []
basket_to_remove = []
problematic_ids = set()
for basket in baskets:
if self._check_sample_features(basket):
for sample in basket.samples: # type: ignore
features_flat.extend(sample.features) # type: ignore
else:
# remove the entire basket
basket_to_remove.append(basket)
if len(basket_to_remove) > 0:
for basket in basket_to_remove:
# if basket_to_remove is not empty remove the related baskets
problematic_ids.add(basket.id_internal)
baskets.remove(basket)
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
return dataset, tensor_names, problematic_ids, baskets
@staticmethod
def _normalize_question(question: str) -> str:
"""Removes '?' from queries/questions"""
if question[-1] == "?":
question = question[:-1]
return question
@staticmethod
def _combine_meta_context(meta_fields: List[str], texts: List[str]):
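        """Zip each meta string with its context text into a (meta, text) tuple, replacing a missing meta value (None) with an empty string."""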
res = []
for meta, ctx in zip(meta_fields, texts):
if meta is None:
meta = ""
            res.append((meta, ctx))
return res
class TextClassificationProcessor(Processor):
"""
Used to handle the text classification datasets that come in tabular format (CSV, TSV, etc.)
"""
def __init__(
self,
tokenizer,
max_seq_len,
data_dir,
label_list=None,
metric=None,
train_filename="train.tsv",
dev_filename=None,
test_filename="test.tsv",
dev_split=0.1,
dev_stratification=False,
delimiter="\t",
quote_char="'",
skiprows=None,
label_column_name="label",
multilabel=False,
header=0,
proxies=None,
max_samples=None,
text_column_name="text",
**kwargs,
):
"""
:param tokenizer: Used to split a sentence (str) into tokens.
:param max_seq_len: Samples are truncated after this many tokens.
:type max_seq_len: int
:param data_dir: The directory in which the train and dev files can be found.
                         If not available, the dataset will be loaded automatically
if the last directory has the same name as a predefined dataset.
These predefined datasets are defined as the keys in the dict at
`farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/main/farm/data_handler/utils.py>`_.
:type data_dir: str
        :param label_list: list of labels to predict (strings), i.e. the class names used in the label column.
:type label_list: list
:param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
                       To use multiple metrics, supply them as a list, e.g. ["acc", my_custom_metric_fn].
:type metric: str, function, or list
:param train_filename: The name of the file containing training data.
:type train_filename: str
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
will be a slice of the train set.
:type dev_filename: str or None
        :param test_filename: The name of the file containing the test data.
:type test_filename: str
        :param dev_split: The proportion of the train set that will be sliced. Only works if dev_filename is set to None.
:type dev_split: float
:param dev_stratification: if True, create a class-stratified split for the dev set.
:type dev_stratification: bool
:param delimiter: Separator used in the input tsv / csv file
:type delimiter: str
:param quote_char: Character used for quoting strings in the input tsv/ csv file
:type quote_char: str
:param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers)
:type skiprows: int
:param label_column_name: name of the column in the input csv/tsv that shall be used as training labels
:type label_column_name: str
:param multilabel: set to True for multilabel classification
:type multilabel: bool
:param header: which line to use as a header in the input csv/tsv
:type header: int
:param proxies: proxy configuration to allow downloads of remote datasets.
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:type proxies: dict
:param text_column_name: name of the column in the input csv/tsv that shall be used as training text
:type text_column_name: str
:param kwargs: placeholder for passing generic parameters
:type kwargs: object
"""
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs
# Custom processor attributes
self.delimiter = delimiter
self.quote_char = quote_char
self.skiprows = skiprows
self.header = header
self.max_samples = max_samples
self.dev_stratification = dev_stratification
logger.debug("Currently no support in Processor for returning problematic ids")
super(TextClassificationProcessor, self).__init__(
tokenizer=tokenizer,
max_seq_len=max_seq_len,
train_filename=train_filename,
dev_filename=dev_filename,
test_filename=test_filename,
dev_split=dev_split,
data_dir=data_dir,
tasks={},
proxies=proxies,
)
if metric and label_list:
if multilabel:
task_type = "multilabel_classification"
else:
task_type = "classification"
self.add_task(
name="text_classification",
metric=metric,
label_list=label_list,
label_column_name=label_column_name,
text_column_name=text_column_name,
task_type=task_type,
)
else:
            logger.info(
                "Initialized processor without tasks. Supply `metric` and `label_list` to the constructor to use "
                "the default task, or add a custom task later via processor.add_task()"
            )
def file_to_dicts(self, file: str) -> List[Dict]:
raise NotImplementedError
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
self.baskets = []
# Tokenize in batches
texts = [x["text"] for x in dicts]
tokenized_batch = self.tokenizer(
texts,
return_offsets_mapping=True,
return_special_tokens_mask=True,
return_token_type_ids=True,
return_attention_mask=True,
truncation=True,
max_length=self.max_seq_len,
padding="max_length",
)
input_ids_batch = tokenized_batch["input_ids"]
segment_ids_batch = tokenized_batch["token_type_ids"]
padding_masks_batch = tokenized_batch["attention_mask"]
tokens_batch = [x.tokens for x in tokenized_batch.encodings]
# From here we operate on a per sample basis
for dictionary, input_ids, segment_ids, padding_mask, tokens in zip(
dicts, input_ids_batch, segment_ids_batch, padding_masks_batch, tokens_batch
):
tokenized = {}
if debug:
tokenized["tokens"] = tokens
feat_dict = {"input_ids": input_ids, "padding_mask": padding_mask, "segment_ids": segment_ids}
            # Create labels, i.e. not at inference time
if not return_baskets:
label_dict = self.convert_labels(dictionary)
feat_dict.update(label_dict)
# Add Basket to self.baskets
curr_sample = Sample(id="", clear_text=dictionary, tokenized=tokenized, features=[feat_dict])
curr_basket = SampleBasket(id_internal=None, raw=dictionary, id_external=None, samples=[curr_sample])
self.baskets.append(curr_basket)
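        # Log an example sample only once, i.e. only for the chunk that contains index 0 when running with multiprocessing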
        if not indices or 0 in indices:
            self._log_samples(n_samples=1, baskets=self.baskets)
# TODO populate problematic ids
problematic_ids: set = set()
dataset, tensornames = self._create_dataset()
if return_baskets:
return dataset, tensornames, problematic_ids, self.baskets
else:
return dataset, tensornames, problematic_ids
def convert_labels(self, dictionary: Dict):
ret: Dict = {}
# Add labels for different tasks
for task_name, task in self.tasks.items():
label_name = task["label_name"]
label_raw = dictionary[label_name]
label_list = task["label_list"]
if task["task_type"] == "classification":
# id of label
label_ids = [label_list.index(label_raw)]
elif task["task_type"] == "multilabel_classification":
# multi-hot-format
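                # e.g. (illustrative) label_list = ["sports", "politics", "tech"] and label_raw = "sports,tech"
                # yield label_ids = [1, 0, 1]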
label_ids = [0] * len(label_list)
                for label in label_raw.split(","):
                    if label != "":
                        label_ids[label_list.index(label)] = 1
ret[task["label_tensor_name"]] = label_ids
return ret
def _create_dataset(self):
# TODO this is the proposed new version to replace the mother function
features_flat = []
basket_to_remove = []
for basket in self.baskets:
if self._check_sample_features(basket):
for sample in basket.samples:
features_flat.extend(sample.features)
else:
# remove the entire basket
basket_to_remove.append(basket)
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
return dataset, tensor_names
class InferenceProcessor(TextClassificationProcessor):
"""
Generic processor used at inference time:
- fast
- no labels
- pure encoding of text into pytorch dataset
- Doesn't read from file, but only consumes dictionaries (e.g. coming from API requests)
"""
def __init__(self, tokenizer, max_seq_len, **kwargs):
super(InferenceProcessor, self).__init__(
tokenizer=tokenizer,
max_seq_len=max_seq_len,
train_filename=None,
dev_filename=None,
test_filename=None,
dev_split=None,
data_dir=None,
tasks={},
)
@classmethod
def load_from_dir(cls, load_dir: str):
"""
        Overrides the parent-class method to **always** load the InferenceProcessor instead of the specific class stored in the config.
:param load_dir: str, directory that contains a 'processor_config.json'
:return: An instance of an InferenceProcessor
"""
# read config
processor_config_file = Path(load_dir) / "processor_config.json"
        with open(processor_config_file) as f:
            config = json.load(f)
# init tokenizer
tokenizer = AutoTokenizer.from_pretrained(load_dir, tokenizer_class=config["tokenizer"])
# we have to delete the tokenizer string from config, because we pass it as Object
del config["tokenizer"]
        processor = cls.load(tokenizer=tokenizer, processor_name="InferenceProcessor", **config)
        if processor is None:
            raise Exception(f"Could not load InferenceProcessor from {load_dir}")
        for task_name, task in config["tasks"].items():
            processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])
        return processor
def file_to_dicts(self, file: str) -> List[Dict]:
raise NotImplementedError
def convert_labels(self, dictionary: Dict):
# For inference we do not need labels
ret: Dict = {}
return ret
# Private method to keep s3e pooling and embedding extraction working
def _dict_to_samples(self, dictionary: Dict, **kwargs) -> Sample:
# this tokenization also stores offsets
tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
# truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
truncated_tokens = {}
for seq_name, tokens in tokenized.items():
truncated_tokens[seq_name], _, _ = truncate_sequences(
seq_a=tokens, seq_b=None, tokenizer=self.tokenizer, max_seq_len=self.max_seq_len
)
return Sample(id="", clear_text=dictionary, tokenized=truncated_tokens)
# Private method to keep s3e pooling and embedding extraction working
def _sample_to_features(self, sample: Sample) -> Dict:
features = sample_to_features_text(
sample=sample, tasks=self.tasks, max_seq_len=self.max_seq_len, tokenizer=self.tokenizer
)
return features
class UnlabeledTextProcessor(Processor):
"""
Processor to be used for distilling a teacher model into a student model from scratch. Can only be used with distil_intermediate_layers_from.
"""
def __init__(
self,
tokenizer,
max_seq_len: int,
train_filename: Optional[Union[Path, str]] = None,
dev_filename: Optional[Union[Path, str]] = None,
test_filename: Optional[Union[Path, str]] = None,
dev_split: float = 0,
data_dir: Optional[Union[Path, str]] = None,
tasks: Dict = {},
proxies: Optional[Dict] = None,
multithreading_rust: Optional[bool] = True,
):
super().__init__(
tokenizer,
max_seq_len,
train_filename,
dev_filename,
test_filename,
dev_split,
data_dir,
tasks,
proxies,
multithreading_rust,
)
self.add_task("question_answering", "squad", ["start_token", "end_token"])
def file_to_dicts(self, file: str) -> List[dict]:
dicts = []
with open(file, "r") as f:
for line in f:
dicts.append({"text": line})
return dicts
def dataset_from_dicts(
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
):
if return_baskets:
raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor")
texts = [dict_["text"] for dict_ in dicts]
tokens = self.tokenizer(
texts,
add_special_tokens=True,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=self.max_seq_len,
)
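        # The Hugging Face tokenizer returns "attention_mask" and "token_type_ids"; rename them to the
        # "padding_mask" and "segment_ids" tensor names expected by the models in this codebase.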
        names = list(tokens.keys())
        inputs = list(tokens.values())
        if "padding_mask" not in names:
            index = names.index("attention_mask")
            names[index] = "padding_mask"
        if "segment_ids" not in names:
            index = names.index("token_type_ids")
            names[index] = "segment_ids"
dataset = TensorDataset(*inputs)
return dataset, names, []
def _create_dataset(self, baskets: List[SampleBasket]):
raise NotImplementedError("_create_dataset is not supported by UnlabeledTextProcessor")
# helper functions
def write_squad_predictions(predictions, out_filename, predictions_filename=None):
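    """
    Convert predictions into the {question_id: answer_text} format expected by the official SQuAD evaluation script
    and write them to out_filename. If a SQuAD-style dev file is passed as predictions_filename, question ids that
    are missing from the predictions are filled with an empty string.
    """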
predictions_json = {}
for x in predictions:
for p in x["predictions"]:
if p["answers"][0]["answer"] is not None:
predictions_json[p["question_id"]] = p["answers"][0]["answer"]
else:
predictions_json[
p["question_id"]
] = "" # convert No answer = None to format understood by the SQuAD eval script
if predictions_filename:
dev_labels = {}
        with open(predictions_filename, "r", encoding="utf-8") as f:
            temp = json.load(f)
for d in temp["data"]:
for p in d["paragraphs"]:
for q in p["qas"]:
if q.get("is_impossible", False):
dev_labels[q["id"]] = "is_impossible"
else:
dev_labels[q["id"]] = q["answers"][0]["text"]
        not_included = set(dev_labels) - set(predictions_json)
if len(not_included) > 0:
logger.info("There were missing predicitons for question ids: %s", list(not_included))
for x in not_included:
predictions_json[x] = ""
# os.makedirs("model_output", exist_ok=True)
# filepath = Path("model_output") / out_filename
    with open(out_filename, "w", encoding="utf-8") as f:
        json.dump(predictions_json, f)
logger.info("Written Squad predictions to: %s", out_filename)
def _read_dpr_json(
file: str,
max_samples: Optional[int] = None,
proxies: Optional[Any] = None,
num_hard_negatives: int = 1,
num_positives: int = 1,
shuffle_negatives: bool = True,
shuffle_positives: bool = False,
):
"""
Reads a Dense Passage Retrieval (DPR) data file in json format and returns a list of dictionaries.
:param file: filename of DPR data in json format
Returns:
list of dictionaries: List[dict]
each dictionary: {
"query": str -> query_text
"passages": List[dictionaries] -> [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
...]
}
example:
["query": 'who sings does he love me with reba'
"passages" : [{'title': 'Does He Love You',
'text': 'Does He Love You "Does He Love You" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba\'s album "Greatest Hits Volume Two". It is one of country music\'s several songs about a love triangle. "Does He Love You" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members',
'label': 'positive',
'external_id': '11828866'},
{'title': 'When the Nightingale Sings',
'text': "When the Nightingale Sings When The Nightingale Sings is a Middle English poem, author unknown, recorded in the British Library's Harley 2253 manuscript, verse 25. It is a love poem, extolling the beauty and lost love of an unknown maiden. When þe nyhtegale singes þe wodes waxen grene.<br> Lef ant gras ant blosme springes in aueryl y wene,<br> Ant love is to myn herte gon wiþ one spere so kene<br> Nyht ant day my blod hit drynkes myn herte deþ me tene. Ich have loved al þis er þat y may love namore,<br> Ich have siked moni syk lemmon for",
'label': 'hard_negative',
'external_id': '10891637'}]
        }]
"""
# get remote dataset if needed
if not os.path.exists(file):
logger.info("Couldn't find %s locally. Trying to download ...", file)
_download_extract_downstream_data(file, proxies=proxies)
if Path(file).suffix.lower() == ".jsonl":
dicts = []
with open(file, encoding="utf-8") as f:
for line in f:
dicts.append(json.loads(line))
else:
        with open(file, encoding="utf-8") as f:
            dicts = json.load(f)
if max_samples:
dicts = random.sample(dicts, min(max_samples, len(dicts)))
# convert DPR dictionary to standard dictionary
query_json_keys = ["question", "questions", "query"]
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
hard_negative_json_keys = [
"hard_negative_contexts",
"hard_negative_ctxs",
"hard_negative_context",
"hard_negative_ctx",
]
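    # DPR-style files use varying key names for the query and context fields; the loop below maps them all
    # onto the standard {"query": ..., "passages": [...]} schema used by the processors.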
standard_dicts = []
    for record in dicts:
        sample = {}
        passages = []
        for key, val in record.items():
if key in query_json_keys:
sample["query"] = val
elif key in positive_context_json_keys:
if shuffle_positives:
random.shuffle(val)
for passage in val[:num_positives]:
passages.append(
{
"title": passage["title"],
"text": passage["text"],
"label": "positive",
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8]),
}
)
elif key in hard_negative_json_keys:
if shuffle_negatives:
random.shuffle(val)
for passage in val[:num_hard_negatives]:
passages.append(
{
"title": passage["title"],
"text": passage["text"],
"label": "hard_negative",
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8]),
}
)
sample["passages"] = passages
standard_dicts.append(sample)
return standard_dicts
def _read_squad_file(filename: str, proxies=None):
"""Read a SQuAD json file"""
if not os.path.exists(filename):
logger.info("Couldn't find %s locally. Trying to download ...", filename)
_download_extract_downstream_data(filename, proxies)
with open(filename, "r", encoding="utf-8") as reader:
input_data = json.load(reader)["data"]
return input_data
def http_get(url, temp_file, proxies=None):
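    """Stream the file at `url` into the open file handle `temp_file` in 1 KiB chunks, showing a tqdm progress bar."""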
req = requests.get(url, stream=True, proxies=proxies)
content_length = req.headers.get("Content-Length")
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def _download_extract_downstream_data(input_file: str, proxies=None):
# download archive to temp dir and extract to correct position
full_path = Path(os.path.realpath(input_file))
directory = full_path.parent
taskname = directory.stem
datadir = directory.parent
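    # The task name and target data dir are derived from the input path, e.g. (illustrative)
    # "data/squad20/train-v2.0.json" -> taskname "squad20", datadir "data"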
logger.info("downloading and extracting file {} to dir {}".format(taskname, datadir))
if taskname not in DOWNSTREAM_TASK_MAP:
logger.error("Cannot download {}. Unknown data source.".format(taskname))
else:
if os.name == "nt": # make use of NamedTemporaryFile compatible with Windows
delete_tmp_file = False
else:
delete_tmp_file = True
with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
temp_file.flush()
temp_file.seek(0) # making tempfile accessible
            with tarfile.open(temp_file.name) as tfile:
                tfile.extractall(datadir)
# temp_file gets deleted here
def _is_json(x):
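    """Return True if x is a Path or can be serialized with json.dumps, otherwise False."""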
    if isinstance(x, Path):
        return True
    try:
        json.dumps(x)
        return True
    except Exception:
        return False