mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-20 03:54:17 +00:00
2271 lines
105 KiB
Python
2271 lines
105 KiB
Python
# pylint: disable=missing-timeout
|
|
|
|
from typing import Optional, Dict, List, Union, Any, Iterable, Type
|
|
|
|
import os
|
|
import json
|
|
import uuid
|
|
import inspect
|
|
import logging
|
|
import random
|
|
import tarfile
|
|
import tempfile
|
|
from pathlib import Path
|
|
from inspect import signature
|
|
from abc import ABC, abstractmethod
|
|
|
|
import numpy as np
|
|
import requests
|
|
from tqdm import tqdm
|
|
from torch.utils.data import TensorDataset
|
|
import transformers
|
|
from transformers import PreTrainedTokenizer, AutoTokenizer
|
|
|
|
from haystack.modeling.model.feature_extraction import (
|
|
tokenize_batch_question_answering,
|
|
tokenize_with_metadata,
|
|
truncate_sequences,
|
|
)
|
|
from haystack.modeling.data_handler.dataset import convert_features_to_dataset
|
|
from haystack.modeling.data_handler.samples import (
|
|
Sample,
|
|
SampleBasket,
|
|
get_passage_offsets,
|
|
offset_to_token_idx_vecorized,
|
|
)
|
|
from haystack.modeling.data_handler.input_features import sample_to_features_text
|
|
from haystack.utils.experiment_tracking import Tracker as tracker
|
|
|
|
|
|
DOWNSTREAM_TASK_MAP = {
|
|
"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
|
|
"covidqa": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/covidqa.tar.gz",
|
|
}
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Processor(ABC):
|
|
"""
|
|
Base class for low level data processors to convert input text to PyTorch Datasets.
|
|
"""
|
|
|
|
subclasses: dict = {}
|
|
|
|
def __init__(
|
|
self,
|
|
tokenizer,
|
|
max_seq_len: int,
|
|
train_filename: Optional[Union[Path, str]],
|
|
dev_filename: Optional[Union[Path, str]],
|
|
test_filename: Optional[Union[Path, str]],
|
|
dev_split: float,
|
|
data_dir: Optional[Union[Path, str]],
|
|
tasks: Dict = {},
|
|
proxies: Optional[Dict] = None,
|
|
multithreading_rust: Optional[bool] = True,
|
|
):
|
|
"""
|
|
:param tokenizer: Used to split a sentence (str) into tokens.
|
|
:param max_seq_len: Samples are truncated after this many tokens.
|
|
:param train_filename: The name of the file containing training data.
|
|
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
|
|
will be a slice of the train set.
|
|
:param test_filename: The name of the file containing test data.
|
|
:param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
|
|
:param data_dir: The directory in which the train, test and perhaps dev files can be found.
|
|
:param tasks: Tasks for which the processor shall extract labels from the input data.
|
|
Usually this includes a single, default task, e.g. text classification.
|
|
In a multitask setting this includes multiple tasks, e.g. 2x text classification.
|
|
The task name will be used to connect with the related PredictionHead.
|
|
:param proxies: proxy configuration to allow downloads of remote datasets.
|
|
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
|
|
:param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
|
|
Note: Enabling multithreading in Rust AND multiprocessing in python might cause
|
|
deadlocks.
|
|
"""
|
|
if not multithreading_rust:
|
|
os.environ["RAYON_RS_NUM_CPUS"] = "1"
|
|
|
|
self.tokenizer = tokenizer
|
|
self.max_seq_len = max_seq_len
|
|
self.tasks = tasks
|
|
self.proxies = proxies
|
|
|
|
# data sets
|
|
self.train_filename = train_filename
|
|
self.dev_filename = dev_filename
|
|
self.test_filename = test_filename
|
|
self.dev_split = dev_split
|
|
if data_dir:
|
|
self.data_dir = Path(data_dir)
|
|
else:
|
|
self.data_dir = None # type: ignore
|
|
self.baskets: List = []
|
|
|
|
self._log_params()
|
|
self.problematic_sample_ids: set = set()
|
|
|
|
def __init_subclass__(cls, **kwargs):
|
|
"""This automatically keeps track of all available subclasses.
|
|
Enables generic load() and load_from_dir() for all specific Processor implementation.
|
|
"""
|
|
super().__init_subclass__(**kwargs)
|
|
cls.subclasses[cls.__name__] = cls
|
|
|
|
@classmethod
|
|
def load(
|
|
cls,
|
|
processor_name: str,
|
|
data_dir: str, # TODO revert ignore
|
|
tokenizer, # type: ignore
|
|
max_seq_len: int,
|
|
train_filename: str,
|
|
dev_filename: Optional[str],
|
|
test_filename: str,
|
|
dev_split: float,
|
|
**kwargs,
|
|
):
|
|
"""
|
|
Loads the class of processor specified by processor name.
|
|
|
|
:param processor_name: The class of processor to be loaded.
|
|
:param data_dir: Directory where data files are located.
|
|
:param tokenizer: A tokenizer object
|
|
:param max_seq_len: Sequences longer than this will be truncated.
|
|
:param train_filename: The name of the file containing training data.
|
|
:param dev_filename: The name of the file containing the dev data.
|
|
If None and 0.0 < dev_split < 1.0 the dev set
|
|
will be a slice of the train set.
|
|
:param test_filename: The name of the file containing test data.
|
|
:param dev_split: The proportion of the train set that will sliced.
|
|
Only works if dev_filename is set to None
|
|
:param kwargs: placeholder for passing generic parameters
|
|
:return: An instance of the specified processor.
|
|
"""
|
|
|
|
sig = signature(cls.subclasses[processor_name])
|
|
unused_args = {k: v for k, v in kwargs.items() if k not in sig.parameters}
|
|
logger.debug(
|
|
"Got more parameters than needed for loading %s: %s. Those won't be used!", processor_name, unused_args
|
|
)
|
|
processor = cls.subclasses[processor_name](
|
|
data_dir=data_dir,
|
|
tokenizer=tokenizer,
|
|
max_seq_len=max_seq_len,
|
|
train_filename=train_filename,
|
|
dev_filename=dev_filename,
|
|
test_filename=test_filename,
|
|
dev_split=dev_split,
|
|
**kwargs,
|
|
)
|
|
|
|
return processor
|
|
|
|
@classmethod
|
|
def load_from_dir(cls, load_dir: str):
|
|
"""
|
|
Infers the specific type of Processor from a config file (e.g. SquadProcessor) and loads an instance of it.
|
|
|
|
:param load_dir: directory that contains a 'processor_config.json'
|
|
:return: An instance of a Processor Subclass (e.g. SquadProcessor)
|
|
"""
|
|
# read config
|
|
processor_config_file = Path(load_dir) / "processor_config.json"
|
|
config = json.load(open(processor_config_file))
|
|
config["inference"] = True
|
|
# init tokenizer
|
|
if "lower_case" in config.keys():
|
|
logger.warning(
|
|
"Loading tokenizer from deprecated config. "
|
|
"If you used `custom_vocab` or `never_split_chars`, this won't work anymore."
|
|
)
|
|
tokenizer = AutoTokenizer.from_pretrained(
|
|
load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"]
|
|
)
|
|
else:
|
|
tokenizer = AutoTokenizer.from_pretrained(load_dir, tokenizer_class=config["tokenizer"])
|
|
|
|
# we have to delete the tokenizer string from config, because we pass it as Object
|
|
del config["tokenizer"]
|
|
|
|
processor = cls.load(tokenizer=tokenizer, processor_name=config["processor"], **config)
|
|
|
|
for task_name, task in config["tasks"].items():
|
|
processor.add_task(
|
|
name=task_name,
|
|
metric=task["metric"],
|
|
label_list=task["label_list"],
|
|
label_column_name=task["label_column_name"],
|
|
text_column_name=task.get("text_column_name", None),
|
|
task_type=task["task_type"],
|
|
)
|
|
|
|
if processor is None:
|
|
raise Exception
|
|
|
|
return processor
|
|
|
|
@classmethod
|
|
def convert_from_transformers(
|
|
cls,
|
|
tokenizer_name_or_path,
|
|
task_type,
|
|
max_seq_len,
|
|
doc_stride,
|
|
revision=None,
|
|
tokenizer_class=None,
|
|
tokenizer_args=None,
|
|
use_fast=True,
|
|
**kwargs,
|
|
):
|
|
tokenizer_args = tokenizer_args or {}
|
|
tokenizer = AutoTokenizer.from_pretrained(
|
|
tokenizer_name_or_path,
|
|
tokenizer_class=tokenizer_class,
|
|
use_fast=use_fast,
|
|
revision=revision,
|
|
**tokenizer_args,
|
|
**kwargs,
|
|
)
|
|
|
|
# TODO infer task_type automatically from config (if possible)
|
|
if task_type == "question_answering":
|
|
processor = SquadProcessor(
|
|
tokenizer=tokenizer,
|
|
max_seq_len=max_seq_len,
|
|
label_list=["start_token", "end_token"],
|
|
metric="squad",
|
|
data_dir="data",
|
|
doc_stride=doc_stride,
|
|
)
|
|
elif task_type == "embeddings":
|
|
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=max_seq_len)
|
|
|
|
else:
|
|
raise ValueError(
|
|
f"`task_type` {task_type} is not supported yet. "
|
|
f"Valid options for arg `task_type`: 'question_answering', "
|
|
f"'embeddings', "
|
|
)
|
|
|
|
return processor
|
|
|
|
def save(self, save_dir: str):
|
|
"""
|
|
Saves the vocabulary to file and also creates a json file containing all the
|
|
information needed to load the same processor.
|
|
|
|
:param save_dir: Directory where the files are to be saved
|
|
:return: None
|
|
"""
|
|
os.makedirs(save_dir, exist_ok=True)
|
|
config = self.generate_config()
|
|
# save tokenizer incl. attributes
|
|
config["tokenizer"] = self.tokenizer.__class__.__name__
|
|
|
|
# Because the fast tokenizers expect a str and not Path
|
|
# always convert Path to str here.
|
|
self.tokenizer.save_pretrained(str(save_dir))
|
|
|
|
# save processor
|
|
config["processor"] = self.__class__.__name__
|
|
output_config_file = Path(save_dir) / "processor_config.json"
|
|
with open(output_config_file, "w") as file:
|
|
json.dump(config, file)
|
|
|
|
def generate_config(self):
|
|
"""
|
|
Generates config file from Class and instance attributes (only for sensible config parameters).
|
|
"""
|
|
config = {}
|
|
# self.__dict__ doesn't give parent class attributes
|
|
for key, value in inspect.getmembers(self):
|
|
if _is_json(value) and key[0] != "_":
|
|
if issubclass(type(value), Path):
|
|
value = str(value)
|
|
config[key] = value
|
|
return config
|
|
|
|
# TODO potentially remove tasks from code - multitask learning is not supported anyways
|
|
def add_task(
|
|
self, name, metric, label_list, label_column_name=None, label_name=None, task_type=None, text_column_name=None
|
|
):
|
|
if type(label_list) is not list:
|
|
raise ValueError(f"Argument `label_list` must be of type list. Got: f{type(label_list)}")
|
|
|
|
if label_name is None:
|
|
label_name = f"{name}_label"
|
|
label_tensor_name = label_name + "_ids"
|
|
self.tasks[name] = {
|
|
"label_list": label_list,
|
|
"metric": metric,
|
|
"label_tensor_name": label_tensor_name,
|
|
"label_name": label_name,
|
|
"label_column_name": label_column_name,
|
|
"text_column_name": text_column_name,
|
|
"task_type": task_type,
|
|
}
|
|
|
|
@abstractmethod
|
|
def file_to_dicts(self, file: str) -> List[dict]:
|
|
raise NotImplementedError()
|
|
|
|
@abstractmethod
|
|
def dataset_from_dicts(
|
|
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
|
|
):
|
|
raise NotImplementedError()
|
|
|
|
@abstractmethod
|
|
def _create_dataset(self, baskets: List[SampleBasket]):
|
|
raise NotImplementedError
|
|
|
|
@staticmethod
|
|
def log_problematic(problematic_sample_ids):
|
|
if problematic_sample_ids:
|
|
n_problematic = len(problematic_sample_ids)
|
|
problematic_id_str = ", ".join([str(i) for i in problematic_sample_ids])
|
|
logger.error(
|
|
"Unable to convert %s samples to features. Their ids are : %s", n_problematic, problematic_id_str
|
|
)
|
|
|
|
@staticmethod
|
|
def _check_sample_features(basket: SampleBasket):
|
|
"""
|
|
Check if all samples in the basket has computed its features.
|
|
|
|
:param basket: the basket containing the samples
|
|
|
|
:return: True if all the samples in the basket has computed its features, False otherwise
|
|
"""
|
|
return basket.samples and not any(sample.features is None for sample in basket.samples)
|
|
|
|
def _log_samples(self, n_samples: int, baskets: List[SampleBasket]):
|
|
logger.debug("*** Show %s random examples ***", n_samples)
|
|
if len(baskets) == 0:
|
|
logger.debug("*** No samples to show because there are no baskets ***")
|
|
return
|
|
for i in range(n_samples):
|
|
random_basket = random.choice(baskets)
|
|
random_sample = random.choice(random_basket.samples) # type: ignore
|
|
logger.debug(random_sample)
|
|
|
|
def _log_params(self):
|
|
params = {"processor": self.__class__.__name__, "tokenizer": self.tokenizer.__class__.__name__}
|
|
names = ["max_seq_len", "dev_split"]
|
|
for name in names:
|
|
value = getattr(self, name)
|
|
params.update({name: str(value)})
|
|
tracker.track_params(params)
|
|
|
|
|
|
class SquadProcessor(Processor):
|
|
"""
|
|
Convert QA data (in SQuAD Format)
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
tokenizer, # type: ignore
|
|
max_seq_len: int,
|
|
data_dir: Optional[Union[Path, str]],
|
|
label_list: Optional[List[str]] = None,
|
|
metric="squad", # type: ignore
|
|
train_filename: Optional[Union[Path, str]] = Path("train-v2.0.json"),
|
|
dev_filename: Optional[Union[Path, str]] = Path("dev-v2.0.json"),
|
|
test_filename: Optional[Union[Path, str]] = None,
|
|
dev_split: float = 0,
|
|
doc_stride: int = 128,
|
|
max_query_length: int = 64,
|
|
proxies: Optional[dict] = None,
|
|
max_answers: int = 6,
|
|
**kwargs,
|
|
):
|
|
"""
|
|
:param tokenizer: Used to split a sentence (str) into tokens.
|
|
:param max_seq_len: Samples are truncated after this many tokens.
|
|
:param data_dir: The directory in which the train and dev files can be found.
|
|
If not available the dataset will be loaded automaticaly
|
|
if the last directory has the same name as a predefined dataset.
|
|
These predefined datasets are defined as the keys in the dict at
|
|
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
|
|
:param label_list: list of labels to predict (strings). For most cases this should be: ["start_token", "end_token"]
|
|
:param metric: name of metric that shall be used for evaluation, can be "squad" or "top_n_accuracy"
|
|
:param train_filename: The name of the file containing training data.
|
|
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
|
|
will be a slice of the train set.
|
|
:param test_filename: None
|
|
:param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
|
|
:param doc_stride: When the document containing the answer is too long it gets split into part, strided by doc_stride
|
|
:param max_query_length: Maximum length of the question (in number of subword tokens)
|
|
:param proxies: proxy configuration to allow downloads of remote datasets.
|
|
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
|
|
:param max_answers: number of answers to be converted. QA dev or train sets can contain multi-way annotations, which are converted to arrays of max_answer length
|
|
:param kwargs: placeholder for passing generic parameters
|
|
"""
|
|
self.ph_output_type = "per_token_squad"
|
|
|
|
# validate max_seq_len
|
|
assert max_seq_len <= tokenizer.model_max_length, (
|
|
"max_seq_len cannot be greater than the maximum sequence length handled by the model: "
|
|
f"got max_seq_len={max_seq_len}, while the model maximum length is {tokenizer.model_max_length}. "
|
|
"Please adjust max_seq_len accordingly or use another model "
|
|
)
|
|
|
|
assert doc_stride < (max_seq_len - max_query_length), (
|
|
"doc_stride ({}) is longer than max_seq_len ({}) minus space reserved for query tokens ({}). \nThis means that there will be gaps "
|
|
"as the passage windows slide, causing the model to skip over parts of the document.\n"
|
|
"Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n "
|
|
"Or decrease max_query_length".format(doc_stride, max_seq_len, max_query_length)
|
|
)
|
|
|
|
self.doc_stride = doc_stride
|
|
self.max_query_length = max_query_length
|
|
self.max_answers = max_answers
|
|
super(SquadProcessor, self).__init__(
|
|
tokenizer=tokenizer,
|
|
max_seq_len=max_seq_len,
|
|
train_filename=train_filename,
|
|
dev_filename=dev_filename,
|
|
test_filename=test_filename,
|
|
dev_split=dev_split,
|
|
data_dir=data_dir,
|
|
tasks={},
|
|
proxies=proxies,
|
|
)
|
|
self._initialize_special_tokens_count()
|
|
if metric and label_list:
|
|
self.add_task("question_answering", metric, label_list)
|
|
else:
|
|
logger.info(
|
|
"Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
|
|
"using the default task or add a custom task later via processor.add_task()"
|
|
)
|
|
|
|
def dataset_from_dicts(
|
|
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
|
|
):
|
|
"""
|
|
Convert input dictionaries into a pytorch dataset for Question Answering.
|
|
For this we have an internal representation called "baskets".
|
|
Each basket is a question-document pair.
|
|
Each stage adds or transforms specific information to our baskets.
|
|
|
|
:param dicts: dict, input dictionary with SQuAD style information present
|
|
:param indices: list, indices used during multiprocessing so that IDs assigned to our baskets is unique
|
|
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
|
|
"""
|
|
# Convert to standard format
|
|
pre_baskets = [self.convert_qa_input_dict(x) for x in dicts] # TODO move to input object conversion
|
|
|
|
# Tokenize documents and questions
|
|
baskets = tokenize_batch_question_answering(pre_baskets, self.tokenizer, indices)
|
|
|
|
# Split documents into smaller passages to fit max_seq_len
|
|
baskets = self._split_docs_into_passages(baskets)
|
|
|
|
# Convert answers from string to token space, skip this step for inference
|
|
if not return_baskets:
|
|
baskets = self._convert_answers(baskets)
|
|
|
|
# Convert internal representation (nested baskets + samples with mixed types) to pytorch features (arrays of numbers)
|
|
baskets = self._passages_to_pytorch_features(baskets, return_baskets)
|
|
|
|
# Convert features into pytorch dataset, this step also removes potential errors during preprocessing
|
|
dataset, tensor_names, baskets = self._create_dataset(baskets)
|
|
|
|
# Logging
|
|
if indices:
|
|
if 0 in indices:
|
|
self._log_samples(n_samples=1, baskets=self.baskets)
|
|
|
|
# During inference we need to keep the information contained in baskets.
|
|
if return_baskets:
|
|
return dataset, tensor_names, self.problematic_sample_ids, baskets
|
|
else:
|
|
return dataset, tensor_names, self.problematic_sample_ids
|
|
|
|
def file_to_dicts(self, file: str) -> List[dict]:
|
|
nested_dicts = _read_squad_file(filename=file)
|
|
dicts = [y for x in nested_dicts for y in x["paragraphs"]]
|
|
return dicts
|
|
|
|
# TODO use Input Objects instead of this function, remove Natural Questions (NQ) related code
|
|
def convert_qa_input_dict(self, infer_dict: dict) -> Dict[str, Any]:
|
|
"""Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or
|
|
["text", "questions"] (api format). This function converts the latter into the former. It also converts the
|
|
is_impossible field to answer_type so that NQ and SQuAD dicts have the same format.
|
|
"""
|
|
# validate again max_seq_len
|
|
assert self.max_seq_len <= self.tokenizer.model_max_length, (
|
|
"max_seq_len cannot be greater than the maximum sequence length handled by the model: "
|
|
f"got max_seq_len={self.max_seq_len}, while the model maximum length is {self.tokenizer.model_max_length}. "
|
|
"Please adjust max_seq_len accordingly or use another model "
|
|
)
|
|
|
|
# check again for doc stride vs max_seq_len when. Parameters can be changed for already initialized models (e.g. in haystack)
|
|
assert self.doc_stride < (self.max_seq_len - self.max_query_length), (
|
|
"doc_stride ({}) is longer than max_seq_len ({}) minus space reserved for query tokens ({}). \nThis means that there will be gaps "
|
|
"as the passage windows slide, causing the model to skip over parts of the document.\n"
|
|
"Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384)\n "
|
|
"Or decrease max_query_length".format(self.doc_stride, self.max_seq_len, self.max_query_length)
|
|
)
|
|
|
|
try:
|
|
# Check if infer_dict is already in internal json format
|
|
if "context" in infer_dict and "qas" in infer_dict:
|
|
return infer_dict
|
|
# converts dicts from inference mode to data structure used in Haystack
|
|
questions = infer_dict["questions"]
|
|
text = infer_dict["text"]
|
|
uid = infer_dict.get("id", None)
|
|
qas = [{"question": q, "id": uid, "answers": [], "answer_type": None} for i, q in enumerate(questions)]
|
|
converted = {"qas": qas, "context": text}
|
|
return converted
|
|
except KeyError:
|
|
raise Exception("Input does not have the expected format")
|
|
|
|
def _initialize_special_tokens_count(self):
|
|
vec = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=["a"], token_ids_1=["b"])
|
|
self.sp_toks_start = vec.index("a")
|
|
self.sp_toks_mid = vec.index("b") - self.sp_toks_start - 1
|
|
self.sp_toks_end = len(vec) - vec.index("b") - 1
|
|
|
|
def _split_docs_into_passages(self, baskets: List[SampleBasket]):
|
|
"""
|
|
Because of the sequence length limitation of Language Models, the documents need to be divided into smaller
|
|
parts that we call passages.
|
|
"""
|
|
n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True)
|
|
for basket in baskets:
|
|
samples = []
|
|
########## perform some basic checking
|
|
# TODO, eventually move checking into input validation functions
|
|
# ignore samples with empty context
|
|
if basket.raw["document_text"] == "":
|
|
logger.warning("Ignoring sample with empty context")
|
|
continue
|
|
########## end checking
|
|
|
|
# Calculate the number of tokens that can be reserved for the passage. This is calculated by considering
|
|
# the max_seq_len, the number of tokens in the question and the number of special tokens that will be added
|
|
# when the question and passage are joined (e.g. [CLS] and [SEP])
|
|
passage_len_t = (
|
|
self.max_seq_len - len(basket.raw["question_tokens"][: self.max_query_length]) - n_special_tokens
|
|
)
|
|
|
|
# passage_spans is a list of dictionaries where each defines the start and end of each passage
|
|
# on both token and character level
|
|
try:
|
|
passage_spans = get_passage_offsets(
|
|
basket.raw["document_offsets"], self.doc_stride, passage_len_t, basket.raw["document_text"]
|
|
)
|
|
except Exception as e:
|
|
logger.warning(
|
|
f"Could not devide document into passages. Document: {basket.raw['document_text'][:200]}\n"
|
|
f"With error: {e}"
|
|
)
|
|
passage_spans = []
|
|
|
|
for passage_span in passage_spans:
|
|
# Unpack each variable in the dictionary. The "_t" and "_c" indicate
|
|
# whether the index is on the token or character level
|
|
passage_start_t = passage_span["passage_start_t"]
|
|
passage_end_t = passage_span["passage_end_t"]
|
|
passage_start_c = passage_span["passage_start_c"]
|
|
passage_end_c = passage_span["passage_end_c"]
|
|
|
|
passage_start_of_word = basket.raw["document_start_of_word"][passage_start_t:passage_end_t]
|
|
passage_tokens = basket.raw["document_tokens"][passage_start_t:passage_end_t]
|
|
passage_text = basket.raw["document_text"][passage_start_c:passage_end_c]
|
|
|
|
clear_text = {
|
|
"passage_text": passage_text,
|
|
"question_text": basket.raw["question_text"],
|
|
"passage_id": passage_span["passage_id"],
|
|
}
|
|
tokenized = {
|
|
"passage_start_t": passage_start_t,
|
|
"passage_start_c": passage_start_c,
|
|
"passage_tokens": passage_tokens,
|
|
"passage_start_of_word": passage_start_of_word,
|
|
"question_tokens": basket.raw["question_tokens"][: self.max_query_length],
|
|
"question_offsets": basket.raw["question_offsets"][: self.max_query_length],
|
|
"question_start_of_word": basket.raw["question_start_of_word"][: self.max_query_length],
|
|
}
|
|
# The sample ID consists of internal_id and a passage numbering
|
|
sample_id = f"{basket.id_internal}-{passage_span['passage_id']}"
|
|
samples.append(Sample(id=sample_id, clear_text=clear_text, tokenized=tokenized))
|
|
|
|
basket.samples = samples
|
|
|
|
return baskets
|
|
|
|
def _convert_answers(self, baskets: List[SampleBasket]):
|
|
"""
|
|
Converts answers that are pure strings into the token based representation with start and end token offset.
|
|
Can handle multiple answers per question document pair as is common for development/text sets
|
|
"""
|
|
for basket in baskets:
|
|
error_in_answer = False
|
|
for num, sample in enumerate(basket.samples): # type: ignore
|
|
# Dealing with potentially multiple answers (e.g. Squad dev set)
|
|
# Initializing a numpy array of shape (max_answers, 2), filled with -1 for missing values
|
|
label_idxs = np.full((self.max_answers, 2), fill_value=-1)
|
|
|
|
if error_in_answer or (len(basket.raw["answers"]) == 0):
|
|
# If there are no answers we set
|
|
label_idxs[0, :] = 0
|
|
else:
|
|
# For all other cases we use start and end token indices, that are relative to the passage
|
|
for i, answer in enumerate(basket.raw["answers"]):
|
|
# Calculate start and end relative to document
|
|
answer_len_c = len(answer["text"])
|
|
answer_start_c = answer["answer_start"]
|
|
answer_end_c = answer_start_c + answer_len_c - 1
|
|
|
|
# Convert character offsets to token offsets on document level
|
|
answer_start_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_start_c)
|
|
answer_end_t = offset_to_token_idx_vecorized(basket.raw["document_offsets"], answer_end_c)
|
|
|
|
# Adjust token offsets to be relative to the passage
|
|
answer_start_t -= sample.tokenized["passage_start_t"] # type: ignore
|
|
answer_end_t -= sample.tokenized["passage_start_t"] # type: ignore
|
|
|
|
# Initialize some basic variables
|
|
question_len_t = len(sample.tokenized["question_tokens"]) # type: ignore
|
|
passage_len_t = len(sample.tokenized["passage_tokens"]) # type: ignore
|
|
|
|
# Check that start and end are contained within this passage
|
|
# answer_end_t is 0 if the first token is the answer
|
|
# answer_end_t is passage_len_t if the last token is the answer
|
|
if passage_len_t > answer_start_t >= 0 and passage_len_t >= answer_end_t >= 0:
|
|
# Then adjust the start and end offsets by adding question and special token
|
|
label_idxs[i][0] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_start_t
|
|
label_idxs[i][1] = self.sp_toks_start + question_len_t + self.sp_toks_mid + answer_end_t
|
|
# If the start or end of the span answer is outside the passage, treat passage as no_answer
|
|
else:
|
|
label_idxs[i][0] = 0
|
|
label_idxs[i][1] = 0
|
|
|
|
########## answer checking ##############################
|
|
# TODO, move this checking into input validation functions and delete wrong examples there
|
|
# Cases where the answer is not within the current passage will be turned into no answers by the featurization fn
|
|
if answer_start_t < 0 or answer_end_t >= passage_len_t:
|
|
pass
|
|
else:
|
|
doc_text = basket.raw["document_text"]
|
|
answer_indices = doc_text[answer_start_c : answer_end_c + 1]
|
|
answer_text = answer["text"]
|
|
# check if answer string can be found in context
|
|
if answer_text not in doc_text:
|
|
logger.warning(
|
|
f"Answer '{answer['text']}' not contained in context.\n"
|
|
f"Example will not be converted for training/evaluation."
|
|
)
|
|
error_in_answer = True
|
|
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
|
|
label_idxs[i][1] = -100
|
|
break # Break loop around answers, so the error message is not shown multiple times
|
|
if answer_indices.strip() != answer_text.strip():
|
|
logger.warning(
|
|
f"Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'.\n"
|
|
f"Example will not be converted for training/evaluation."
|
|
)
|
|
error_in_answer = True
|
|
label_idxs[i][0] = -100 # TODO remove this hack also from featurization
|
|
label_idxs[i][1] = -100
|
|
break # Break loop around answers, so the error message is not shown multiple times
|
|
########## end of checking ####################
|
|
|
|
sample.tokenized["labels"] = label_idxs # type: ignore
|
|
|
|
return baskets
|
|
|
|
def _passages_to_pytorch_features(self, baskets: List[SampleBasket], return_baskets: bool):
|
|
"""
|
|
Convert internal representation (nested baskets + samples with mixed types) to python features (arrays of numbers).
|
|
We first join question and passages into one large vector.
|
|
Then we add vectors for: - input_ids (token ids)
|
|
- segment_ids (does a token belong to question or document)
|
|
- padding_mask
|
|
- span_mask (valid answer tokens)
|
|
- start_of_word
|
|
"""
|
|
for basket in baskets:
|
|
# Add features to samples
|
|
for num, sample in enumerate(basket.samples): # type: ignore
|
|
# Initialize some basic variables
|
|
if sample.tokenized is not None:
|
|
question_tokens = sample.tokenized["question_tokens"]
|
|
question_start_of_word = sample.tokenized["question_start_of_word"]
|
|
question_len_t = len(question_tokens)
|
|
passage_start_t = sample.tokenized["passage_start_t"]
|
|
passage_tokens = sample.tokenized["passage_tokens"]
|
|
passage_start_of_word = sample.tokenized["passage_start_of_word"]
|
|
passage_len_t = len(passage_tokens)
|
|
sample_id = [int(x) for x in sample.id.split("-")]
|
|
|
|
# - Combines question_tokens and passage_tokens into a single vector called input_ids
|
|
# - input_ids also contains special tokens (e.g. CLS or SEP tokens).
|
|
# - It will have length = question_len_t + passage_len_t + n_special_tokens. This may be less than
|
|
# max_seq_len but never greater since truncation was already performed when the document was chunked into passages
|
|
question_input_ids = sample.tokenized["question_tokens"]
|
|
passage_input_ids = sample.tokenized["passage_tokens"]
|
|
|
|
input_ids = self.tokenizer.build_inputs_with_special_tokens(
|
|
token_ids_0=question_input_ids, token_ids_1=passage_input_ids
|
|
)
|
|
|
|
segment_ids = self.tokenizer.create_token_type_ids_from_sequences(
|
|
token_ids_0=question_input_ids, token_ids_1=passage_input_ids
|
|
)
|
|
# To make the start index of passage tokens the start manually
|
|
seq_2_start_t = self.sp_toks_start + question_len_t + self.sp_toks_mid
|
|
|
|
start_of_word = (
|
|
[0] * self.sp_toks_start
|
|
+ question_start_of_word
|
|
+ [0] * self.sp_toks_mid
|
|
+ passage_start_of_word
|
|
+ [0] * self.sp_toks_end
|
|
)
|
|
|
|
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
|
# tokens are attended to.
|
|
padding_mask = [1] * len(input_ids)
|
|
|
|
# The span_mask has 1 for tokens that are valid start or end tokens for QA spans.
|
|
# 0s are assigned to question tokens, mid special tokens, end special tokens, and padding
|
|
# Note that start special tokens are assigned 1 since they can be chosen for a no_answer prediction
|
|
span_mask = [1] * self.sp_toks_start
|
|
span_mask += [0] * question_len_t
|
|
span_mask += [0] * self.sp_toks_mid
|
|
span_mask += [1] * passage_len_t
|
|
span_mask += [0] * self.sp_toks_end
|
|
|
|
# Pad up to the sequence length. For certain models, the pad token id is not 0 (e.g. Roberta where it is 1)
|
|
pad_idx = self.tokenizer.pad_token_id
|
|
padding = [pad_idx] * (self.max_seq_len - len(input_ids))
|
|
zero_padding = [0] * (self.max_seq_len - len(input_ids))
|
|
|
|
input_ids += padding
|
|
padding_mask += zero_padding
|
|
segment_ids += zero_padding
|
|
start_of_word += zero_padding
|
|
span_mask += zero_padding
|
|
|
|
# TODO possibly remove these checks after input validation is in place
|
|
len_check = (
|
|
len(input_ids) == len(padding_mask) == len(segment_ids) == len(start_of_word) == len(span_mask)
|
|
)
|
|
id_check = len(sample_id) == 3
|
|
label_check = return_baskets or len(sample.tokenized.get("labels", [])) == self.max_answers # type: ignore
|
|
# labels are set to -100 when answer cannot be found
|
|
label_check2 = return_baskets or np.all(sample.tokenized["labels"] > -99) # type: ignore
|
|
if len_check and id_check and label_check and label_check2:
|
|
# - The first of the labels will be used in train, and the full array will be used in eval.
|
|
# - start_of_word and spec_tok_mask are not actually needed by model.forward() but are needed for
|
|
# model.formatted_preds() during inference for creating answer strings
|
|
# - passage_start_t is index of passage's first token relative to document
|
|
feature_dict = {
|
|
"input_ids": input_ids,
|
|
"padding_mask": padding_mask,
|
|
"segment_ids": segment_ids,
|
|
"passage_start_t": passage_start_t,
|
|
"start_of_word": start_of_word,
|
|
"labels": sample.tokenized.get("labels", []), # type: ignore
|
|
"id": sample_id,
|
|
"seq_2_start_t": seq_2_start_t,
|
|
"span_mask": span_mask,
|
|
}
|
|
# other processor's features can be lists
|
|
sample.features = [feature_dict] # type: ignore
|
|
else:
|
|
self.problematic_sample_ids.add(sample.id)
|
|
sample.features = None
|
|
return baskets
|
|
|
|
def _create_dataset(self, baskets: List[SampleBasket]):
|
|
"""
|
|
Convert python features into pytorch dataset.
|
|
Also removes potential errors during preprocessing.
|
|
Flattens nested basket structure to create a flat list of features
|
|
"""
|
|
features_flat: List[dict] = []
|
|
basket_to_remove = []
|
|
for basket in baskets:
|
|
if self._check_sample_features(basket):
|
|
for sample in basket.samples: # type: ignore
|
|
features_flat.extend(sample.features) # type: ignore
|
|
else:
|
|
# remove the entire basket
|
|
basket_to_remove.append(basket)
|
|
if len(basket_to_remove) > 0:
|
|
for basket in basket_to_remove:
|
|
# if basket_to_remove is not empty remove the related baskets
|
|
baskets.remove(basket)
|
|
|
|
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
|
|
return dataset, tensor_names, baskets
|
|
|
|
|
|
class TextSimilarityProcessor(Processor):
|
|
"""
|
|
Used to handle the Dense Passage Retrieval (DPR) datasets that come in json format, example: biencoder-nq-train.json, biencoder-nq-dev.json, trivia-train.json, trivia-dev.json
|
|
|
|
Datasets can be downloaded from the official DPR github repository (https://github.com/facebookresearch/DPR)
|
|
dataset format: list of dictionaries with keys: 'dataset', 'question', 'answers', 'positive_ctxs', 'negative_ctxs', 'hard_negative_ctxs'
|
|
Each sample is a dictionary of format:
|
|
{"dataset": str,
|
|
"question": str,
|
|
"answers": list of str
|
|
"positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
|
|
"negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
|
|
"hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
|
|
}
|
|
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
query_tokenizer, # type: ignore
|
|
passage_tokenizer, # type: ignore
|
|
max_seq_len_query: int,
|
|
max_seq_len_passage: int,
|
|
data_dir: str = "",
|
|
metric=None, # type: ignore
|
|
train_filename: str = "train.json",
|
|
dev_filename: Optional[str] = None,
|
|
test_filename: Optional[str] = "test.json",
|
|
dev_split: float = 0.1,
|
|
proxies: Optional[dict] = None,
|
|
max_samples: Optional[int] = None,
|
|
embed_title: bool = True,
|
|
num_positives: int = 1,
|
|
num_hard_negatives: int = 1,
|
|
shuffle_negatives: bool = True,
|
|
shuffle_positives: bool = False,
|
|
label_list: Optional[List[str]] = None,
|
|
**kwargs,
|
|
):
|
|
"""
|
|
:param query_tokenizer: Used to split a question (str) into tokens
|
|
:param passage_tokenizer: Used to split a passage (str) into tokens.
|
|
:param max_seq_len_query: Query samples are truncated after this many tokens.
|
|
:param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens.
|
|
:param data_dir: The directory in which the train and dev files can be found.
|
|
If not available the dataset will be loaded automaticaly
|
|
if the last directory has the same name as a predefined dataset.
|
|
These predefined datasets are defined as the keys in the dict at
|
|
`haystack.basics.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/haystack/blob/main/haystack/basics/data_handler/utils.py>`_.
|
|
:param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
|
|
Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
|
|
For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
|
|
:param train_filename: The name of the file containing training data.
|
|
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
|
|
will be a slice of the train set.
|
|
:param test_filename: None
|
|
:param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None
|
|
:param proxies: proxy configuration to allow downloads of remote datasets.
|
|
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
|
|
:param max_samples: maximum number of samples to use
|
|
:param embed_title: Whether to embed title in passages during tensorization (bool),
|
|
:param num_hard_negatives: maximum number to hard negative context passages in a sample
|
|
:param num_positives: maximum number to positive context passages in a sample
|
|
:param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the num_hard_negative number of passages
|
|
:param shuffle_positives: Whether to shuffle all the positive passages before selecting the num_positive number of passages
|
|
:param label_list: list of labels to predict. Usually ["hard_negative", "positive"]
|
|
:param kwargs: placeholder for passing generic parameters
|
|
"""
|
|
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs
|
|
|
|
# Custom processor attributes
|
|
self.max_samples = max_samples
|
|
self.query_tokenizer = query_tokenizer
|
|
self.passage_tokenizer = passage_tokenizer
|
|
self.embed_title = embed_title
|
|
self.num_hard_negatives = num_hard_negatives
|
|
self.num_positives = num_positives
|
|
self.shuffle_negatives = shuffle_negatives
|
|
self.shuffle_positives = shuffle_positives
|
|
self.max_seq_len_query = max_seq_len_query
|
|
self.max_seq_len_passage = max_seq_len_passage
|
|
|
|
super(TextSimilarityProcessor, self).__init__(
|
|
tokenizer=None, # type: ignore
|
|
max_seq_len=0,
|
|
train_filename=train_filename,
|
|
dev_filename=dev_filename,
|
|
test_filename=test_filename,
|
|
dev_split=dev_split,
|
|
data_dir=data_dir,
|
|
tasks={},
|
|
proxies=proxies,
|
|
)
|
|
if metric:
|
|
self.add_task(
|
|
name="text_similarity",
|
|
metric=metric,
|
|
label_list=label_list,
|
|
label_name="label",
|
|
task_type="text_similarity",
|
|
)
|
|
else:
|
|
logger.info(
|
|
"Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
|
|
"using the default task or add a custom task later via processor.add_task()"
|
|
)
|
|
|
|
@classmethod
|
|
def load_from_dir(cls, load_dir: str):
|
|
"""
|
|
Overwriting method from parent class to **always** load the TextSimilarityProcessor instead of the specific class stored in the config.
|
|
|
|
:param load_dir: directory that contains a 'processor_config.json'
|
|
:return: An instance of an TextSimilarityProcessor
|
|
"""
|
|
# read config
|
|
processor_config_file = Path(load_dir) / "processor_config.json"
|
|
config = json.load(open(processor_config_file))
|
|
# init tokenizers
|
|
query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"])
|
|
query_tokenizer = query_tokenizer_class.from_pretrained(
|
|
pretrained_model_name_or_path=load_dir, subfolder="query"
|
|
)
|
|
passage_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["passage_tokenizer"])
|
|
passage_tokenizer = passage_tokenizer_class.from_pretrained(
|
|
pretrained_model_name_or_path=load_dir, subfolder="passage"
|
|
)
|
|
|
|
# we have to delete the tokenizer string from config, because we pass it as Object
|
|
del config["query_tokenizer"]
|
|
del config["passage_tokenizer"]
|
|
|
|
processor = cls.load(
|
|
query_tokenizer=query_tokenizer,
|
|
passage_tokenizer=passage_tokenizer,
|
|
processor_name="TextSimilarityProcessor",
|
|
**config,
|
|
)
|
|
for task_name, task in config["tasks"].items():
|
|
processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])
|
|
|
|
if processor is None:
|
|
raise Exception
|
|
|
|
return processor
|
|
|
|
def save(self, save_dir: Union[str, Path]):
|
|
"""
|
|
Saves the vocabulary to file and also creates a json file containing all the
|
|
information needed to load the same processor.
|
|
|
|
:param save_dir: Directory where the files are to be saved
|
|
:return: None
|
|
"""
|
|
if isinstance(save_dir, str):
|
|
save_dir = Path(save_dir)
|
|
os.makedirs(save_dir, exist_ok=True)
|
|
config = self.generate_config()
|
|
# save tokenizer incl. attributes
|
|
config["query_tokenizer"] = self.query_tokenizer.__class__.__name__
|
|
config["passage_tokenizer"] = self.passage_tokenizer.__class__.__name__
|
|
|
|
# Because the fast tokenizers expect a str and not Path
|
|
# always convert Path to str here.
|
|
self.query_tokenizer.save_pretrained(str(save_dir / "query"))
|
|
self.passage_tokenizer.save_pretrained(str(save_dir / "passage"))
|
|
|
|
# save processor
|
|
config["processor"] = self.__class__.__name__
|
|
output_config_file = Path(save_dir) / "processor_config.json"
|
|
with open(output_config_file, "w") as file:
|
|
json.dump(config, file)
|
|
|
|
def dataset_from_dicts(
|
|
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
|
|
):
|
|
"""
|
|
Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR).
|
|
For conversion we have an internal representation called "baskets".
|
|
Each basket is one query and related text passages (positive passages fitting to the query and negative
|
|
passages that do not fit the query)
|
|
Each stage adds or transforms specific information to our baskets.
|
|
|
|
:param dicts: input dictionary with DPR-style content
|
|
{"query": str,
|
|
"passages": List[
|
|
{'title': str,
|
|
'text': str,
|
|
'label': 'hard_negative',
|
|
'external_id': str},
|
|
....
|
|
]
|
|
}
|
|
:param indices: indices used during multiprocessing so that IDs assigned to our baskets is unique
|
|
:param return_baskets: whether to return the baskets or not (baskets are needed during inference)
|
|
:return: dataset, tensor_names, problematic_ids, [baskets]
|
|
"""
|
|
# Take the dict and insert into our basket structure, this stages also adds an internal IDs
|
|
baskets = self._fill_baskets(dicts, indices)
|
|
|
|
# Separat conversion of query
|
|
baskets = self._convert_queries(baskets=baskets)
|
|
|
|
# and context passages. When converting the context the label is also assigned.
|
|
baskets = self._convert_contexts(baskets=baskets)
|
|
|
|
# Convert features into pytorch dataset, this step also removes and logs potential errors during preprocessing
|
|
dataset, tensor_names, problematic_ids, baskets = self._create_dataset(baskets)
|
|
|
|
if problematic_ids:
|
|
logger.error(
|
|
f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
|
|
)
|
|
|
|
if return_baskets:
|
|
return dataset, tensor_names, problematic_ids, baskets
|
|
else:
|
|
return dataset, tensor_names, problematic_ids
|
|
|
|
def file_to_dicts(self, file: str) -> List[dict]:
|
|
"""
|
|
Converts a Dense Passage Retrieval (DPR) data file in json format to a list of dictionaries.
|
|
|
|
:param file: filename of DPR data in json format
|
|
Each sample is a dictionary of format:
|
|
{"dataset": str,
|
|
"question": str,
|
|
"answers": list of str
|
|
"positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
|
|
"negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
|
|
"hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str}
|
|
}
|
|
|
|
|
|
Returns:
|
|
list of dictionaries: List[dict]
|
|
each dictionary:
|
|
{"query": str,
|
|
"passages": [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
|
|
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
|
|
...]}
|
|
"""
|
|
dicts = _read_dpr_json(
|
|
file,
|
|
max_samples=self.max_samples,
|
|
num_hard_negatives=self.num_hard_negatives,
|
|
num_positives=self.num_positives,
|
|
shuffle_negatives=self.shuffle_negatives,
|
|
shuffle_positives=self.shuffle_positives,
|
|
)
|
|
|
|
# shuffle dicts to make sure that similar positive passages do not end up in one batch
|
|
dicts = random.sample(dicts, len(dicts))
|
|
return dicts
|
|
|
|
def _fill_baskets(self, dicts: List[dict], indices: Optional[List[int]]):
|
|
baskets = []
|
|
if not indices:
|
|
indices = list(range(len(dicts)))
|
|
for d, id_internal in zip(dicts, indices):
|
|
basket = SampleBasket(id_external=None, id_internal=id_internal, raw=d)
|
|
baskets.append(basket)
|
|
return baskets
|
|
|
|
def _convert_queries(self, baskets: List[SampleBasket]):
|
|
for basket in baskets:
|
|
clear_text = {}
|
|
tokenized = {}
|
|
features = [{}] # type: ignore
|
|
# extract query, positive context passages and titles, hard-negative passages and titles
|
|
if "query" in basket.raw:
|
|
try:
|
|
query = self._normalize_question(basket.raw["query"])
|
|
|
|
# featurize the query
|
|
query_inputs = self.query_tokenizer(
|
|
query,
|
|
max_length=self.max_seq_len_query,
|
|
add_special_tokens=True,
|
|
truncation=True,
|
|
truncation_strategy="longest_first",
|
|
padding="max_length",
|
|
return_token_type_ids=True,
|
|
)
|
|
|
|
# tokenize query
|
|
tokenized_query = self.query_tokenizer.convert_ids_to_tokens(query_inputs["input_ids"])
|
|
|
|
if len(tokenized_query) == 0:
|
|
logger.warning(
|
|
f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
|
|
)
|
|
return None
|
|
|
|
clear_text["query_text"] = query
|
|
tokenized["query_tokens"] = tokenized_query
|
|
features[0]["query_input_ids"] = query_inputs["input_ids"]
|
|
features[0]["query_segment_ids"] = query_inputs["token_type_ids"]
|
|
features[0]["query_attention_mask"] = query_inputs["attention_mask"]
|
|
except Exception as e:
|
|
features = None # type: ignore
|
|
|
|
sample = Sample(id="", clear_text=clear_text, tokenized=tokenized, features=features) # type: ignore
|
|
basket.samples = [sample]
|
|
return baskets
|
|
|
|
def _convert_contexts(self, baskets: List[SampleBasket]):
|
|
for basket in baskets:
|
|
if "passages" in basket.raw:
|
|
try:
|
|
positive_context = list(filter(lambda x: x["label"] == "positive", basket.raw["passages"]))
|
|
if self.shuffle_positives:
|
|
random.shuffle(positive_context)
|
|
positive_context = positive_context[: self.num_positives]
|
|
hard_negative_context = list(
|
|
filter(lambda x: x["label"] == "hard_negative", basket.raw["passages"])
|
|
)
|
|
if self.shuffle_negatives:
|
|
random.shuffle(hard_negative_context)
|
|
hard_negative_context = hard_negative_context[: self.num_hard_negatives]
|
|
|
|
positive_ctx_titles = [passage.get("title", None) for passage in positive_context]
|
|
positive_ctx_texts = [passage["text"] for passage in positive_context]
|
|
hard_negative_ctx_titles = [passage.get("title", None) for passage in hard_negative_context]
|
|
hard_negative_ctx_texts = [passage["text"] for passage in hard_negative_context]
|
|
|
|
# all context passages and labels: 1 for positive context and 0 for hard-negative context
|
|
ctx_label = [1] * self.num_positives + [0] * self.num_hard_negatives
|
|
# featurize context passages
|
|
if self.embed_title:
|
|
# concatenate title with positive context passages + negative context passages
|
|
all_ctx = self._combine_title_context(
|
|
positive_ctx_titles, positive_ctx_texts
|
|
) + self._combine_title_context(hard_negative_ctx_titles, hard_negative_ctx_texts)
|
|
else:
|
|
all_ctx = positive_ctx_texts + hard_negative_ctx_texts
|
|
|
|
# assign empty string tuples if hard_negative passages less than num_hard_negatives
|
|
all_ctx += [("", "")] * ((self.num_positives + self.num_hard_negatives) - len(all_ctx))
|
|
|
|
ctx_inputs = self.passage_tokenizer(
|
|
all_ctx,
|
|
add_special_tokens=True,
|
|
truncation=True,
|
|
padding="max_length",
|
|
max_length=self.max_seq_len_passage,
|
|
return_token_type_ids=True,
|
|
)
|
|
|
|
ctx_segment_ids = [[0] * len(ctx_inputs["token_type_ids"][0])] * len(ctx_inputs["token_type_ids"])
|
|
|
|
# get tokens in string format
|
|
tokenized_passage = [
|
|
self.passage_tokenizer.convert_ids_to_tokens(ctx) for ctx in ctx_inputs["input_ids"]
|
|
]
|
|
|
|
# for DPR we only have one sample containing query and corresponding (multiple) context features
|
|
sample = basket.samples[0] # type: ignore
|
|
sample.clear_text["passages"] = positive_context + hard_negative_context
|
|
sample.tokenized["passages_tokens"] = tokenized_passage # type: ignore
|
|
sample.features[0]["passage_input_ids"] = ctx_inputs["input_ids"] # type: ignore
|
|
sample.features[0]["passage_segment_ids"] = ctx_segment_ids # type: ignore
|
|
sample.features[0]["passage_attention_mask"] = ctx_inputs["attention_mask"] # type: ignore
|
|
sample.features[0]["label_ids"] = ctx_label # type: ignore
|
|
except Exception as e:
|
|
basket.samples[0].features = None # type: ignore
|
|
|
|
return baskets
|
|
|
|
def _create_dataset(self, baskets: List[SampleBasket]):
|
|
"""
|
|
Convert python features into pytorch dataset.
|
|
Also removes potential errors during preprocessing.
|
|
Flattens nested basket structure to create a flat list of features
|
|
"""
|
|
features_flat: List[dict] = []
|
|
basket_to_remove = []
|
|
problematic_ids: set = set()
|
|
for basket in baskets:
|
|
if self._check_sample_features(basket):
|
|
for sample in basket.samples: # type: ignore
|
|
features_flat.extend(sample.features) # type: ignore
|
|
else:
|
|
# remove the entire basket
|
|
basket_to_remove.append(basket)
|
|
if len(basket_to_remove) > 0:
|
|
for basket in basket_to_remove:
|
|
# if basket_to_remove is not empty remove the related baskets
|
|
problematic_ids.add(basket.id_internal)
|
|
baskets.remove(basket)
|
|
|
|
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
|
|
return dataset, tensor_names, problematic_ids, baskets
|
|
|
|
@staticmethod
|
|
def _normalize_question(question: str) -> str:
|
|
"""Removes '?' from queries/questions"""
|
|
if question[-1] == "?":
|
|
question = question[:-1]
|
|
return question
|
|
|
|
@staticmethod
|
|
def _combine_title_context(titles: List[str], texts: List[str]):
|
|
res = []
|
|
for title, ctx in zip(titles, texts):
|
|
if title is None:
|
|
title = ""
|
|
logger.warning(
|
|
f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' "
|
|
)
|
|
res.append(tuple((title, ctx)))
|
|
return res
|
|
|
|
|
|
class TableTextSimilarityProcessor(Processor):
|
|
"""
|
|
Used to handle the Multimodal Retrieval datasets consisting of text passages and tables
|
|
that come in json format.
|
|
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
query_tokenizer, # type: ignore
|
|
passage_tokenizer, # type: ignore
|
|
table_tokenizer, # type: ignore
|
|
max_seq_len_query: int,
|
|
max_seq_len_passage: int,
|
|
max_seq_len_table: int,
|
|
data_dir: str = "",
|
|
metric: Optional[str] = None,
|
|
train_filename: Optional[Union[Path, str]] = "train.json",
|
|
dev_filename: Optional[Union[Path, str]] = None,
|
|
test_filename: Optional[Union[Path, str]] = "test.json",
|
|
dev_split: float = 0.1,
|
|
proxies: Optional[Dict] = None,
|
|
max_samples: Optional[int] = None,
|
|
embed_meta_fields: List[str] = ["page_title", "section_title", "caption"],
|
|
num_positives: int = 1,
|
|
num_hard_negatives: int = 1,
|
|
shuffle_negatives: bool = True,
|
|
shuffle_positives: bool = False,
|
|
label_list: Optional[List[str]] = None,
|
|
**kwargs,
|
|
):
|
|
"""
|
|
:param query_tokenizer: Used to split a question (str) into tokens
|
|
:param passage_tokenizer: Used to split a text passage (str) into tokens.
|
|
:param table_tokenizer: Used to split a table into tokens
|
|
:param max_seq_len_query: Query samples are truncated after this many tokens.
|
|
:param max_seq_len_passage: Context/Passage Samples are truncated after this many tokens.
|
|
:param max_seq_len_table: Table samples are truncated after this many tokens.
|
|
:param data_dir: The directory in which the train and dev files can be found.
|
|
If not available the dataset will be loaded automatically
|
|
if the last directory has the same name as a predefined dataset.
|
|
These predefined datasets are defined as the keys in the dict DOWNSTREAM_TASK_MAP
|
|
:param metric: Name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
|
|
Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
|
|
For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
|
|
:param train_filename: The name of the file containing training data.
|
|
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
|
|
will be a slice of the train set.
|
|
:param test_filename: The name of the file containing the test data.
|
|
:param dev_split: The proportion of the train set that will sliced. Only works if dev_filename is set to None.
|
|
:param proxies: Proxy configuration to allow downloads of remote datasets.
|
|
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
|
|
:param max_samples: maximum number of samples to use.
|
|
:param embed_meta_fields: List of meta fields to embed in text passages and tables during tensorization.
|
|
:param num_hard_negatives: Maximum number of hard negative context passages in a sample.
|
|
:param num_positives: Maximum number of positive context passages in a sample.
|
|
:param shuffle_negatives: Whether to shuffle all the hard_negative passages before selecting the
|
|
num_hard_negative number of passages.
|
|
:param shuffle_positives: Whether to shuffle all the positive passages before selecting the
|
|
num_positive number of passages.
|
|
:param label_list: List of labels to predict. Usually ["hard_negative", "positive"].
|
|
:param kwargs: Placeholder for passing generic parameters
|
|
"""
|
|
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs
|
|
|
|
# Custom processor attributes
|
|
self.max_samples = max_samples
|
|
self.query_tokenizer = query_tokenizer
|
|
self.passage_tokenizer = passage_tokenizer
|
|
self.table_tokenizer = table_tokenizer
|
|
self.embed_meta_fields = embed_meta_fields
|
|
self.num_hard_negatives = num_hard_negatives
|
|
self.num_positives = num_positives
|
|
self.shuffle_negatives = shuffle_negatives
|
|
self.shuffle_positives = shuffle_positives
|
|
self.max_seq_len_query = max_seq_len_query
|
|
self.max_seq_len_passage = max_seq_len_passage
|
|
self.max_seq_len_table = max_seq_len_table
|
|
|
|
super(TableTextSimilarityProcessor, self).__init__(
|
|
tokenizer=self.query_tokenizer,
|
|
max_seq_len=0,
|
|
train_filename=train_filename,
|
|
dev_filename=dev_filename,
|
|
test_filename=test_filename,
|
|
dev_split=dev_split,
|
|
data_dir=data_dir,
|
|
tasks={},
|
|
proxies=proxies,
|
|
)
|
|
if metric:
|
|
self.add_task(
|
|
name="text_similarity",
|
|
metric=metric,
|
|
label_list=label_list,
|
|
label_name="label",
|
|
task_type="text_similarity",
|
|
)
|
|
else:
|
|
logger.info(
|
|
"Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
|
|
"using the default task or add a custom task later via processor.add_task()"
|
|
)
|
|
|
|
@classmethod
|
|
def load_from_dir(cls, load_dir: str):
|
|
"""
|
|
Overwriting method from parent class to **always** load the TableTextSimilarityProcessor
|
|
instead of the specific class stored in the config.
|
|
|
|
:param load_dir: Directory that contains a 'processor_config.json'
|
|
:return: An instance of an TableTextSimilarityProcessor.
|
|
"""
|
|
# read config
|
|
processor_config_file = Path(load_dir) / "processor_config.json"
|
|
        with open(processor_config_file) as config_file:
            config = json.load(config_file)
|
|
# init tokenizer
|
|
query_tokenizer = AutoTokenizer.from_pretrained(
|
|
load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query"
|
|
)
|
|
passage_tokenizer = AutoTokenizer.from_pretrained(
|
|
load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage"
|
|
)
|
|
table_tokenizer = AutoTokenizer.from_pretrained(
|
|
load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table"
|
|
)
|
|
|
|
# we have to delete the tokenizer string from config, because we pass it as Object
|
|
del config["query_tokenizer"]
|
|
del config["passage_tokenizer"]
|
|
del config["table_tokenizer"]
|
|
|
|
processor = cls.load(
|
|
query_tokenizer=query_tokenizer,
|
|
passage_tokenizer=passage_tokenizer,
|
|
table_tokenizer=table_tokenizer,
|
|
processor_name="TableTextSimilarityProcessor",
|
|
**config,
|
|
)
|
|
for task_name, task in config["tasks"].items():
|
|
processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])
|
|
|
|
if processor is None:
|
|
            raise Exception(f"Could not load processor from {load_dir}.")
|
|
|
|
return processor
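    # Illustrative sketch of restoring a saved processor (comment only; the directory name is
    # an assumption). The query/passage/table tokenizers are loaded from the "query",
    # "passage" and "table" subfolders written by save():
    #
    #     processor = TableTextSimilarityProcessor.load_from_dir("saved_models/my_mm_retriever")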
|
|
|
|
def save(self, save_dir: Union[str, Path]):
|
|
"""
|
|
Saves the vocabulary to file and also creates a json file containing all the
|
|
information needed to load the same processor.
|
|
|
|
:param save_dir: Directory where the files are to be saved.
|
|
"""
|
|
if isinstance(save_dir, str):
|
|
save_dir = Path(save_dir)
|
|
os.makedirs(save_dir, exist_ok=True)
|
|
config = self.generate_config()
|
|
# save tokenizer incl. attributes
|
|
config["query_tokenizer"] = self.query_tokenizer.__class__.__name__
|
|
config["passage_tokenizer"] = self.passage_tokenizer.__class__.__name__
|
|
config["table_tokenizer"] = self.table_tokenizer.__class__.__name__
|
|
|
|
# Because the fast tokenizers expect a str and not Path
|
|
# always convert Path to str here.
|
|
self.query_tokenizer.save_pretrained(str(save_dir / "query"))
|
|
self.passage_tokenizer.save_pretrained(str(save_dir / "passage"))
|
|
self.table_tokenizer.save_pretrained(str(save_dir / "table"))
|
|
|
|
# save processor
|
|
config["processor"] = self.__class__.__name__
|
|
output_config_file = Path(save_dir) / "processor_config.json"
|
|
with open(output_config_file, "w") as file:
|
|
json.dump(config, file)
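    # Illustrative round-trip sketch (comment only; the path is an assumption). save() writes
    # processor_config.json plus the three tokenizer subfolders, which load_from_dir() expects:
    #
    #     processor.save("saved_models/my_mm_retriever")
    #     reloaded = TableTextSimilarityProcessor.load_from_dir("saved_models/my_mm_retriever")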
|
|
|
|
def file_to_dicts(self, file: str) -> List[Dict]:
|
|
"""
|
|
Converts a Multimodal Retrieval data file in json format to a list of dictionaries.
|
|
|
|
        :param file: filename of Multimodal Retrieval data in DPR-style json format
|
|
Each sample is a dictionary of format:
|
|
{"question": str,
|
|
"answers": list of str
|
|
"positive_ctxs": list of dictionaries of format
|
|
{'title': str, 'text': str, 'passage_id': str, 'type': 'text', 'source': str}
|
|
or
|
|
{'page_title': str, 'section_title': str, 'caption': str, 'columns': list of str,
|
|
'rows': list of list of str, 'type': 'table', 'source': str}
|
|
"hard_negative_ctxs": list of dictionaries of format
|
|
{'title': str, 'text': str, 'passage_id': str, 'type': 'text', 'source': str}
|
|
or
|
|
{'page_title': str, 'section_title': str, 'caption': str, 'columns': list of str,
|
|
'rows': list of list of str, 'type': 'table', 'source': str}
|
|
}
|
|
|
|
|
|
Returns:
|
|
List of dictionaries: List[dict]
|
|
each dictionary:
|
|
{"query": str,
|
|
"passages": [
|
|
{"title": str, "text": str, "label": "positive" / "hard_negative", "type": "text", "external_id": id}
|
|
or
|
|
{"page_title": str, "section_title": str, "caption": str, "columns": list of str,
|
|
"rows": list of list of str, "label": "positive" / "hard_negative", "type": "table", "external_id": id}
|
|
...]}
|
|
"""
|
|
dicts = self._read_multimodal_dpr_json(file, max_samples=self.max_samples)
|
|
return dicts
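    # Minimal example of a single entry in the expected json file, following the format
    # documented above (all values are made up for illustration):
    #
    #     {
    #         "question": "Who wrote Hamlet?",
    #         "answers": ["William Shakespeare"],
    #         "positive_ctxs": [
    #             {"title": "Hamlet", "text": "Hamlet is a tragedy written by William Shakespeare ...",
    #              "passage_id": "123", "type": "text", "source": "wiki"}
    #         ],
    #         "hard_negative_ctxs": [
    #             {"page_title": "Shakespeare bibliography", "section_title": "Plays", "caption": "",
    #              "columns": ["Play", "Year"], "rows": [["Macbeth", "1606"]], "type": "table", "source": "wiki"}
    #         ]
    #     }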
|
|
|
|
def _read_multimodal_dpr_json(self, file: str, max_samples: Optional[int] = None) -> List[Dict]:
|
|
"""
|
|
Reads a Multimodal Retrieval data file in json format and returns a list of dictionaries.
|
|
|
|
:param file: filename of MMR data in json format
|
|
|
|
Returns:
|
|
list of dictionaries: List[dict]
|
|
each dictionary: {
|
|
"query": str -> query_text
|
|
"passages": List[dictionaries] -> [
|
|
{"text": str, "title": str, "label": "positive" / "hard_negative, "external_id": id},
|
|
or
|
|
{"page_title": str, "section_title": str, "caption": str, "columns": list of str,
|
|
"rows": list of lists of str, "label": "positive" / "hard_negative", "type": "table", "external_id": id}
|
|
...]
|
|
}
|
|
"""
|
|
        with open(file) as data_file:
            dicts = json.load(data_file)
|
|
if max_samples:
|
|
dicts = random.sample(dicts, min(max_samples, len(dicts)))
|
|
# convert DPR dictionary to standard dictionary
|
|
query_json_keys = ["question", "questions", "query"]
|
|
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
|
|
hard_negative_json_keys = [
|
|
"hard_negative_contexts",
|
|
"hard_negative_ctxs",
|
|
"hard_negative_context",
|
|
"hard_negative_ctx",
|
|
]
|
|
standard_dicts = []
|
|
for dict in dicts:
|
|
sample = {}
|
|
docs = []
|
|
for key, val in dict.items():
|
|
if key in query_json_keys:
|
|
sample["query"] = val
|
|
elif key in positive_context_json_keys + hard_negative_json_keys:
|
|
for doc in val:
|
|
if doc["type"] == "table":
|
|
docs.append(
|
|
{
|
|
"meta": [
|
|
doc[meta_field] for meta_field in self.embed_meta_fields if meta_field in doc
|
|
],
|
|
"columns": doc.get("columns"),
|
|
"rows": doc.get("rows"),
|
|
"label": "positive" if key in positive_context_json_keys else "hard_negative",
|
|
"type": "table",
|
|
}
|
|
)
|
|
elif doc["type"] == "text":
|
|
docs.append(
|
|
{
|
|
"meta": [
|
|
doc[meta_field] for meta_field in self.embed_meta_fields if meta_field in doc
|
|
],
|
|
"text": doc["text"],
|
|
"label": "positive" if key in positive_context_json_keys else "hard_negative",
|
|
"type": "text",
|
|
}
|
|
)
|
|
|
|
sample["passages"] = docs
|
|
standard_dicts.append(sample)
|
|
return standard_dicts
|
|
|
|
def dataset_from_dicts(
|
|
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
|
|
):
|
|
"""
|
|
Convert input dictionaries into a pytorch dataset for TextSimilarity.
|
|
For conversion we have an internal representation called "baskets".
|
|
Each basket is one query and related text passages (positive passages fitting to the query and negative
|
|
passages that do not fit the query)
|
|
Each stage adds or transforms specific information to our baskets.
|
|
|
|
:param dicts: List of dicts, input dictionary with DPR-style content
|
|
{"query": str,
|
|
"passages": List[
|
|
{'title': str,
|
|
'text': str,
|
|
'label': 'hard_negative',
|
|
'external_id': str},
|
|
....
|
|
]
|
|
}
|
|
        :param indices: list, indices used during multiprocessing so that the IDs assigned to our baskets are unique
|
|
:param return_baskets: boolean, whether to return the baskets or not (baskets are needed during inference)
|
|
"""
|
|
|
|
        # Take the dicts and insert them into our basket structure; this stage also adds internal IDs.
|
|
baskets = self._fill_baskets(dicts, indices)
|
|
|
|
# Separate conversion of query
|
|
baskets = self._convert_queries(baskets=baskets)
|
|
|
|
        # Separate conversion of context passages and tables. When converting the contexts, the labels are also assigned.
|
|
baskets = self._convert_contexts(baskets=baskets)
|
|
|
|
        # Convert features into a pytorch dataset; this step also removes and logs potential errors during preprocessing.
|
|
dataset, tensor_names, problematic_ids, baskets = self._create_dataset(baskets)
|
|
|
|
if problematic_ids:
|
|
logger.error(
|
|
f"There were {len(problematic_ids)} errors during preprocessing at positions: {problematic_ids}"
|
|
)
|
|
|
|
if return_baskets:
|
|
return dataset, tensor_names, problematic_ids, baskets
|
|
else:
|
|
return dataset, tensor_names, problematic_ids
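    # Illustrative call sketch (comment only). The input below uses made-up values and follows
    # the structure produced by file_to_dicts(), i.e. each passage carries a "meta" list, a
    # "label" and a "type":
    #
    #     dicts = [{
    #         "query": "who sings does he love me with reba",
    #         "passages": [
    #             {"meta": ["Does He Love You"], "text": "Does He Love You is a song ...",
    #              "label": "positive", "type": "text"},
    #             {"meta": ["Greatest Hits"], "text": "Greatest Hits Volume Two is an album ...",
    #              "label": "hard_negative", "type": "text"},
    #         ],
    #     }]
    #     dataset, tensor_names, problematic_ids = processor.dataset_from_dicts(dicts)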
|
|
|
|
def _fill_baskets(self, dicts: List[Dict], indices: Optional[Iterable[int]]):
|
|
baskets = []
|
|
if not indices:
|
|
indices = range(len(dicts))
|
|
for d, id_internal in zip(dicts, indices):
|
|
basket = SampleBasket(id_external=None, id_internal=id_internal, raw=d)
|
|
baskets.append(basket)
|
|
return baskets
|
|
|
|
def _convert_queries(self, baskets: List[SampleBasket]):
|
|
for basket in baskets:
|
|
clear_text = {}
|
|
tokenized = {}
|
|
features: List[Dict] = [{}]
|
|
# extract query, positive context passages and titles, hard-negative passages and titles
|
|
if "query" in basket.raw:
|
|
try:
|
|
query = self._normalize_question(basket.raw["query"])
|
|
|
|
# featurize the query
|
|
query_inputs = self.query_tokenizer(
|
|
query,
|
|
max_length=self.max_seq_len_query,
|
|
add_special_tokens=True,
|
|
truncation=True,
|
|
truncation_strategy="longest_first",
|
|
padding="max_length",
|
|
return_token_type_ids=True,
|
|
)
|
|
|
|
# tokenize query
|
|
tokenized_query = self.query_tokenizer.convert_ids_to_tokens(query_inputs["input_ids"])
|
|
|
|
if len(tokenized_query) == 0:
|
|
logger.warning(
|
|
f"The query could not be tokenized, likely because it contains a character that the query tokenizer does not recognize"
|
|
)
|
|
return None
|
|
|
|
clear_text["query_text"] = query
|
|
tokenized["query_tokens"] = tokenized_query
|
|
features[0]["query_input_ids"] = query_inputs["input_ids"]
|
|
features[0]["query_segment_ids"] = query_inputs["token_type_ids"]
|
|
features[0]["query_attention_mask"] = query_inputs["attention_mask"]
|
|
except Exception as e:
|
|
features = None # type: ignore
|
|
|
|
sample = Sample(id="", clear_text=clear_text, tokenized=tokenized, features=features) # type: ignore
|
|
basket.samples = [sample]
|
|
return baskets
|
|
|
|
def _convert_contexts(self, baskets: List[SampleBasket]):
|
|
# Converts both text passages and tables.
|
|
|
|
for basket in baskets:
|
|
if "passages" in basket.raw:
|
|
try:
|
|
positive_context = list(filter(lambda x: x["label"] == "positive", basket.raw["passages"]))
|
|
if self.shuffle_positives:
|
|
random.shuffle(positive_context)
|
|
positive_context = positive_context[: self.num_positives]
|
|
hard_negative_context = list(
|
|
filter(lambda x: x["label"] == "hard_negative", basket.raw["passages"])
|
|
)
|
|
if self.shuffle_negatives:
|
|
random.shuffle(hard_negative_context)
|
|
hard_negative_context = hard_negative_context[: self.num_hard_negatives]
|
|
|
|
positive_ctx_meta = []
|
|
positive_ctx_texts = []
|
|
hard_negative_ctx_meta = []
|
|
hard_negative_ctx_texts = []
|
|
is_table = []
|
|
|
|
for pos_ctx in positive_context:
|
|
if pos_ctx["type"] == "text":
|
|
positive_ctx_meta.append(" ".join(pos_ctx.get("meta")))
|
|
positive_ctx_texts.append(pos_ctx["text"])
|
|
is_table.append(0)
|
|
elif pos_ctx["type"] == "table":
|
|
positive_ctx_meta.append(" ".join(pos_ctx.get("meta")))
|
|
linearized_rows = [cell for row in pos_ctx["rows"] for cell in row]
|
|
linearized_table = " ".join(pos_ctx["columns"]) + " " + " ".join(linearized_rows)
|
|
positive_ctx_texts.append(linearized_table)
|
|
is_table.append(1)
|
|
|
|
for hn_ctx in hard_negative_context:
|
|
if hn_ctx["type"] == "text":
|
|
hard_negative_ctx_meta.append(" ".join(hn_ctx.get("meta")))
|
|
hard_negative_ctx_texts.append(hn_ctx["text"])
|
|
is_table.append(0)
|
|
elif hn_ctx["type"] == "table":
|
|
hard_negative_ctx_meta.append(" ".join(hn_ctx.get("meta")))
|
|
linearized_rows = [cell for row in hn_ctx["rows"] for cell in row]
|
|
linearized_table = " ".join(hn_ctx["columns"]) + " " + " ".join(linearized_rows)
|
|
hard_negative_ctx_texts.append(linearized_table)
|
|
is_table.append(1)
|
|
|
|
# all context passages and labels: 1 for positive context and 0 for hard-negative context
|
|
ctx_label = [1] * self.num_positives + [0] * self.num_hard_negatives
|
|
# featurize context passages
|
|
if self.embed_meta_fields:
|
|
# concatenate title with positive context passages + negative context passages
|
|
all_ctx = self._combine_meta_context(
|
|
positive_ctx_meta, positive_ctx_texts
|
|
) + self._combine_meta_context(hard_negative_ctx_meta, hard_negative_ctx_texts)
|
|
else:
|
|
all_ctx = positive_ctx_texts + hard_negative_ctx_texts
|
|
|
|
                    # pad with empty string tuples if there are fewer passages than num_positives + num_hard_negatives
|
|
all_ctx += [("", "")] * ((self.num_positives + self.num_hard_negatives) - len(all_ctx))
|
|
|
|
inputs = self.passage_tokenizer(
|
|
all_ctx,
|
|
add_special_tokens=True,
|
|
truncation=True,
|
|
padding="max_length",
|
|
max_length=self.max_seq_len_passage,
|
|
return_token_type_ids=True,
|
|
)
|
|
|
|
input_ids = inputs["input_ids"]
|
|
passage_segment_ids = inputs["token_type_ids"]
|
|
attention_mask = inputs["attention_mask"]
|
|
|
|
# get tokens in string format
|
|
tokenized = [self.passage_tokenizer.convert_ids_to_tokens(ctx) for ctx in input_ids]
|
|
|
|
# for DPR we only have one sample containing query and corresponding (multiple) context features
|
|
sample = basket.samples[0] # type: ignore
|
|
sample.clear_text["passages"] = positive_context + hard_negative_context # type: ignore
|
|
sample.tokenized["passages_tokens"] = tokenized # type: ignore
|
|
sample.features[0]["passage_input_ids"] = input_ids # type: ignore
|
|
sample.features[0]["passage_segment_ids"] = passage_segment_ids # type: ignore
|
|
sample.features[0]["table_segment_ids"] = passage_segment_ids # type: ignore
|
|
sample.features[0]["passage_attention_mask"] = attention_mask # type: ignore
|
|
sample.features[0]["label_ids"] = ctx_label # type: ignore
|
|
sample.features[0]["is_table"] = is_table # type: ignore
|
|
except Exception as e:
|
|
basket.samples[0].features = None # type: ignore
|
|
|
|
return baskets
|
|
|
|
def _create_dataset(self, baskets: List[SampleBasket]):
|
|
"""
|
|
Convert python features into pytorch dataset.
|
|
Also removes potential errors during preprocessing.
|
|
Flattens nested basket structure to create a flat list of features
|
|
"""
|
|
features_flat: List = []
|
|
basket_to_remove = []
|
|
problematic_ids = set()
|
|
for basket in baskets:
|
|
if self._check_sample_features(basket):
|
|
for sample in basket.samples: # type: ignore
|
|
features_flat.extend(sample.features) # type: ignore
|
|
else:
|
|
# remove the entire basket
|
|
basket_to_remove.append(basket)
|
|
if len(basket_to_remove) > 0:
|
|
for basket in basket_to_remove:
|
|
# if basket_to_remove is not empty remove the related baskets
|
|
problematic_ids.add(basket.id_internal)
|
|
baskets.remove(basket)
|
|
|
|
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
|
|
return dataset, tensor_names, problematic_ids, baskets
|
|
|
|
@staticmethod
|
|
def _normalize_question(question: str) -> str:
|
|
"""Removes '?' from queries/questions"""
|
|
        if question and question[-1] == "?":
|
|
question = question[:-1]
|
|
return question
|
|
|
|
@staticmethod
|
|
def _combine_meta_context(meta_fields: List[str], texts: List[str]):
|
|
res = []
|
|
for meta, ctx in zip(meta_fields, texts):
|
|
if meta is None:
|
|
meta = ""
|
|
res.append(tuple((meta, ctx)))
|
|
return res
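    # For example, _combine_meta_context(["Hamlet"], ["Hamlet is a tragedy ..."]) returns
    # [("Hamlet", "Hamlet is a tragedy ...")]; these (meta, text) tuples are later passed to the
    # passage tokenizer as text pairs.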
|
|
|
|
|
|
class TextClassificationProcessor(Processor):
|
|
"""
|
|
Used to handle the text classification datasets that come in tabular format (CSV, TSV, etc.)
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
tokenizer,
|
|
max_seq_len,
|
|
data_dir,
|
|
label_list=None,
|
|
metric=None,
|
|
train_filename="train.tsv",
|
|
dev_filename=None,
|
|
test_filename="test.tsv",
|
|
dev_split=0.1,
|
|
dev_stratification=False,
|
|
delimiter="\t",
|
|
quote_char="'",
|
|
skiprows=None,
|
|
label_column_name="label",
|
|
multilabel=False,
|
|
header=0,
|
|
proxies=None,
|
|
max_samples=None,
|
|
text_column_name="text",
|
|
**kwargs,
|
|
):
|
|
"""
|
|
:param tokenizer: Used to split a sentence (str) into tokens.
|
|
:param max_seq_len: Samples are truncated after this many tokens.
|
|
:type max_seq_len: int
|
|
:param data_dir: The directory in which the train and dev files can be found.
|
|
                         If not available, the dataset will be loaded automatically
|
|
if the last directory has the same name as a predefined dataset.
|
|
These predefined datasets are defined as the keys in the dict at
|
|
`farm.data_handler.utils.DOWNSTREAM_TASK_MAP <https://github.com/deepset-ai/FARM/blob/main/farm/data_handler/utils.py>`_.
|
|
:type data_dir: str
|
|
        :param label_list: List of the class labels to predict (strings), e.g. ["negative", "positive"].
|
|
:type label_list: list
|
|
:param metric: name of metric that shall be used for evaluation, e.g. "acc" or "f1_macro".
|
|
Alternatively you can also supply a custom function, that takes preds and labels as args and returns a numerical value.
|
|
For using multiple metrics supply them as a list, e.g ["acc", my_custom_metric_fn].
|
|
:type metric: str, function, or list
|
|
:param train_filename: The name of the file containing training data.
|
|
:type train_filename: str
|
|
:param dev_filename: The name of the file containing the dev data. If None and 0.0 < dev_split < 1.0 the dev set
|
|
will be a slice of the train set.
|
|
:type dev_filename: str or None
|
|
        :param test_filename: The name of the file containing the test data.
|
|
:type test_filename: str
|
|
        :param dev_split: The proportion of the train set that will be sliced off for the dev set. Only works if dev_filename is set to None
|
|
:type dev_split: float
|
|
:param dev_stratification: if True, create a class-stratified split for the dev set.
|
|
:type dev_stratification: bool
|
|
:param delimiter: Separator used in the input tsv / csv file
|
|
:type delimiter: str
|
|
:param quote_char: Character used for quoting strings in the input tsv/ csv file
|
|
:type quote_char: str
|
|
:param skiprows: number of rows to skip in the tsvs (e.g. for multirow headers)
|
|
:type skiprows: int
|
|
:param label_column_name: name of the column in the input csv/tsv that shall be used as training labels
|
|
:type label_column_name: str
|
|
:param multilabel: set to True for multilabel classification
|
|
:type multilabel: bool
|
|
:param header: which line to use as a header in the input csv/tsv
|
|
:type header: int
|
|
:param proxies: proxy configuration to allow downloads of remote datasets.
|
|
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
|
|
:type proxies: dict
|
|
:param text_column_name: name of the column in the input csv/tsv that shall be used as training text
|
|
:type text_column_name: str
|
|
:param kwargs: placeholder for passing generic parameters
|
|
:type kwargs: object
|
|
"""
|
|
# TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs
|
|
|
|
# Custom processor attributes
|
|
self.delimiter = delimiter
|
|
self.quote_char = quote_char
|
|
self.skiprows = skiprows
|
|
self.header = header
|
|
self.max_samples = max_samples
|
|
self.dev_stratification = dev_stratification
|
|
logger.debug("Currently no support in Processor for returning problematic ids")
|
|
|
|
super(TextClassificationProcessor, self).__init__(
|
|
tokenizer=tokenizer,
|
|
max_seq_len=max_seq_len,
|
|
train_filename=train_filename,
|
|
dev_filename=dev_filename,
|
|
test_filename=test_filename,
|
|
dev_split=dev_split,
|
|
data_dir=data_dir,
|
|
tasks={},
|
|
proxies=proxies,
|
|
)
|
|
if metric and label_list:
|
|
if multilabel:
|
|
task_type = "multilabel_classification"
|
|
else:
|
|
task_type = "classification"
|
|
self.add_task(
|
|
name="text_classification",
|
|
metric=metric,
|
|
label_list=label_list,
|
|
label_column_name=label_column_name,
|
|
text_column_name=text_column_name,
|
|
task_type=task_type,
|
|
)
|
|
else:
|
|
logger.info(
|
|
"Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for "
|
|
"using the default task or add a custom task later via processor.add_task()"
|
|
)
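    # Illustrative usage sketch (comment only; the model name, labels, column names and file
    # names are assumptions for demonstration):
    #
    #     from transformers import AutoTokenizer
    #
    #     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     processor = TextClassificationProcessor(
    #         tokenizer=tokenizer,
    #         max_seq_len=128,
    #         data_dir="data/sentiment",
    #         label_list=["negative", "positive"],
    #         label_column_name="sentiment",
    #         text_column_name="review",
    #         metric="acc",
    #         train_filename="train.tsv",
    #         test_filename="test.tsv",
    #     )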
|
|
|
|
def file_to_dicts(self, file: str) -> List[Dict]:
|
|
raise NotImplementedError
|
|
|
|
def dataset_from_dicts(
|
|
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
|
|
):
|
|
self.baskets = []
|
|
# Tokenize in batches
|
|
texts = [x["text"] for x in dicts]
|
|
tokenized_batch = self.tokenizer(
|
|
texts,
|
|
return_offsets_mapping=True,
|
|
return_special_tokens_mask=True,
|
|
return_token_type_ids=True,
|
|
return_attention_mask=True,
|
|
truncation=True,
|
|
max_length=self.max_seq_len,
|
|
padding="max_length",
|
|
)
|
|
input_ids_batch = tokenized_batch["input_ids"]
|
|
segment_ids_batch = tokenized_batch["token_type_ids"]
|
|
padding_masks_batch = tokenized_batch["attention_mask"]
|
|
tokens_batch = [x.tokens for x in tokenized_batch.encodings]
|
|
|
|
# From here we operate on a per sample basis
|
|
for dictionary, input_ids, segment_ids, padding_mask, tokens in zip(
|
|
dicts, input_ids_batch, segment_ids_batch, padding_masks_batch, tokens_batch
|
|
):
|
|
|
|
tokenized = {}
|
|
if debug:
|
|
tokenized["tokens"] = tokens
|
|
|
|
feat_dict = {"input_ids": input_ids, "padding_mask": padding_mask, "segment_ids": segment_ids}
|
|
|
|
# Create labels
|
|
# i.e. not inference
|
|
if not return_baskets:
|
|
label_dict = self.convert_labels(dictionary)
|
|
feat_dict.update(label_dict)
|
|
|
|
# Add Basket to self.baskets
|
|
curr_sample = Sample(id="", clear_text=dictionary, tokenized=tokenized, features=[feat_dict])
|
|
curr_basket = SampleBasket(id_internal=None, raw=dictionary, id_external=None, samples=[curr_sample])
|
|
self.baskets.append(curr_basket)
|
|
|
|
if indices and 0 not in indices:
|
|
pass
|
|
else:
|
|
self._log_samples(n_samples=1, baskets=self.baskets)
|
|
|
|
# TODO populate problematic ids
|
|
problematic_ids: set = set()
|
|
dataset, tensornames = self._create_dataset()
|
|
if return_baskets:
|
|
return dataset, tensornames, problematic_ids, self.baskets
|
|
else:
|
|
return dataset, tensornames, problematic_ids
|
|
|
|
def convert_labels(self, dictionary: Dict):
|
|
ret: Dict = {}
|
|
# Add labels for different tasks
|
|
for task_name, task in self.tasks.items():
|
|
label_name = task["label_name"]
|
|
label_raw = dictionary[label_name]
|
|
label_list = task["label_list"]
|
|
if task["task_type"] == "classification":
|
|
# id of label
|
|
label_ids = [label_list.index(label_raw)]
|
|
elif task["task_type"] == "multilabel_classification":
|
|
# multi-hot-format
|
|
label_ids = [0] * len(label_list)
|
|
for l in label_raw.split(","):
|
|
if l != "":
|
|
label_ids[label_list.index(l)] = 1
|
|
ret[task["label_tensor_name"]] = label_ids
|
|
return ret
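    # Worked example of the conversion above (illustrative): with a multilabel task whose
    # label_list is ["politics", "sports", "tech"], the raw label string "politics,tech" becomes
    # the multi-hot vector [1, 0, 1]; with a single-label "classification" task, the raw label
    # "sports" becomes the index list [1].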
|
|
|
|
def _create_dataset(self):
|
|
        # TODO this is the proposed new version to replace the parent class method
|
|
features_flat = []
|
|
basket_to_remove = []
|
|
for basket in self.baskets:
|
|
if self._check_sample_features(basket):
|
|
for sample in basket.samples:
|
|
features_flat.extend(sample.features)
|
|
else:
|
|
# remove the entire basket
|
|
basket_to_remove.append(basket)
|
|
dataset, tensor_names = convert_features_to_dataset(features=features_flat)
|
|
return dataset, tensor_names
|
|
|
|
|
|
class InferenceProcessor(TextClassificationProcessor):
|
|
"""
|
|
Generic processor used at inference time:
|
|
- fast
|
|
- no labels
|
|
- pure encoding of text into pytorch dataset
|
|
- Doesn't read from file, but only consumes dictionaries (e.g. coming from API requests)
|
|
"""
|
|
|
|
def __init__(self, tokenizer, max_seq_len, **kwargs):
|
|
|
|
super(InferenceProcessor, self).__init__(
|
|
tokenizer=tokenizer,
|
|
max_seq_len=max_seq_len,
|
|
train_filename=None,
|
|
dev_filename=None,
|
|
test_filename=None,
|
|
dev_split=None,
|
|
data_dir=None,
|
|
tasks={},
|
|
)
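    # Illustrative usage sketch (comment only; the model name and input text are assumptions):
    #
    #     from transformers import AutoTokenizer
    #
    #     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)
    #     dataset, tensor_names, _ = processor.dataset_from_dicts([{"text": "A text to encode"}])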
|
|
|
|
@classmethod
|
|
def load_from_dir(cls, load_dir: str):
|
|
"""
|
|
        Overrides the method from the parent class to **always** load the InferenceProcessor instead of the specific class stored in the config.
|
|
|
|
:param load_dir: str, directory that contains a 'processor_config.json'
|
|
:return: An instance of an InferenceProcessor
|
|
"""
|
|
# read config
|
|
processor_config_file = Path(load_dir) / "processor_config.json"
|
|
        with open(processor_config_file) as config_file:
            config = json.load(config_file)
|
|
# init tokenizer
|
|
tokenizer = AutoTokenizer.from_pretrained(load_dir, tokenizer_class=config["tokenizer"])
|
|
# we have to delete the tokenizer string from config, because we pass it as Object
|
|
del config["tokenizer"]
|
|
|
|
processor = cls.load(tokenizer=tokenizer, processor_name="InferenceProcessor", **config)
|
|
for task_name, task in config["tasks"].items():
|
|
processor.add_task(name=task_name, metric=task["metric"], label_list=task["label_list"])
|
|
|
|
if processor is None:
|
|
            raise Exception(f"Could not load processor from {load_dir}.")
|
|
|
|
return processor
|
|
|
|
def file_to_dicts(self, file: str) -> List[Dict]:
|
|
raise NotImplementedError
|
|
|
|
def convert_labels(self, dictionary: Dict):
|
|
# For inference we do not need labels
|
|
ret: Dict = {}
|
|
return ret
|
|
|
|
# Private method to keep s3e pooling and embedding extraction working
|
|
def _dict_to_samples(self, dictionary: Dict, **kwargs) -> Sample:
|
|
# this tokenization also stores offsets
|
|
tokenized = tokenize_with_metadata(dictionary["text"], self.tokenizer)
|
|
# truncate tokens, offsets and start_of_word to max_seq_len that can be handled by the model
|
|
truncated_tokens = {}
|
|
for seq_name, tokens in tokenized.items():
|
|
truncated_tokens[seq_name], _, _ = truncate_sequences(
|
|
seq_a=tokens, seq_b=None, tokenizer=self.tokenizer, max_seq_len=self.max_seq_len
|
|
)
|
|
return Sample(id="", clear_text=dictionary, tokenized=truncated_tokens)
|
|
|
|
# Private method to keep s3e pooling and embedding extraction working
|
|
def _sample_to_features(self, sample: Sample) -> Dict:
|
|
features = sample_to_features_text(
|
|
sample=sample, tasks=self.tasks, max_seq_len=self.max_seq_len, tokenizer=self.tokenizer
|
|
)
|
|
return features
|
|
|
|
|
|
class UnlabeledTextProcessor(Processor):
|
|
"""
|
|
Processor to be used for distilling a teacher model into a student model from scratch. Can only be used with distil_intermediate_layers_from.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
tokenizer,
|
|
max_seq_len: int,
|
|
train_filename: Optional[Union[Path, str]] = None,
|
|
dev_filename: Optional[Union[Path, str]] = None,
|
|
test_filename: Optional[Union[Path, str]] = None,
|
|
dev_split: float = 0,
|
|
data_dir: Optional[Union[Path, str]] = None,
|
|
tasks: Dict = {},
|
|
proxies: Optional[Dict] = None,
|
|
multithreading_rust: Optional[bool] = True,
|
|
):
|
|
super().__init__(
|
|
tokenizer,
|
|
max_seq_len,
|
|
train_filename,
|
|
dev_filename,
|
|
test_filename,
|
|
dev_split,
|
|
data_dir,
|
|
tasks,
|
|
proxies,
|
|
multithreading_rust,
|
|
)
|
|
self.add_task("question_answering", "squad", ["start_token", "end_token"])
|
|
|
|
def file_to_dicts(self, file: str) -> List[dict]:
|
|
dicts = []
|
|
with open(file, "r") as f:
|
|
for line in f:
|
|
dicts.append({"text": line})
|
|
return dicts
|
|
|
|
def dataset_from_dicts(
|
|
self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
|
|
):
|
|
if return_baskets:
|
|
raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor")
|
|
texts = [dict_["text"] for dict_ in dicts]
|
|
tokens = self.tokenizer(
|
|
texts,
|
|
add_special_tokens=True,
|
|
return_tensors="pt",
|
|
padding="max_length",
|
|
truncation=True,
|
|
max_length=self.max_seq_len,
|
|
)
|
|
names = [key for key in tokens]
|
|
inputs = [tokens[key] for key in tokens]
|
|
if not "padding_mask" in names:
|
|
index = names.index("attention_mask")
|
|
names[index] = "padding_mask"
|
|
if not "segment_ids" in names:
|
|
index = names.index("token_type_ids")
|
|
names[index] = "segment_ids"
|
|
|
|
dataset = TensorDataset(*inputs)
|
|
return dataset, names, []
|
|
|
|
def _create_dataset(self, baskets: List[SampleBasket]):
|
|
raise NotImplementedError("_create_dataset is not supported by UnlabeledTextProcessor")
|
|
|
|
|
|
# helper fcts
|
|
def write_squad_predictions(predictions, out_filename, predictions_filename=None):
|
|
predictions_json = {}
|
|
for x in predictions:
|
|
for p in x["predictions"]:
|
|
if p["answers"][0]["answer"] is not None:
|
|
predictions_json[p["question_id"]] = p["answers"][0]["answer"]
|
|
else:
|
|
predictions_json[
|
|
p["question_id"]
|
|
] = "" # convert No answer = None to format understood by the SQuAD eval script
|
|
|
|
if predictions_filename:
|
|
dev_labels = {}
|
|
        with open(predictions_filename, "r") as labels_file:
            temp = json.load(labels_file)
|
|
for d in temp["data"]:
|
|
for p in d["paragraphs"]:
|
|
for q in p["qas"]:
|
|
if q.get("is_impossible", False):
|
|
dev_labels[q["id"]] = "is_impossible"
|
|
else:
|
|
dev_labels[q["id"]] = q["answers"][0]["text"]
|
|
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
|
|
if len(not_included) > 0:
|
|
logger.info("There were missing predicitons for question ids: %s", list(not_included))
|
|
for x in not_included:
|
|
predictions_json[x] = ""
|
|
|
|
# os.makedirs("model_output", exist_ok=True)
|
|
# filepath = Path("model_output") / out_filename
|
|
    with open(out_filename, "w") as predictions_file:
        json.dump(predictions_json, predictions_file)
|
|
logger.info("Written Squad predictions to: %s", out_filename)
|
|
|
|
|
|
def _read_dpr_json(
|
|
file: str,
|
|
max_samples: Optional[int] = None,
|
|
proxies: Optional[Any] = None,
|
|
num_hard_negatives: int = 1,
|
|
num_positives: int = 1,
|
|
shuffle_negatives: bool = True,
|
|
shuffle_positives: bool = False,
|
|
):
|
|
"""
|
|
Reads a Dense Passage Retrieval (DPR) data file in json format and returns a list of dictionaries.
|
|
|
|
:param file: filename of DPR data in json format
|
|
|
|
Returns:
|
|
list of dictionaries: List[dict]
|
|
each dictionary: {
|
|
"query": str -> query_text
|
|
"passages": List[dictionaries] -> [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
|
|
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
|
|
...]
|
|
}
|
|
example:
|
|
["query": 'who sings does he love me with reba'
|
|
"passages" : [{'title': 'Does He Love You',
|
|
'text': 'Does He Love You "Does He Love You" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba\'s album "Greatest Hits Volume Two". It is one of country music\'s several songs about a love triangle. "Does He Love You" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members',
|
|
'label': 'positive',
|
|
'external_id': '11828866'},
|
|
{'title': 'When the Nightingale Sings',
|
|
'text': "When the Nightingale Sings When The Nightingale Sings is a Middle English poem, author unknown, recorded in the British Library's Harley 2253 manuscript, verse 25. It is a love poem, extolling the beauty and lost love of an unknown maiden. When þe nyhtegale singes þe wodes waxen grene.<br> Lef ant gras ant blosme springes in aueryl y wene,<br> Ant love is to myn herte gon wiþ one spere so kene<br> Nyht ant day my blod hit drynkes myn herte deþ me tene. Ich have loved al þis er þat y may love namore,<br> Ich have siked moni syk lemmon for",
|
|
'label': 'hard_negative',
|
|
'external_id': '10891637'}]
|
|
]
|
|
|
|
"""
|
|
# get remote dataset if needed
|
|
if not os.path.exists(file):
|
|
logger.info("Couldn't find %s locally. Trying to download ...", file)
|
|
_download_extract_downstream_data(file, proxies=proxies)
|
|
|
|
if Path(file).suffix.lower() == ".jsonl":
|
|
dicts = []
|
|
with open(file, encoding="utf-8") as f:
|
|
for line in f:
|
|
dicts.append(json.loads(line))
|
|
else:
|
|
        with open(file, encoding="utf-8") as data_file:
            dicts = json.load(data_file)
|
|
|
|
if max_samples:
|
|
dicts = random.sample(dicts, min(max_samples, len(dicts)))
|
|
|
|
# convert DPR dictionary to standard dictionary
|
|
query_json_keys = ["question", "questions", "query"]
|
|
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
|
|
hard_negative_json_keys = [
|
|
"hard_negative_contexts",
|
|
"hard_negative_ctxs",
|
|
"hard_negative_context",
|
|
"hard_negative_ctx",
|
|
]
|
|
standard_dicts = []
|
|
for dict in dicts:
|
|
sample = {}
|
|
passages = []
|
|
for key, val in dict.items():
|
|
if key in query_json_keys:
|
|
sample["query"] = val
|
|
elif key in positive_context_json_keys:
|
|
if shuffle_positives:
|
|
random.shuffle(val)
|
|
for passage in val[:num_positives]:
|
|
passages.append(
|
|
{
|
|
"title": passage["title"],
|
|
"text": passage["text"],
|
|
"label": "positive",
|
|
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8]),
|
|
}
|
|
)
|
|
elif key in hard_negative_json_keys:
|
|
if shuffle_negatives:
|
|
random.shuffle(val)
|
|
for passage in val[:num_hard_negatives]:
|
|
passages.append(
|
|
{
|
|
"title": passage["title"],
|
|
"text": passage["text"],
|
|
"label": "hard_negative",
|
|
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8]),
|
|
}
|
|
)
|
|
sample["passages"] = passages
|
|
standard_dicts.append(sample)
|
|
return standard_dicts
|
|
|
|
|
|
def _read_squad_file(filename: str, proxies=None):
|
|
"""Read a SQuAD json file"""
|
|
if not os.path.exists(filename):
|
|
logger.info("Couldn't find %s locally. Trying to download ...", filename)
|
|
_download_extract_downstream_data(filename, proxies)
|
|
with open(filename, "r", encoding="utf-8") as reader:
|
|
input_data = json.load(reader)["data"]
|
|
return input_data
|
|
|
|
|
|
def http_get(url, temp_file, proxies=None):
|
|
req = requests.get(url, stream=True, proxies=proxies)
|
|
content_length = req.headers.get("Content-Length")
|
|
total = int(content_length) if content_length is not None else None
|
|
progress = tqdm(unit="B", total=total)
|
|
for chunk in req.iter_content(chunk_size=1024):
|
|
if chunk: # filter out keep-alive new chunks
|
|
progress.update(len(chunk))
|
|
temp_file.write(chunk)
|
|
progress.close()
|
|
|
|
|
|
def _download_extract_downstream_data(input_file: str, proxies=None):
|
|
# download archive to temp dir and extract to correct position
|
|
full_path = Path(os.path.realpath(input_file))
|
|
directory = full_path.parent
|
|
taskname = directory.stem
|
|
datadir = directory.parent
|
|
logger.info("downloading and extracting file {} to dir {}".format(taskname, datadir))
|
|
if taskname not in DOWNSTREAM_TASK_MAP:
|
|
logger.error("Cannot download {}. Unknown data source.".format(taskname))
|
|
else:
|
|
if os.name == "nt": # make use of NamedTemporaryFile compatible with Windows
|
|
delete_tmp_file = False
|
|
else:
|
|
delete_tmp_file = True
|
|
with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
|
|
http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
|
|
temp_file.flush()
|
|
temp_file.seek(0) # making tempfile accessible
|
|
tfile = tarfile.open(temp_file.name)
|
|
tfile.extractall(datadir)
|
|
# temp_file gets deleted here
|
|
|
|
|
|
def _is_json(x):
|
|
if issubclass(type(x), Path):
|
|
return True
|
|
try:
|
|
json.dumps(x)
|
|
return True
|
|
    except Exception:
|
|
return False
|