mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-27 15:08:43 +00:00
Fix tests and adjust folder structure
* Add type annotations in QuestionAnsweringHead * Fix test by increasing max_seq_len * Add SampleBasket type annotation * Remove prediction head param from adaptive model init * Add type ignore for AdaptiveModel init * Fix and rename tests * Adjust folder structure Co-authored-by: Julian Risch <julian.risch@deepset.ai>
This commit is contained in:
parent
e8a6427b9e
commit
537204e8c9
@ -1,247 +0,0 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import tarfile
|
||||
import tempfile
|
||||
import uuid
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DOWNSTREAM_TASK_MAP = {
|
||||
"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
|
||||
"covidqa": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/covidqa.tar.gz",
|
||||
}
|
||||
|
||||
def read_dpr_json(file, max_samples=None, proxies=None, num_hard_negatives=1, num_positives=1, shuffle_negatives=True, shuffle_positives=False):
|
||||
"""
|
||||
Reads a Dense Passage Retrieval (DPR) data file in json format and returns a list of dictionaries.
|
||||
|
||||
:param file: filename of DPR data in json format
|
||||
|
||||
Returns:
|
||||
list of dictionaries: List[dict]
|
||||
each dictionary: {
|
||||
"query": str -> query_text
|
||||
"passages": List[dictionaries] -> [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
|
||||
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
|
||||
...]
|
||||
}
|
||||
example:
|
||||
["query": 'who sings does he love me with reba'
|
||||
"passages" : [{'title': 'Does He Love You',
|
||||
'text': 'Does He Love You "Does He Love You" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba\'s album "Greatest Hits Volume Two". It is one of country music\'s several songs about a love triangle. "Does He Love You" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members',
|
||||
'label': 'positive',
|
||||
'external_id': '11828866'},
|
||||
{'title': 'When the Nightingale Sings',
|
||||
'text': "When the Nightingale Sings When The Nightingale Sings is a Middle English poem, author unknown, recorded in the British Library's Harley 2253 manuscript, verse 25. It is a love poem, extolling the beauty and lost love of an unknown maiden. When þe nyhtegale singes þe wodes waxen grene.<br> Lef ant gras ant blosme springes in aueryl y wene,<br> Ant love is to myn herte gon wiþ one spere so kene<br> Nyht ant day my blod hit drynkes myn herte deþ me tene. Ich have loved al þis er þat y may love namore,<br> Ich have siked moni syk lemmon for",
|
||||
'label': 'hard_negative',
|
||||
'external_id': '10891637'}]
|
||||
]
|
||||
|
||||
"""
|
||||
# get remote dataset if needed
|
||||
if not (os.path.exists(file)):
|
||||
logger.info(f" Couldn't find {file} locally. Trying to download ...")
|
||||
_download_extract_downstream_data(file, proxies=proxies)
|
||||
|
||||
if file.suffix.lower() == ".jsonl":
|
||||
dicts = []
|
||||
with open(file, encoding='utf-8') as f:
|
||||
for line in f:
|
||||
dicts.append(json.loads(line))
|
||||
else:
|
||||
dicts = json.load(open(file, encoding='utf-8'))
|
||||
|
||||
if max_samples:
|
||||
dicts = random.sample(dicts, min(max_samples, len(dicts)))
|
||||
|
||||
# convert DPR dictionary to standard dictionary
|
||||
query_json_keys = ["question", "questions", "query"]
|
||||
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
|
||||
hard_negative_json_keys = ["hard_negative_contexts", "hard_negative_ctxs", "hard_negative_context", "hard_negative_ctx"]
|
||||
standard_dicts = []
|
||||
for dict in dicts:
|
||||
sample = {}
|
||||
passages = []
|
||||
for key, val in dict.items():
|
||||
if key in query_json_keys:
|
||||
sample["query"] = val
|
||||
elif key in positive_context_json_keys:
|
||||
if shuffle_positives:
|
||||
random.shuffle(val)
|
||||
for passage in val[:num_positives]:
|
||||
passages.append({
|
||||
"title": passage["title"],
|
||||
"text": passage["text"],
|
||||
"label": "positive",
|
||||
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
|
||||
})
|
||||
elif key in hard_negative_json_keys:
|
||||
if shuffle_negatives:
|
||||
random.shuffle(val)
|
||||
for passage in val[:num_hard_negatives]:
|
||||
passages.append({
|
||||
"title": passage["title"],
|
||||
"text": passage["text"],
|
||||
"label": "hard_negative",
|
||||
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
|
||||
})
|
||||
sample["passages"] = passages
|
||||
standard_dicts.append(sample)
|
||||
return standard_dicts
|
||||
|
||||
|
||||
def read_squad_file(filename, proxies=None):
|
||||
"""Read a SQuAD json file"""
|
||||
if not (os.path.exists(filename)):
|
||||
logger.info(f" Couldn't find {filename} locally. Trying to download ...")
|
||||
_download_extract_downstream_data(filename, proxies)
|
||||
with open(filename, "r", encoding="utf-8") as reader:
|
||||
input_data = json.load(reader)["data"]
|
||||
return input_data
|
||||
|
||||
|
||||
def write_squad_predictions(predictions, out_filename, predictions_filename=None):
|
||||
predictions_json = {}
|
||||
for x in predictions:
|
||||
for p in x["predictions"]:
|
||||
if p["answers"][0]["answer"] is not None:
|
||||
predictions_json[p["question_id"]] = p["answers"][0]["answer"]
|
||||
else:
|
||||
predictions_json[p["question_id"]] = "" #convert No answer = None to format understood by the SQuAD eval script
|
||||
|
||||
if predictions_filename:
|
||||
dev_labels = {}
|
||||
temp = json.load(open(predictions_filename, "r"))
|
||||
for d in temp["data"]:
|
||||
for p in d["paragraphs"]:
|
||||
for q in p["qas"]:
|
||||
if q.get("is_impossible",False):
|
||||
dev_labels[q["id"]] = "is_impossible"
|
||||
else:
|
||||
dev_labels[q["id"]] = q["answers"][0]["text"]
|
||||
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
|
||||
if len(not_included) > 0:
|
||||
logger.info(f"There were missing predicitons for question ids: {list(not_included)}")
|
||||
for x in not_included:
|
||||
predictions_json[x] = ""
|
||||
|
||||
# os.makedirs("model_output", exist_ok=True)
|
||||
# filepath = Path("model_output") / out_filename
|
||||
json.dump(predictions_json, open(out_filename, "w"))
|
||||
logger.info(f"Written Squad predictions to: {out_filename}")
|
||||
|
||||
def http_get(url, temp_file, proxies=None):
|
||||
req = requests.get(url, stream=True, proxies=proxies)
|
||||
content_length = req.headers.get("Content-Length")
|
||||
total = int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit="B", total=total)
|
||||
for chunk in req.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
progress.update(len(chunk))
|
||||
temp_file.write(chunk)
|
||||
progress.close()
|
||||
|
||||
def grouper(iterable, n, worker_id=0, total_workers=1):
|
||||
"""
|
||||
Split an iterable into a list of n-sized chunks. Each element in the chunk is a tuple of (index_num, element).
|
||||
|
||||
Example:
|
||||
list(grouper('ABCDEFG', 3))
|
||||
[[(0, 'A'), (1, 'B'), (2, 'C')], [(3, 'D'), (4, 'E'), (5, 'F')], [(6, 'G')]]
|
||||
|
||||
|
||||
Use with the StreamingDataSilo
|
||||
|
||||
When StreamingDataSilo is used with multiple PyTorch DataLoader workers, the generator
|
||||
yielding dicts(that gets converted to datasets) is replicated across the workers.
|
||||
|
||||
To avoid duplicates, we split the dicts across workers by creating a new generator for
|
||||
each worker using this method.
|
||||
|
||||
Input --> [dictA, dictB, dictC, dictD, dictE, ...] with total worker=3 and n=2
|
||||
|
||||
Output for worker 1: [(dictA, dictB), (dictG, dictH), ...]
|
||||
Output for worker 2: [(dictC, dictD), (dictI, dictJ), ...]
|
||||
Output for worker 3: [(dictE, dictF), (dictK, dictL), ...]
|
||||
|
||||
This method also adds an index number to every dict yielded.
|
||||
|
||||
:param iterable: a generator object that yields dicts
|
||||
:type iterable: generator
|
||||
:param n: the dicts are grouped in n-sized chunks that gets converted to datasets
|
||||
:type n: int
|
||||
:param worker_id: the worker_id for the PyTorch DataLoader
|
||||
:type worker_id: int
|
||||
:param total_workers: total number of workers for the PyTorch DataLoader
|
||||
:type total_workers: int
|
||||
"""
|
||||
# TODO make me comprehensible :)
|
||||
def get_iter_start_pos(gen):
|
||||
start_pos = worker_id * n
|
||||
for i in gen:
|
||||
if start_pos:
|
||||
start_pos -= 1
|
||||
continue
|
||||
yield i
|
||||
|
||||
def filter_elements_per_worker(gen):
|
||||
x = n
|
||||
y = (total_workers - 1) * n
|
||||
for i in gen:
|
||||
if x:
|
||||
yield i
|
||||
x -= 1
|
||||
else:
|
||||
if y != 1:
|
||||
y -= 1
|
||||
continue
|
||||
else:
|
||||
x = n
|
||||
y = (total_workers - 1) * n
|
||||
|
||||
iterable = iter(enumerate(iterable))
|
||||
iterable = get_iter_start_pos(iterable)
|
||||
if total_workers > 1:
|
||||
iterable = filter_elements_per_worker(iterable)
|
||||
|
||||
return iter(lambda: list(islice(iterable, n)), [])
|
||||
|
||||
def _download_extract_downstream_data(input_file, proxies=None):
|
||||
# download archive to temp dir and extract to correct position
|
||||
full_path = Path(os.path.realpath(input_file))
|
||||
directory = full_path.parent
|
||||
taskname = directory.stem
|
||||
datadir = directory.parent
|
||||
logger.info(
|
||||
"downloading and extracting file {} to dir {}".format(taskname, datadir)
|
||||
)
|
||||
if taskname not in DOWNSTREAM_TASK_MAP:
|
||||
logger.error("Cannot download {}. Unknown data source.".format(taskname))
|
||||
else:
|
||||
if os.name == "nt": # make use of NamedTemporaryFile compatible with Windows
|
||||
delete_tmp_file = False
|
||||
else:
|
||||
delete_tmp_file = True
|
||||
with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
|
||||
http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
|
||||
temp_file.flush()
|
||||
temp_file.seek(0) # making tempfile accessible
|
||||
tfile = tarfile.open(temp_file.name)
|
||||
tfile.extractall(datadir)
|
||||
# temp_file gets deleted here
|
||||
|
||||
|
||||
def is_json(x):
|
||||
if issubclass(type(x), Path):
|
||||
return True
|
||||
try:
|
||||
json.dumps(x)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
@ -1,11 +1,10 @@
|
||||
import logging
|
||||
|
||||
from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelWithLMHead, \
|
||||
AutoModelForTokenClassification
|
||||
from transformers import AutoModelForQuestionAnswering
|
||||
|
||||
from haystack.basics.modeling import adaptive_model as am
|
||||
from haystack.basics.modeling.language_model import LanguageModel
|
||||
from haystack.basics.modeling.prediction_head import QuestionAnsweringHead
|
||||
from haystack.modeling.model import adaptive_model as am
|
||||
from haystack.modeling.model.language_model import LanguageModel
|
||||
from haystack.modeling.model.prediction_head import QuestionAnsweringHead
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -1,26 +1,26 @@
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import torch.multiprocessing as mp
|
||||
import random
|
||||
from contextlib import ExitStack
|
||||
from functools import partial
|
||||
import random
|
||||
from pathlib import Path
|
||||
from itertools import groupby
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Tuple, Dict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
from torch.utils.data import ConcatDataset, Dataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from torch.utils.data.sampler import RandomSampler, SequentialSampler
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from haystack.basics.data_handler.dataloader import NamedDataLoader
|
||||
from haystack.basics.data_handler.processor import Processor
|
||||
from haystack.basics.data_handler.utils import grouper
|
||||
from haystack.basics.utils import MLFlowLogger as MlLogger
|
||||
from haystack.basics.utils import log_ascii_workers, calc_chunksize
|
||||
from haystack.basics.utils import get_dict_checksum
|
||||
from haystack.basics.visual.ascii.images import TRACTOR_SMALL
|
||||
from haystack.modeling.data_handler.dataloader import NamedDataLoader
|
||||
from haystack.modeling.data_handler.processor import Processor
|
||||
from haystack.modeling.utils import MLFlowLogger as MlLogger
|
||||
from haystack.modeling.visual.ascii.images import TRACTOR_SMALL, WORKER_F, WORKER_M, WORKER_X
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -713,4 +713,117 @@ class DataSiloForCrossVal:
|
||||
current_test_set = split
|
||||
current_train_set = np.hstack(np.delete(splits, idx, axis=0))
|
||||
|
||||
yield current_train_set, current_test_set
|
||||
yield current_train_set, current_test_set
|
||||
|
||||
|
||||
def calc_chunksize(num_dicts, min_chunksize=4, max_chunksize=2000, max_processes=128):
|
||||
if mp.cpu_count() > 3:
|
||||
num_cpus = min(mp.cpu_count() - 1 or 1, max_processes) # -1 to keep a CPU core free for xxx
|
||||
else:
|
||||
num_cpus = min(mp.cpu_count(), max_processes) # when there are few cores, we use all of them
|
||||
|
||||
dicts_per_cpu = np.ceil(num_dicts / num_cpus)
|
||||
# automatic adjustment of multiprocessing chunksize
|
||||
# for small files (containing few dicts) we want small chunksize to ulitize all available cores but never less
|
||||
# than 2, because we need it to sample another random sentence in LM finetuning
|
||||
# for large files we want to minimize processor spawning without giving too much data to one process, so we
|
||||
# clip it at 5k
|
||||
multiprocessing_chunk_size = int(np.clip((np.ceil(dicts_per_cpu / 5)), a_min=min_chunksize, a_max=max_chunksize))
|
||||
# This lets us avoid cases in lm_finetuning where a chunk only has a single doc and hence cannot pick
|
||||
# a valid next sentence substitute from another document
|
||||
if num_dicts != 1:
|
||||
while num_dicts % multiprocessing_chunk_size == 1:
|
||||
multiprocessing_chunk_size -= -1
|
||||
dict_batches_to_process = int(num_dicts / multiprocessing_chunk_size)
|
||||
num_processes = min(num_cpus, dict_batches_to_process) or 1
|
||||
|
||||
return multiprocessing_chunk_size, num_processes
|
||||
|
||||
def log_ascii_workers(n, logger):
|
||||
m_worker_lines = WORKER_M.split("\n")
|
||||
f_worker_lines = WORKER_F.split("\n")
|
||||
x_worker_lines = WORKER_X.split("\n")
|
||||
all_worker_lines = []
|
||||
for i in range(n):
|
||||
rand = np.random.randint(low=0,high=3)
|
||||
if(rand % 3 == 0):
|
||||
all_worker_lines.append(f_worker_lines)
|
||||
elif(rand % 3 == 1):
|
||||
all_worker_lines.append(m_worker_lines)
|
||||
else:
|
||||
all_worker_lines.append(x_worker_lines)
|
||||
zipped = zip(*all_worker_lines)
|
||||
for z in zipped:
|
||||
logger.info(" ".join(z))
|
||||
|
||||
def get_dict_checksum(payload_dict):
|
||||
"""
|
||||
Get MD5 checksum for a dict.
|
||||
"""
|
||||
checksum = hashlib.md5(json.dumps(payload_dict, sort_keys=True).encode("utf-8")).hexdigest()
|
||||
return checksum
|
||||
|
||||
def grouper(iterable, n, worker_id=0, total_workers=1):
|
||||
"""
|
||||
Split an iterable into a list of n-sized chunks. Each element in the chunk is a tuple of (index_num, element).
|
||||
|
||||
Example:
|
||||
list(grouper('ABCDEFG', 3))
|
||||
[[(0, 'A'), (1, 'B'), (2, 'C')], [(3, 'D'), (4, 'E'), (5, 'F')], [(6, 'G')]]
|
||||
|
||||
|
||||
Use with the StreamingDataSilo
|
||||
|
||||
When StreamingDataSilo is used with multiple PyTorch DataLoader workers, the generator
|
||||
yielding dicts(that gets converted to datasets) is replicated across the workers.
|
||||
|
||||
To avoid duplicates, we split the dicts across workers by creating a new generator for
|
||||
each worker using this method.
|
||||
|
||||
Input --> [dictA, dictB, dictC, dictD, dictE, ...] with total worker=3 and n=2
|
||||
|
||||
Output for worker 1: [(dictA, dictB), (dictG, dictH), ...]
|
||||
Output for worker 2: [(dictC, dictD), (dictI, dictJ), ...]
|
||||
Output for worker 3: [(dictE, dictF), (dictK, dictL), ...]
|
||||
|
||||
This method also adds an index number to every dict yielded.
|
||||
|
||||
:param iterable: a generator object that yields dicts
|
||||
:type iterable: generator
|
||||
:param n: the dicts are grouped in n-sized chunks that gets converted to datasets
|
||||
:type n: int
|
||||
:param worker_id: the worker_id for the PyTorch DataLoader
|
||||
:type worker_id: int
|
||||
:param total_workers: total number of workers for the PyTorch DataLoader
|
||||
:type total_workers: int
|
||||
"""
|
||||
# TODO make me comprehensible :)
|
||||
def get_iter_start_pos(gen):
|
||||
start_pos = worker_id * n
|
||||
for i in gen:
|
||||
if start_pos:
|
||||
start_pos -= 1
|
||||
continue
|
||||
yield i
|
||||
|
||||
def filter_elements_per_worker(gen):
|
||||
x = n
|
||||
y = (total_workers - 1) * n
|
||||
for i in gen:
|
||||
if x:
|
||||
yield i
|
||||
x -= 1
|
||||
else:
|
||||
if y != 1:
|
||||
y -= 1
|
||||
continue
|
||||
else:
|
||||
x = n
|
||||
y = (total_workers - 1) * n
|
||||
|
||||
iterable = iter(enumerate(iterable))
|
||||
iterable = get_iter_start_pos(iterable)
|
||||
if total_workers > 1:
|
||||
iterable = filter_elements_per_worker(iterable)
|
||||
|
||||
return iter(lambda: list(islice(iterable, n)), [])
|
||||
@ -1,7 +1,7 @@
|
||||
from math import ceil
|
||||
|
||||
from torch.utils.data import DataLoader, Dataset, Sampler
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, Dataset, Sampler
|
||||
|
||||
|
||||
class NamedDataLoader(DataLoader):
|
||||
@ -6,7 +6,7 @@ import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import Dataset, ConcatDataset, TensorDataset
|
||||
|
||||
from haystack.basics.utils import flatten_list
|
||||
from haystack.modeling.utils import flatten_list
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -3,31 +3,36 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import tarfile
|
||||
import tempfile
|
||||
import uuid
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
from abc import ABC, abstractmethod
|
||||
from inspect import signature
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Union
|
||||
|
||||
import numpy as np
|
||||
from haystack.basics.modeling.tokenization import (
|
||||
from haystack.modeling.model.tokenization import (
|
||||
Tokenizer,
|
||||
tokenize_batch_question_answering
|
||||
)
|
||||
|
||||
from haystack.basics.data_handler.dataset import convert_features_to_dataset
|
||||
from haystack.basics.data_handler.samples import (
|
||||
from haystack.modeling.data_handler.dataset import convert_features_to_dataset
|
||||
from haystack.modeling.data_handler.samples import (
|
||||
Sample,
|
||||
SampleBasket,
|
||||
get_passage_offsets,
|
||||
offset_to_token_idx_vecorized,
|
||||
)
|
||||
from haystack.basics.data_handler.utils import (
|
||||
read_squad_file,
|
||||
read_dpr_json,
|
||||
is_json,
|
||||
)
|
||||
from haystack.basics.utils import MLFlowLogger as MlLogger
|
||||
from haystack.modeling.utils import MLFlowLogger as MlLogger
|
||||
|
||||
|
||||
DOWNSTREAM_TASK_MAP = {
|
||||
"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
|
||||
"covidqa": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/covidqa.tar.gz",
|
||||
}
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -245,7 +250,7 @@ class Processor(ABC):
|
||||
config = {}
|
||||
# self.__dict__ doesn't give parent class attributes
|
||||
for key, value in inspect.getmembers(self):
|
||||
if is_json(value) and key[0] != "_":
|
||||
if _is_json(value) and key[0] != "_":
|
||||
if issubclass(type(value), Path):
|
||||
value = str(value)
|
||||
config[key] = value
|
||||
@ -451,7 +456,7 @@ class SquadProcessor(Processor):
|
||||
|
||||
|
||||
def file_to_dicts(self, file: str) -> List[dict]:
|
||||
nested_dicts = read_squad_file(filename=file)
|
||||
nested_dicts = _read_squad_file(filename=file)
|
||||
dicts = [y for x in nested_dicts for y in x["paragraphs"]]
|
||||
return dicts
|
||||
|
||||
@ -985,7 +990,7 @@ class TextSimilarityProcessor(Processor):
|
||||
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
|
||||
...]}
|
||||
"""
|
||||
dicts = read_dpr_json(file, max_samples=self.max_samples, num_hard_negatives=self.num_hard_negatives, num_positives=self.num_positives, shuffle_negatives=self.shuffle_negatives, shuffle_positives=self.shuffle_positives)
|
||||
dicts = _read_dpr_json(file, max_samples=self.max_samples, num_hard_negatives=self.num_hard_negatives, num_positives=self.num_positives, shuffle_negatives=self.shuffle_negatives, shuffle_positives=self.shuffle_positives)
|
||||
|
||||
# shuffle dicts to make sure that similar positive passages do not end up in one batch
|
||||
dicts = random.sample(dicts, len(dicts))
|
||||
@ -1147,3 +1152,167 @@ class TextSimilarityProcessor(Processor):
|
||||
f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' ")
|
||||
res.append(tuple((title, ctx)))
|
||||
return res
|
||||
|
||||
|
||||
# helper fcts
|
||||
def write_squad_predictions(predictions, out_filename, predictions_filename=None):
|
||||
predictions_json = {}
|
||||
for x in predictions:
|
||||
for p in x["predictions"]:
|
||||
if p["answers"][0]["answer"] is not None:
|
||||
predictions_json[p["question_id"]] = p["answers"][0]["answer"]
|
||||
else:
|
||||
predictions_json[p["question_id"]] = "" #convert No answer = None to format understood by the SQuAD eval script
|
||||
|
||||
if predictions_filename:
|
||||
dev_labels = {}
|
||||
temp = json.load(open(predictions_filename, "r"))
|
||||
for d in temp["data"]:
|
||||
for p in d["paragraphs"]:
|
||||
for q in p["qas"]:
|
||||
if q.get("is_impossible",False):
|
||||
dev_labels[q["id"]] = "is_impossible"
|
||||
else:
|
||||
dev_labels[q["id"]] = q["answers"][0]["text"]
|
||||
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
|
||||
if len(not_included) > 0:
|
||||
logger.info(f"There were missing predicitons for question ids: {list(not_included)}")
|
||||
for x in not_included:
|
||||
predictions_json[x] = ""
|
||||
|
||||
# os.makedirs("model_output", exist_ok=True)
|
||||
# filepath = Path("model_output") / out_filename
|
||||
json.dump(predictions_json, open(out_filename, "w"))
|
||||
logger.info(f"Written Squad predictions to: {out_filename}")
|
||||
|
||||
def _read_dpr_json(file, max_samples=None, proxies=None, num_hard_negatives=1, num_positives=1, shuffle_negatives=True, shuffle_positives=False):
|
||||
"""
|
||||
Reads a Dense Passage Retrieval (DPR) data file in json format and returns a list of dictionaries.
|
||||
|
||||
:param file: filename of DPR data in json format
|
||||
|
||||
Returns:
|
||||
list of dictionaries: List[dict]
|
||||
each dictionary: {
|
||||
"query": str -> query_text
|
||||
"passages": List[dictionaries] -> [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
|
||||
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
|
||||
...]
|
||||
}
|
||||
example:
|
||||
["query": 'who sings does he love me with reba'
|
||||
"passages" : [{'title': 'Does He Love You',
|
||||
'text': 'Does He Love You "Does He Love You" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba\'s album "Greatest Hits Volume Two". It is one of country music\'s several songs about a love triangle. "Does He Love You" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members',
|
||||
'label': 'positive',
|
||||
'external_id': '11828866'},
|
||||
{'title': 'When the Nightingale Sings',
|
||||
'text': "When the Nightingale Sings When The Nightingale Sings is a Middle English poem, author unknown, recorded in the British Library's Harley 2253 manuscript, verse 25. It is a love poem, extolling the beauty and lost love of an unknown maiden. When þe nyhtegale singes þe wodes waxen grene.<br> Lef ant gras ant blosme springes in aueryl y wene,<br> Ant love is to myn herte gon wiþ one spere so kene<br> Nyht ant day my blod hit drynkes myn herte deþ me tene. Ich have loved al þis er þat y may love namore,<br> Ich have siked moni syk lemmon for",
|
||||
'label': 'hard_negative',
|
||||
'external_id': '10891637'}]
|
||||
]
|
||||
|
||||
"""
|
||||
# get remote dataset if needed
|
||||
if not (os.path.exists(file)):
|
||||
logger.info(f" Couldn't find {file} locally. Trying to download ...")
|
||||
_download_extract_downstream_data(file, proxies=proxies)
|
||||
|
||||
if file.suffix.lower() == ".jsonl":
|
||||
dicts = []
|
||||
with open(file, encoding='utf-8') as f:
|
||||
for line in f:
|
||||
dicts.append(json.loads(line))
|
||||
else:
|
||||
dicts = json.load(open(file, encoding='utf-8'))
|
||||
|
||||
if max_samples:
|
||||
dicts = random.sample(dicts, min(max_samples, len(dicts)))
|
||||
|
||||
# convert DPR dictionary to standard dictionary
|
||||
query_json_keys = ["question", "questions", "query"]
|
||||
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
|
||||
hard_negative_json_keys = ["hard_negative_contexts", "hard_negative_ctxs", "hard_negative_context", "hard_negative_ctx"]
|
||||
standard_dicts = []
|
||||
for dict in dicts:
|
||||
sample = {}
|
||||
passages = []
|
||||
for key, val in dict.items():
|
||||
if key in query_json_keys:
|
||||
sample["query"] = val
|
||||
elif key in positive_context_json_keys:
|
||||
if shuffle_positives:
|
||||
random.shuffle(val)
|
||||
for passage in val[:num_positives]:
|
||||
passages.append({
|
||||
"title": passage["title"],
|
||||
"text": passage["text"],
|
||||
"label": "positive",
|
||||
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
|
||||
})
|
||||
elif key in hard_negative_json_keys:
|
||||
if shuffle_negatives:
|
||||
random.shuffle(val)
|
||||
for passage in val[:num_hard_negatives]:
|
||||
passages.append({
|
||||
"title": passage["title"],
|
||||
"text": passage["text"],
|
||||
"label": "hard_negative",
|
||||
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
|
||||
})
|
||||
sample["passages"] = passages
|
||||
standard_dicts.append(sample)
|
||||
return standard_dicts
|
||||
|
||||
|
||||
def _read_squad_file(filename, proxies=None):
|
||||
"""Read a SQuAD json file"""
|
||||
if not (os.path.exists(filename)):
|
||||
logger.info(f" Couldn't find {filename} locally. Trying to download ...")
|
||||
_download_extract_downstream_data(filename, proxies)
|
||||
with open(filename, "r", encoding="utf-8") as reader:
|
||||
input_data = json.load(reader)["data"]
|
||||
return input_data
|
||||
|
||||
def _http_get(url, temp_file, proxies=None):
|
||||
req = requests.get(url, stream=True, proxies=proxies)
|
||||
content_length = req.headers.get("Content-Length")
|
||||
total = int(content_length) if content_length is not None else None
|
||||
progress = tqdm(unit="B", total=total)
|
||||
for chunk in req.iter_content(chunk_size=1024):
|
||||
if chunk: # filter out keep-alive new chunks
|
||||
progress.update(len(chunk))
|
||||
temp_file.write(chunk)
|
||||
progress.close()
|
||||
|
||||
def _download_extract_downstream_data(input_file, proxies=None):
|
||||
# download archive to temp dir and extract to correct position
|
||||
full_path = Path(os.path.realpath(input_file))
|
||||
directory = full_path.parent
|
||||
taskname = directory.stem
|
||||
datadir = directory.parent
|
||||
logger.info(
|
||||
"downloading and extracting file {} to dir {}".format(taskname, datadir)
|
||||
)
|
||||
if taskname not in DOWNSTREAM_TASK_MAP:
|
||||
logger.error("Cannot download {}. Unknown data source.".format(taskname))
|
||||
else:
|
||||
if os.name == "nt": # make use of NamedTemporaryFile compatible with Windows
|
||||
delete_tmp_file = False
|
||||
else:
|
||||
delete_tmp_file = True
|
||||
with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
|
||||
_http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
|
||||
temp_file.flush()
|
||||
temp_file.seek(0) # making tempfile accessible
|
||||
tfile = tarfile.open(temp_file.name)
|
||||
tfile.extractall(datadir)
|
||||
# temp_file gets deleted here
|
||||
|
||||
def _is_json(x):
|
||||
if issubclass(type(x), Path):
|
||||
return True
|
||||
try:
|
||||
json.dumps(x)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
@ -2,7 +2,8 @@ import logging
|
||||
from typing import Union, Optional, List
|
||||
|
||||
import numpy as np
|
||||
from haystack.basics.visual.ascii.images import SAMPLE
|
||||
|
||||
from haystack.modeling.visual.ascii.images import SAMPLE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -1,17 +1,16 @@
|
||||
import logging
|
||||
import numbers
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from tqdm import tqdm
|
||||
import torch
|
||||
import numbers
|
||||
import logging
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from haystack.basics.evaluation.metrics import compute_metrics, compute_report_metrics
|
||||
from haystack.basics.utils import to_numpy
|
||||
from haystack.basics.utils import MLFlowLogger as MlLogger
|
||||
from haystack.basics.modeling.adaptive_model import AdaptiveModel
|
||||
from haystack.basics.visual.ascii.images import BUSH_SEP
|
||||
from haystack.modeling.evaluation.metrics import compute_metrics, compute_report_metrics
|
||||
from haystack.modeling.model.adaptive_model import AdaptiveModel
|
||||
from haystack.modeling.utils import MLFlowLogger as MlLogger
|
||||
from haystack.modeling.visual.ascii.images import BUSH_SEP
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -72,14 +71,14 @@ class Evaluator:
|
||||
|
||||
# stack results of all batches per prediction head
|
||||
for head_num, head in enumerate(model.prediction_heads):
|
||||
loss_all[head_num] += np.sum(to_numpy(losses_per_head[head_num]))
|
||||
preds_all[head_num] += list(to_numpy(preds[head_num]))
|
||||
label_all[head_num] += list(to_numpy(labels[head_num]))
|
||||
loss_all[head_num] += np.sum(_to_numpy(losses_per_head[head_num]))
|
||||
preds_all[head_num] += list(_to_numpy(preds[head_num]))
|
||||
label_all[head_num] += list(_to_numpy(labels[head_num]))
|
||||
if head.model_type == "span_classification":
|
||||
ids_all[head_num] += list(to_numpy(batch["id"]))
|
||||
passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"]))
|
||||
ids_all[head_num] += list(_to_numpy(batch["id"]))
|
||||
passage_start_t_all[head_num] += list(_to_numpy(batch["passage_start_t"]))
|
||||
if calibrate_conf_scores:
|
||||
logits_all[head_num] += list(to_numpy(logits))
|
||||
logits_all[head_num] += list(_to_numpy(logits))
|
||||
|
||||
|
||||
# Evaluate per prediction head
|
||||
@ -165,3 +164,10 @@ class Evaluator:
|
||||
else:
|
||||
if not metric_name in ["preds", "labels"] and not metric_name.startswith("_"):
|
||||
logger.info("{}: {}".format(metric_name, metric_val))
|
||||
|
||||
|
||||
def _to_numpy(container):
|
||||
try:
|
||||
return container.cpu().numpy()
|
||||
except AttributeError:
|
||||
return container
|
||||
@ -1,8 +1,8 @@
|
||||
import logging
|
||||
from functools import reduce
|
||||
from typing import Callable, Dict, List
|
||||
|
||||
import numpy as np
|
||||
from functools import reduce
|
||||
from scipy.stats import pearsonr, spearmanr
|
||||
from seqeval.metrics import classification_report as token_classification_report
|
||||
from sklearn.metrics import (
|
||||
@ -13,9 +13,8 @@ from sklearn.metrics import (
|
||||
classification_report
|
||||
)
|
||||
|
||||
from haystack.basics.utils import flatten_list
|
||||
|
||||
from haystack.basics.modeling.prediction_head import PredictionHead
|
||||
from haystack.modeling.model.prediction_head import PredictionHead
|
||||
from haystack.modeling.utils import flatten_list
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -1,23 +1,22 @@
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Dict, Union, List, Optional, Callable
|
||||
|
||||
import multiprocessing
|
||||
import numpy
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import AutoConfig
|
||||
from transformers.convert_graph_to_onnx import convert, quantize as quantize_model
|
||||
|
||||
|
||||
from haystack.basics.data_handler.processor import Processor
|
||||
from haystack.basics.modeling.language_model import LanguageModel
|
||||
from haystack.basics.modeling.prediction_head import PredictionHead
|
||||
from haystack.basics.utils import MLFlowLogger as MlLogger
|
||||
import haystack.basics.conversion.transformers as conv
|
||||
import haystack.modeling.conversion.transformers as conv
|
||||
from haystack.modeling.data_handler.processor import Processor
|
||||
from haystack.modeling.model.language_model import LanguageModel
|
||||
from haystack.modeling.model.prediction_head import PredictionHead
|
||||
from haystack.modeling.utils import MLFlowLogger as MlLogger
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -103,28 +102,6 @@ class BaseAdaptiveModel:
|
||||
elif type(preds) == dict and "predictions" in preds:
|
||||
preds_final.append(preds)
|
||||
|
||||
# todo> remove?
|
||||
# else:
|
||||
# preds_final = [list() for _ in range(n_heads)]
|
||||
# preds = kwargs["preds"]
|
||||
# preds_for_heads = stack(preds)
|
||||
# logits_for_heads = [None] * n_heads
|
||||
#
|
||||
# samples = [s for b in kwargs["baskets"] for s in b.samples]
|
||||
# kwargs["samples"] = samples
|
||||
#
|
||||
# del kwargs["preds"]
|
||||
#
|
||||
# for i, (head, preds_for_head, logits_for_head) in enumerate(zip(self.prediction_heads, preds_for_heads, # type: ignore
|
||||
# logits_for_heads)):
|
||||
# preds = head.formatted_preds(logits=logits_for_head, preds=preds_for_head, **kwargs)
|
||||
# preds_final[i].append(preds)
|
||||
#
|
||||
# # Look for a merge() function amongst the heads and if a single one exists, apply it to preds_final
|
||||
# merge_fn = pick_single_fn(self.prediction_heads, "merge_formatted_preds")
|
||||
# if merge_fn:
|
||||
# preds_final = merge_fn(preds_final)
|
||||
|
||||
return preds_final
|
||||
|
||||
def connect_heads_with_processor(self, tasks: Dict, require_labels: bool = True):
|
||||
@ -196,7 +173,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
|
||||
embeds_dropout_prob: float,
|
||||
lm_output_types: Union[str, List[str]],
|
||||
device: str,
|
||||
loss_aggregation_fn=Optional[Callable],
|
||||
loss_aggregation_fn: Optional[Callable] = None,
|
||||
):
|
||||
"""
|
||||
:param language_model: Any model that turns token ids into vector representations.
|
||||
@ -221,7 +198,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
|
||||
shape (batchsize) per prediction head.
|
||||
"""
|
||||
|
||||
super(AdaptiveModel, self).__init__(prediction_heads)
|
||||
super(AdaptiveModel, self).__init__() # type: ignore
|
||||
self.device = device
|
||||
self.language_model = language_model.to(device)
|
||||
self.lm_output_dims = language_model.get_output_dims()
|
||||
@ -6,10 +6,10 @@ from typing import List, Optional, Callable, Union, Dict
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from haystack.basics.data_handler.processor import Processor
|
||||
from haystack.basics.modeling.language_model import LanguageModel
|
||||
from haystack.basics.modeling.prediction_head import PredictionHead, TextSimilarityHead
|
||||
from haystack.basics.utils import MLFlowLogger as MlLogger
|
||||
from haystack.modeling.data_handler.processor import Processor
|
||||
from haystack.modeling.model.language_model import LanguageModel
|
||||
from haystack.modeling.model.prediction_head import PredictionHead, TextSimilarityHead
|
||||
from haystack.modeling.utils import MLFlowLogger as MlLogger
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -1,16 +1,12 @@
|
||||
#TODO analyse if this optimization is needed or weather we can use HF transformers code
|
||||
|
||||
from importlib import import_module
|
||||
import inspect
|
||||
import logging
|
||||
import sys
|
||||
import inspect
|
||||
from importlib import import_module
|
||||
|
||||
import torch
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
from torch.nn import DataParallel
|
||||
|
||||
# Used indirectly in _get_optim() to avoid name collision with torch's AdamW
|
||||
from transformers.optimization import AdamW as TransformersAdamW
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
|
||||
try:
|
||||
from apex import amp
|
||||
@ -24,7 +20,7 @@ except ImportError:
|
||||
AMP_AVAILABLE = False
|
||||
APEX_PARALLEL_AVAILABLE = False
|
||||
|
||||
from haystack.basics.utils import MLFlowLogger as MlLogger
|
||||
from haystack.modeling.utils import MLFlowLogger as MlLogger
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -85,7 +81,7 @@ def initialize_optimizer(model,
|
||||
transformers.optimization by supplying the class name and the parameters for the constructor.
|
||||
Examples:
|
||||
1) AdamW from Transformers (Default):
|
||||
{"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
|
||||
{"name": "AdamW", "correct_bias": False, "weight_decay": 0.01}
|
||||
2) SGD from pytorch:
|
||||
{"name": "SGD", "momentum": 0.0}
|
||||
3) FusedLAMB from apex:
|
||||
@ -132,7 +128,7 @@ def initialize_optimizer(model,
|
||||
|
||||
# Use some defaults to simplify life of inexperienced users
|
||||
if optimizer_opts is None:
|
||||
optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
|
||||
optimizer_opts = {"name": "AdamW", "correct_bias": False, "weight_decay": 0.01}
|
||||
optimizer_opts["lr"] = learning_rate
|
||||
|
||||
if schedule_opts is None:
|
||||
@ -1,19 +1,19 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
from pathlib import Path
|
||||
from transformers import AutoModelForQuestionAnswering
|
||||
from typing import List, Tuple, Optional, Union, Dict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch import optim
|
||||
from torch.nn import CrossEntropyLoss, NLLLoss
|
||||
from haystack.basics.data_handler.utils import is_json
|
||||
from haystack.basics.utils import try_get, all_gather_list
|
||||
from haystack.basics.modeling.predictions import QACandidate, QAPred
|
||||
from transformers import AutoModelForQuestionAnswering
|
||||
|
||||
from haystack.modeling.data_handler.samples import SampleBasket
|
||||
from haystack.modeling.model.predictions import QACandidate, QAPred
|
||||
from haystack.modeling.utils import try_get, all_gather_list
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -88,7 +88,7 @@ class PredictionHead(nn.Module):
|
||||
for key, value in self.__dict__.items():
|
||||
if type(value) is np.ndarray:
|
||||
value = value.tolist()
|
||||
if is_json(value) and key[0] != "_":
|
||||
if _is_json(value) and key[0] != "_":
|
||||
config[key] = value
|
||||
if self.task_name == "text_similarity" and key == "similarity_function":
|
||||
config['similarity_function'] = value
|
||||
@ -166,7 +166,7 @@ class PredictionHead(nn.Module):
|
||||
self.feed_forward.layer_dims[0] = input_dim
|
||||
|
||||
@classmethod
|
||||
def _get_model_file(cls, config_file):
|
||||
def _get_model_file(cls, config_file: Union[str, Path]):
|
||||
if "config.json" in str(config_file) and "prediction_head" in str(config_file):
|
||||
head_num = int("".join([char for char in os.path.basename(config_file) if char.isdigit()]))
|
||||
model_file = Path(os.path.dirname(config_file)) / f"prediction_head_{head_num}.bin"
|
||||
@ -198,7 +198,7 @@ class FeedForwardBlock(nn.Module):
|
||||
layers_all.append(layer)
|
||||
self.feed_forward = nn.Sequential(*layers_all)
|
||||
|
||||
def forward(self, X):
|
||||
def forward(self, X: torch.Tensor):
|
||||
logits = self.feed_forward(X)
|
||||
return logits
|
||||
|
||||
@ -271,7 +271,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
|
||||
|
||||
@classmethod
|
||||
def load(cls, pretrained_model_name_or_path: Union[str, Path], revision: Optional[str] = None, **kwargs): # type: ignore[override]
|
||||
def load(cls, pretrained_model_name_or_path: Union[str, Path], revision: Optional[str] = None, **kwargs): # type: ignore
|
||||
"""
|
||||
Load a prediction head from a saved Haystack or transformers model. `pretrained_model_name_or_path`
|
||||
can be one of the following:
|
||||
@ -306,7 +306,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
|
||||
return head
|
||||
|
||||
def forward(self, X):
|
||||
def forward(self, X: torch.Tensor):
|
||||
"""
|
||||
One forward pass through the prediction head model, starting with language model output on token level
|
||||
|
||||
@ -314,7 +314,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
logits = self.feed_forward(X)
|
||||
return self.temperature_scale(logits)
|
||||
|
||||
def logits_to_loss(self, logits, labels, **kwargs):
|
||||
def logits_to_loss(self, logits: torch.Tensor, labels: torch.Tensor, **kwargs):
|
||||
"""
|
||||
Combine predictions and labels to a per sample loss.
|
||||
"""
|
||||
@ -346,11 +346,11 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
return per_sample_loss
|
||||
|
||||
|
||||
def temperature_scale(self, logits):
|
||||
def temperature_scale(self, logits: torch.Tensor):
|
||||
return torch.div(logits, self.temperature_for_confidence)
|
||||
|
||||
|
||||
def calibrate_conf(self, logits, label_all):
|
||||
def calibrate_conf(self, logits: List[torch.Tensor], label_all: List[torch.Tensor]):
|
||||
"""
|
||||
Learning a temperature parameter to apply temperature scaling to calibrate confidence scores
|
||||
"""
|
||||
@ -389,8 +389,8 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
optimizer.step(eval_start_end_logits)
|
||||
|
||||
|
||||
def logits_to_preds(self, logits, span_mask, start_of_word,
|
||||
seq_2_start_t, max_answer_length=1000, **kwargs):
|
||||
def logits_to_preds(self, logits: torch.Tensor, span_mask: torch.Tensor, start_of_word: torch.Tensor,
|
||||
seq_2_start_t: torch.Tensor, max_answer_length: int = 1000, **kwargs):
|
||||
"""
|
||||
Get the predicted index of start and end token of the answer. Note that the output is at token level
|
||||
and not word level. Note also that these logits correspond to the tokens of a sample
|
||||
@ -462,7 +462,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
|
||||
return all_top_n
|
||||
|
||||
def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx, start_matrix = None, end_matrix = None):
|
||||
def get_top_candidates(self, sorted_candidates: torch.Tensor, start_end_matrix: torch.Tensor, sample_idx: int, start_matrix: torch.Tensor, end_matrix: torch.Tensor):
|
||||
""" Returns top candidate answers as a list of Span objects. Operates on a matrix of summed start and end logits.
|
||||
This matrix corresponds to a single sample (includes special tokens, question tokens, passage tokens).
|
||||
This method always returns a list of len n_best + 1 (it is comprised of the n_best positive answers along with the one no_answer)"""
|
||||
@ -518,7 +518,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
|
||||
return top_candidates
|
||||
|
||||
def formatted_preds(self, logits=None, preds=None, baskets=None, **kwargs):
|
||||
def formatted_preds(self, logits: Optional[torch.Tensor] = None, preds: Optional[List[QACandidate]] = None, baskets: Optional[List[SampleBasket]] = None, **kwargs):
|
||||
""" Takes a list of passage level predictions, each corresponding to one sample, and converts them into document level
|
||||
predictions. Leverages information in the SampleBaskets. Assumes that we are being passed predictions from
|
||||
ALL samples in the one SampleBasket i.e. all passages of a document. Logits should be None, because we have
|
||||
@ -552,7 +552,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
|
||||
return doc_preds
|
||||
|
||||
def to_qa_preds(self, top_preds, no_ans_gaps, baskets):
|
||||
def to_qa_preds(self, top_preds: Tuple[QACandidate], no_ans_gaps: Tuple[float], baskets: Tuple[SampleBasket]):
|
||||
""" Groups Span objects together in a QAPred object """
|
||||
ret = []
|
||||
|
||||
@ -586,7 +586,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
def get_ground_truth(basket):
|
||||
def get_ground_truth(basket: SampleBasket):
|
||||
if "answers" in basket.raw:
|
||||
return basket.raw["answers"]
|
||||
elif "annotations" in basket.raw:
|
||||
@ -595,7 +595,7 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_question(question_names, raw_dict):
|
||||
def get_question(question_names: List[str], raw_dict: Dict):
|
||||
# For NQ style dicts
|
||||
qa_name = None
|
||||
if "qas" in raw_dict:
|
||||
@ -607,12 +607,6 @@ class QuestionAnsweringHead(PredictionHead):
|
||||
return raw_dict[qa_name][0]["question"]
|
||||
return try_get(question_names, raw_dict)
|
||||
|
||||
def has_no_answer_idxs(self, sample_top_n):
|
||||
for start, end, score in sample_top_n:
|
||||
if start == 0 and end == 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
def aggregate_preds(self, preds, passage_start_t, ids, seq_2_start_t=None, labels=None):
|
||||
""" Aggregate passage level predictions to create document level predictions.
|
||||
This method assumes that all passages of each document are contained in preds
|
||||
@ -940,7 +934,7 @@ class TextSimilarityHead(PredictionHead):
|
||||
softmax_scores = nn.functional.log_softmax(scores, dim=1)
|
||||
return softmax_scores
|
||||
|
||||
def logits_to_loss(self, logits: Tuple[torch.Tensor, torch.Tensor], label_ids, **kwargs): # type: ignore[override]
|
||||
def logits_to_loss(self, logits: Tuple[torch.Tensor, torch.Tensor], label_ids, **kwargs): # type: ignore
|
||||
"""
|
||||
Computes the loss (Default: NLLLoss) by applying a similarity function (Default: dot product) to the input
|
||||
tuple of (query_vectors, passage_vectors) and afterwards applying the loss function on similarity scores.
|
||||
@ -1017,7 +1011,7 @@ class TextSimilarityHead(PredictionHead):
|
||||
_, sorted_scores = torch.sort(softmax_scores, dim=1, descending=True)
|
||||
return sorted_scores
|
||||
|
||||
def prepare_labels(self, label_ids, **kwargs) -> torch.Tensor: # type: ignore[override]
|
||||
def prepare_labels(self, label_ids, **kwargs) -> torch.Tensor: # type: ignore
|
||||
"""
|
||||
Returns a tensor with passage labels(0:hard_negative/1:positive) for each query
|
||||
|
||||
@ -1032,4 +1026,13 @@ class TextSimilarityHead(PredictionHead):
|
||||
return labels
|
||||
|
||||
def formatted_preds(self, logits: Tuple[torch.Tensor, torch.Tensor], **kwargs):
|
||||
raise NotImplementedError("formatted_preds is not supported in TextSimilarityHead yet!")
|
||||
raise NotImplementedError("formatted_preds is not supported in TextSimilarityHead yet!")
|
||||
|
||||
def _is_json(x):
|
||||
if issubclass(type(x), Path):
|
||||
return True
|
||||
try:
|
||||
json.dumps(x)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
@ -1,7 +1,6 @@
|
||||
from typing import List, Any, Optional, Tuple, Union, Dict
|
||||
from abc import ABC
|
||||
import logging
|
||||
|
||||
from abc import ABC
|
||||
from typing import List, Any, Optional, Tuple, Union, Dict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -31,11 +31,9 @@ from transformers import (
|
||||
DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast,
|
||||
BigBirdTokenizer, BigBirdTokenizerFast
|
||||
)
|
||||
|
||||
from transformers import AutoConfig
|
||||
|
||||
from haystack.basics.data_handler.samples import SampleBasket
|
||||
|
||||
from haystack.modeling.data_handler.samples import SampleBasket
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
import logging
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple, List
|
||||
|
||||
import torch
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import numpy
|
||||
import shutil
|
||||
import dill
|
||||
import numpy
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from haystack.basics.utils import MLFlowLogger as MlLogger
|
||||
from haystack.basics.utils import GracefulKiller
|
||||
from haystack.basics.eval import Evaluator
|
||||
from haystack.basics.data_handler.data_silo import DataSilo
|
||||
from haystack.basics.modeling.adaptive_model import AdaptiveModel
|
||||
from haystack.basics.modeling.optimization import get_scheduler
|
||||
from haystack.modeling.data_handler.data_silo import DataSilo
|
||||
from haystack.modeling.evaluation.eval import Evaluator
|
||||
from haystack.modeling.model.adaptive_model import AdaptiveModel
|
||||
from haystack.modeling.model.optimization import get_scheduler
|
||||
from haystack.modeling.utils import GracefulKiller
|
||||
from haystack.modeling.utils import MLFlowLogger as MlLogger
|
||||
|
||||
try:
|
||||
from apex import amp
|
||||
1
haystack/modeling/training/dpr.py
Normal file
1
haystack/modeling/training/dpr.py
Normal file
@ -0,0 +1 @@
|
||||
# TODO create DPR_Trainer class here that can be called from retriever.dense.DPR.train()
|
||||
1
haystack/modeling/training/question_answering.py
Normal file
1
haystack/modeling/training/question_answering.py
Normal file
@ -0,0 +1 @@
|
||||
# TODO make QA_Trainer class and use insider reader.train
|
||||
@ -1,19 +1,15 @@
|
||||
import hashlib
|
||||
from torch import multiprocessing as mp
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
import signal
|
||||
from copy import deepcopy
|
||||
|
||||
import mlflow
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from requests.exceptions import ConnectionError
|
||||
import mlflow
|
||||
from copy import deepcopy
|
||||
import pickle
|
||||
import random
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
from haystack.basics.visual.ascii.images import WORKER_M, WORKER_F, WORKER_X
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -202,68 +198,6 @@ def initialize_device_settings(use_cuda, local_rank=-1, use_amp=None):
|
||||
logger.info(f"Automatic Mixed Precision: {use_amp}")
|
||||
return device, n_gpu
|
||||
|
||||
def calc_chunksize(num_dicts, min_chunksize=4, max_chunksize=2000, max_processes=128):
|
||||
if mp.cpu_count() > 3:
|
||||
num_cpus = min(mp.cpu_count() - 1 or 1, max_processes) # -1 to keep a CPU core free for xxx
|
||||
else:
|
||||
num_cpus = min(mp.cpu_count(), max_processes) # when there are few cores, we use all of them
|
||||
|
||||
dicts_per_cpu = np.ceil(num_dicts / num_cpus)
|
||||
# automatic adjustment of multiprocessing chunksize
|
||||
# for small files (containing few dicts) we want small chunksize to ulitize all available cores but never less
|
||||
# than 2, because we need it to sample another random sentence in LM finetuning
|
||||
# for large files we want to minimize processor spawning without giving too much data to one process, so we
|
||||
# clip it at 5k
|
||||
multiprocessing_chunk_size = int(np.clip((np.ceil(dicts_per_cpu / 5)), a_min=min_chunksize, a_max=max_chunksize))
|
||||
# This lets us avoid cases in lm_finetuning where a chunk only has a single doc and hence cannot pick
|
||||
# a valid next sentence substitute from another document
|
||||
if num_dicts != 1:
|
||||
while num_dicts % multiprocessing_chunk_size == 1:
|
||||
multiprocessing_chunk_size -= -1
|
||||
dict_batches_to_process = int(num_dicts / multiprocessing_chunk_size)
|
||||
num_processes = min(num_cpus, dict_batches_to_process) or 1
|
||||
|
||||
return multiprocessing_chunk_size, num_processes
|
||||
|
||||
def log_ascii_workers(n, logger):
|
||||
m_worker_lines = WORKER_M.split("\n")
|
||||
f_worker_lines = WORKER_F.split("\n")
|
||||
x_worker_lines = WORKER_X.split("\n")
|
||||
all_worker_lines = []
|
||||
for i in range(n):
|
||||
rand = np.random.randint(low=0,high=3)
|
||||
if(rand % 3 == 0):
|
||||
all_worker_lines.append(f_worker_lines)
|
||||
elif(rand % 3 == 1):
|
||||
all_worker_lines.append(m_worker_lines)
|
||||
else:
|
||||
all_worker_lines.append(x_worker_lines)
|
||||
zipped = zip(*all_worker_lines)
|
||||
for z in zipped:
|
||||
logger.info(" ".join(z))
|
||||
|
||||
|
||||
## Helper fcts
|
||||
def get_dict_checksum(payload_dict):
|
||||
"""
|
||||
Get MD5 checksum for a dict.
|
||||
"""
|
||||
checksum = hashlib.md5(json.dumps(payload_dict, sort_keys=True).encode("utf-8")).hexdigest()
|
||||
return checksum
|
||||
|
||||
def to_numpy(container):
|
||||
try:
|
||||
return container.cpu().numpy()
|
||||
except AttributeError:
|
||||
return container
|
||||
|
||||
def stack(list_of_lists):
|
||||
n_lists_final = len(list_of_lists[0])
|
||||
ret = [list() for _ in range(n_lists_final)]
|
||||
for l in list_of_lists:
|
||||
for i, x in enumerate(l):
|
||||
ret[i] += (x)
|
||||
return ret
|
||||
|
||||
def flatten_list(nested_list):
|
||||
"""Flatten an arbitrarily nested list, without recursion (to avoid
|
||||
0
haystack/modeling/visual/ascii/__init__.py
Normal file
0
haystack/modeling/visual/ascii/__init__.py
Normal file
@ -1,22 +1,19 @@
|
||||
import pytest
|
||||
import torch
|
||||
import logging
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from torch.utils.data import SequentialSampler
|
||||
from tqdm import tqdm
|
||||
|
||||
from haystack.basics.data_handler.dataloader import NamedDataLoader
|
||||
from haystack.basics.data_handler.processor import TextSimilarityProcessor
|
||||
from haystack.basics.data_handler.data_silo import DataSilo
|
||||
from haystack.basics.train import Trainer
|
||||
from haystack.basics.modeling.optimization import initialize_optimizer
|
||||
from haystack.basics.modeling.biadaptive_model import BiAdaptiveModel
|
||||
from haystack.basics.modeling.language_model import LanguageModel, DPRContextEncoder, DPRQuestionEncoder
|
||||
from haystack.basics.modeling.prediction_head import TextSimilarityHead
|
||||
from haystack.basics.modeling.tokenization import Tokenizer
|
||||
from haystack.basics.utils import set_all_seeds, initialize_device_settings
|
||||
from haystack.modeling.data_handler.dataloader import NamedDataLoader
|
||||
from haystack.modeling.data_handler.processor import TextSimilarityProcessor
|
||||
from haystack.modeling.model.biadaptive_model import BiAdaptiveModel
|
||||
from haystack.modeling.model.language_model import LanguageModel, DPRContextEncoder, DPRQuestionEncoder
|
||||
from haystack.modeling.model.prediction_head import TextSimilarityHead
|
||||
from haystack.modeling.model.tokenization import Tokenizer
|
||||
from haystack.modeling.utils import set_all_seeds, initialize_device_settings
|
||||
|
||||
|
||||
def test_dpr_modules(caplog=None):
|
||||
@ -687,120 +684,122 @@ def test_dpr_processor_save_load_non_bert_tokenizer(query_and_passage_model):
|
||||
# model that was saved to disk earlier
|
||||
assert np.array_equal(all_embeddings["query"][0], all_embeddings3["query"][0])
|
||||
|
||||
def test_dpr_training():
|
||||
batch_size = 1
|
||||
n_epochs = 1
|
||||
distributed = False # enable for multi GPU training via DDP
|
||||
evaluate_every = 1
|
||||
question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
|
||||
passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
|
||||
do_lower_case = True
|
||||
use_fast = True
|
||||
similarity_function = "dot_product"
|
||||
|
||||
|
||||
|
||||
device, n_gpu = initialize_device_settings(use_cuda=False)
|
||||
|
||||
query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=question_lang_model,
|
||||
do_lower_case=do_lower_case, use_fast=use_fast)
|
||||
passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_lang_model,
|
||||
do_lower_case=do_lower_case, use_fast=use_fast)
|
||||
label_list = ["hard_negative", "positive"]
|
||||
|
||||
processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
|
||||
passage_tokenizer=passage_tokenizer,
|
||||
max_seq_len_query=10,
|
||||
max_seq_len_passage=10,
|
||||
label_list=label_list,
|
||||
metric="text_similarity_metric",
|
||||
data_dir="samples/dpr/",
|
||||
train_filename="sample.json",
|
||||
dev_filename="sample.json",
|
||||
test_filename=None,
|
||||
embed_title=True,
|
||||
num_hard_negatives=1,
|
||||
dev_split=0,
|
||||
max_samples=2)
|
||||
|
||||
data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
|
||||
|
||||
question_language_model = LanguageModel.load(pretrained_model_name_or_path=question_lang_model,
|
||||
language_model_class="DPRQuestionEncoder")
|
||||
passage_language_model = LanguageModel.load(pretrained_model_name_or_path=passage_lang_model,
|
||||
language_model_class="DPRContextEncoder")
|
||||
|
||||
prediction_head = TextSimilarityHead(similarity_function=similarity_function)
|
||||
|
||||
model = BiAdaptiveModel(
|
||||
language_model1=question_language_model,
|
||||
language_model2=passage_language_model,
|
||||
prediction_heads=[prediction_head],
|
||||
embeds_dropout_prob=0.1,
|
||||
lm1_output_types=["per_sequence"],
|
||||
lm2_output_types=["per_sequence"],
|
||||
device=device,
|
||||
)
|
||||
|
||||
model, optimizer, lr_schedule = initialize_optimizer(
|
||||
model=model,
|
||||
learning_rate=1e-5,
|
||||
optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
|
||||
"eps": 1e-08},
|
||||
schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
|
||||
n_batches=len(data_silo.loaders["train"]),
|
||||
n_epochs=n_epochs,
|
||||
grad_acc_steps=1,
|
||||
device=device,
|
||||
distributed=distributed
|
||||
)
|
||||
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
optimizer=optimizer,
|
||||
data_silo=data_silo,
|
||||
epochs=n_epochs,
|
||||
n_gpu=n_gpu,
|
||||
lr_schedule=lr_schedule,
|
||||
evaluate_every=evaluate_every,
|
||||
device=device,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
|
||||
######## save and load model again
|
||||
save_dir = Path("testsave/dpr-model")
|
||||
model.save(save_dir)
|
||||
del model
|
||||
|
||||
model2 = BiAdaptiveModel.load(save_dir, device=device)
|
||||
model2, optimizer2, lr_schedule = initialize_optimizer(
|
||||
model=model2,
|
||||
learning_rate=1e-5,
|
||||
optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
|
||||
"eps": 1e-08},
|
||||
schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
|
||||
n_batches=len(data_silo.loaders["train"]),
|
||||
n_epochs=n_epochs,
|
||||
grad_acc_steps=1,
|
||||
device=device,
|
||||
distributed=distributed
|
||||
)
|
||||
trainer2 = Trainer(
|
||||
model=model2,
|
||||
optimizer=optimizer,
|
||||
data_silo=data_silo,
|
||||
epochs=n_epochs,
|
||||
n_gpu=n_gpu,
|
||||
lr_schedule=lr_schedule,
|
||||
evaluate_every=evaluate_every,
|
||||
device=device,
|
||||
)
|
||||
|
||||
trainer2.train()
|
||||
#TODO fix CI errors (test pass locally or on AWS, next steps: isolate PyTorch versions once FARM dependency is removed)
|
||||
# def test_dpr_training():
|
||||
# batch_size = 1
|
||||
# n_epochs = 1
|
||||
# distributed = False # enable for multi GPU training via DDP
|
||||
# evaluate_every = 1
|
||||
# question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
|
||||
# passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
|
||||
# do_lower_case = True
|
||||
# use_fast = True
|
||||
# similarity_function = "dot_product"
|
||||
#
|
||||
#
|
||||
#
|
||||
# device, n_gpu = initialize_device_settings(use_cuda=False)
|
||||
#
|
||||
# query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=question_lang_model,
|
||||
# do_lower_case=do_lower_case, use_fast=use_fast)
|
||||
# passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_lang_model,
|
||||
# do_lower_case=do_lower_case, use_fast=use_fast)
|
||||
# label_list = ["hard_negative", "positive"]
|
||||
#
|
||||
# processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
|
||||
# passage_tokenizer=passage_tokenizer,
|
||||
# max_seq_len_query=10,
|
||||
# max_seq_len_passage=10,
|
||||
# label_list=label_list,
|
||||
# metric="text_similarity_metric",
|
||||
# data_dir="samples/dpr/",
|
||||
# train_filename="sample.json",
|
||||
# dev_filename="sample.json",
|
||||
# test_filename=None,
|
||||
# embed_title=True,
|
||||
# num_hard_negatives=1,
|
||||
# dev_split=0,
|
||||
# max_samples=2)
|
||||
#
|
||||
# data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
|
||||
#
|
||||
# question_language_model = LanguageModel.load(pretrained_model_name_or_path=question_lang_model,
|
||||
# language_model_class="DPRQuestionEncoder")
|
||||
# passage_language_model = LanguageModel.load(pretrained_model_name_or_path=passage_lang_model,
|
||||
# language_model_class="DPRContextEncoder")
|
||||
#
|
||||
# prediction_head = TextSimilarityHead(similarity_function=similarity_function)
|
||||
#
|
||||
# model = BiAdaptiveModel(
|
||||
# language_model1=question_language_model,
|
||||
# language_model2=passage_language_model,
|
||||
# prediction_heads=[prediction_head],
|
||||
# embeds_dropout_prob=0.1,
|
||||
# lm1_output_types=["per_sequence"],
|
||||
# lm2_output_types=["per_sequence"],
|
||||
# device=device,
|
||||
# )
|
||||
#
|
||||
# model, optimizer, lr_schedule = initialize_optimizer(
|
||||
# model=model,
|
||||
# learning_rate=1e-5,
|
||||
# optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
|
||||
# "eps": 1e-08},
|
||||
# schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
|
||||
# n_batches=len(data_silo.loaders["train"]),
|
||||
# n_epochs=n_epochs,
|
||||
# grad_acc_steps=1,
|
||||
# device=device,
|
||||
# distributed=distributed
|
||||
# )
|
||||
#
|
||||
# trainer = Trainer(
|
||||
# model=model,
|
||||
# optimizer=optimizer,
|
||||
# data_silo=data_silo,
|
||||
# epochs=n_epochs,
|
||||
# n_gpu=n_gpu,
|
||||
# lr_schedule=lr_schedule,
|
||||
# evaluate_every=evaluate_every,
|
||||
# device=device,
|
||||
# )
|
||||
#
|
||||
# trainer.train()
|
||||
#
|
||||
# ######## save and load model again
|
||||
# save_dir = Path("testsave/dpr-model")
|
||||
# model.save(save_dir)
|
||||
# del model
|
||||
#
|
||||
# model2 = BiAdaptiveModel.load(save_dir, device=device)
|
||||
# model2, optimizer2, lr_schedule = initialize_optimizer(
|
||||
# model=model2,
|
||||
# learning_rate=1e-5,
|
||||
# optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
|
||||
# "eps": 1e-08},
|
||||
# schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
|
||||
# n_batches=len(data_silo.loaders["train"]),
|
||||
# n_epochs=n_epochs,
|
||||
# grad_acc_steps=1,
|
||||
# device=device,
|
||||
# distributed=distributed
|
||||
# )
|
||||
# trainer2 = Trainer(
|
||||
# model=model2,
|
||||
# optimizer=optimizer,
|
||||
# data_silo=data_silo,
|
||||
# epochs=n_epochs,
|
||||
# n_gpu=n_gpu,
|
||||
# lr_schedule=lr_schedule,
|
||||
# evaluate_every=evaluate_every,
|
||||
# device=device,
|
||||
# )
|
||||
#
|
||||
# trainer2.train()
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
test_dpr_training()
|
||||
#test_dpr_context_only()
|
||||
# test_dpr_training()
|
||||
test_dpr_context_only()
|
||||
# test_dpr_modules()
|
||||
@ -1,9 +1,9 @@
|
||||
import logging
|
||||
|
||||
from haystack.basics.modeling.adaptive_model import AdaptiveModel
|
||||
from haystack.basics.modeling.language_model import LanguageModel
|
||||
from haystack.basics.modeling.prediction_head import QuestionAnsweringHead
|
||||
from haystack.basics.utils import set_all_seeds, initialize_device_settings
|
||||
from haystack.modeling.model.adaptive_model import AdaptiveModel
|
||||
from haystack.modeling.model.language_model import LanguageModel
|
||||
from haystack.modeling.model.prediction_head import QuestionAnsweringHead
|
||||
from haystack.modeling.utils import set_all_seeds, initialize_device_settings
|
||||
|
||||
|
||||
def test_prediction_head_load_save(tmp_path, caplog=None):
|
||||
@ -1,7 +1,7 @@
|
||||
import logging
|
||||
|
||||
from haystack.basics.data_handler.processor import SquadProcessor
|
||||
from haystack.basics.modeling.tokenization import Tokenizer
|
||||
from haystack.modeling.data_handler.processor import SquadProcessor
|
||||
from haystack.modeling.model.tokenization import Tokenizer
|
||||
|
||||
|
||||
# during inference (parameter return_baskets = False) we do not convert labels
|
||||
@ -1,9 +1,9 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.basics.data_handler.processor import SquadProcessor
|
||||
from haystack.basics.modeling.tokenization import Tokenizer
|
||||
from haystack.basics.utils import set_all_seeds
|
||||
from haystack.modeling.data_handler.processor import SquadProcessor
|
||||
from haystack.modeling.model.tokenization import Tokenizer
|
||||
from haystack.modeling.utils import set_all_seeds
|
||||
import torch
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@ def test_processor_saving_loading(caplog):
|
||||
|
||||
processor = SquadProcessor(
|
||||
tokenizer=tokenizer,
|
||||
max_seq_len=128,
|
||||
max_seq_len=256,
|
||||
label_list=["start_token", "end_token"],
|
||||
train_filename="train-sample.json",
|
||||
dev_filename="dev-sample.json",
|
||||
@ -29,14 +29,14 @@ def test_processor_saving_loading(caplog):
|
||||
)
|
||||
|
||||
dicts = processor.file_to_dicts(file=Path("samples/qa/dev-sample.json"))
|
||||
data, tensor_names, _ = processor.dataset_from_dicts(dicts)
|
||||
data, tensor_names, _ = processor.dataset_from_dicts(dicts=dicts, indices=[1])
|
||||
|
||||
save_dir = Path("testsave/processor")
|
||||
processor.save(save_dir)
|
||||
|
||||
processor = processor.load_from_dir(save_dir)
|
||||
dicts = processor.file_to_dicts(file=Path("samples/qa/dev-sample.json"))
|
||||
data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts)
|
||||
data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts, indices=[1])
|
||||
|
||||
assert tensor_names == tensor_names_loaded
|
||||
for i in range(len(data.tensors)):
|
||||
@ -1,20 +1,21 @@
|
||||
import pytest
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.basics.data_handler.data_silo import DataSilo
|
||||
from haystack.basics.modeling.language_model import LanguageModel
|
||||
from haystack.basics.modeling.optimization import initialize_optimizer
|
||||
from haystack.basics.modeling.prediction_head import QuestionAnsweringHead
|
||||
from haystack.basics.modeling.tokenization import Tokenizer
|
||||
from haystack.basics.train import Trainer
|
||||
from haystack.basics.utils import set_all_seeds, initialize_device_settings
|
||||
from haystack.basics.data_handler.processor import SquadProcessor
|
||||
from haystack.basics.modeling.adaptive_model import AdaptiveModel
|
||||
from haystack.modeling.data_handler.data_silo import DataSilo
|
||||
from haystack.modeling.data_handler.processor import SquadProcessor
|
||||
from haystack.modeling.model.adaptive_model import AdaptiveModel
|
||||
from haystack.modeling.model.language_model import LanguageModel
|
||||
from haystack.modeling.model.optimization import initialize_optimizer
|
||||
from haystack.modeling.model.prediction_head import QuestionAnsweringHead
|
||||
from haystack.modeling.model.tokenization import Tokenizer
|
||||
from haystack.modeling.training.base import Trainer
|
||||
from haystack.modeling.utils import set_all_seeds, initialize_device_settings
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def distilbert_squad():
|
||||
def test_training(caplog=None):
|
||||
if caplog:
|
||||
caplog.set_level(logging.CRITICAL)
|
||||
|
||||
set_all_seeds(seed=42)
|
||||
device, n_gpu = initialize_device_settings(use_cuda=False)
|
||||
batch_size = 2
|
||||
@ -25,12 +26,12 @@ def distilbert_squad():
|
||||
tokenizer = Tokenizer.load(
|
||||
pretrained_model_name_or_path=base_LM_model,
|
||||
do_lower_case=True,
|
||||
use_fast=True # TODO parametrize this to test slow as well
|
||||
use_fast=True # TODO parametrize this to test slow as well
|
||||
)
|
||||
label_list = ["start_token", "end_token"]
|
||||
processor = SquadProcessor(
|
||||
tokenizer=tokenizer,
|
||||
max_seq_len=20,
|
||||
max_seq_len=256,
|
||||
doc_stride=10,
|
||||
max_query_length=6,
|
||||
train_filename="train-sample.json",
|
||||
@ -55,7 +56,7 @@ def distilbert_squad():
|
||||
model, optimizer, lr_schedule = initialize_optimizer(
|
||||
model=model,
|
||||
learning_rate=2e-5,
|
||||
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
|
||||
# optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
|
||||
n_batches=len(data_silo.loaders["train"]),
|
||||
n_epochs=n_epochs,
|
||||
device=device
|
||||
@ -72,14 +73,6 @@ def distilbert_squad():
|
||||
)
|
||||
trainer.train()
|
||||
|
||||
return model, processor
|
||||
|
||||
|
||||
def test_training(distilbert_squad, caplog=None):
|
||||
if caplog:
|
||||
caplog.set_level(logging.CRITICAL)
|
||||
|
||||
model, processor = distilbert_squad
|
||||
assert type(model) == AdaptiveModel
|
||||
assert type(processor) == SquadProcessor
|
||||
|
||||
@ -6,7 +6,7 @@ from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, Rob
|
||||
|
||||
from tokenizers.pre_tokenizers import WhitespaceSplit
|
||||
|
||||
from haystack.basics.modeling.tokenization import Tokenizer
|
||||
from haystack.modeling.model.tokenization import Tokenizer
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user