Fix tests and adjust folder structure

* Add type annotations in QuestionAnsweringHead

* Fix test by increasing max_seq_len

* Add SampleBasket type annotation

* Remove prediction head param from adaptive model init

* Add type ignore for AdaptiveModel init

* Fix and rename tests

* Adjust folder structure

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Timo Moeller 2021-09-13 18:38:14 +02:00 committed by GitHub
parent e8a6427b9e
commit 537204e8c9
36 changed files with 565 additions and 624 deletions

View File

@ -1,247 +0,0 @@
import json
import logging
import os
import random
import tarfile
import tempfile
import uuid
from itertools import islice
from pathlib import Path
import requests
from tqdm import tqdm
logger = logging.getLogger(__name__)
DOWNSTREAM_TASK_MAP = {
"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
"covidqa": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/covidqa.tar.gz",
}
def read_dpr_json(file, max_samples=None, proxies=None, num_hard_negatives=1, num_positives=1, shuffle_negatives=True, shuffle_positives=False):
"""
Reads a Dense Passage Retrieval (DPR) data file in json format and returns a list of dictionaries.
:param file: filename of DPR data in json format
Returns:
list of dictionaries: List[dict]
each dictionary: {
"query": str -> query_text
"passages": List[dictionaries] -> [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
...]
}
example:
["query": 'who sings does he love me with reba'
"passages" : [{'title': 'Does He Love You',
'text': 'Does He Love You "Does He Love You" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba\'s album "Greatest Hits Volume Two". It is one of country music\'s several songs about a love triangle. "Does He Love You" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members',
'label': 'positive',
'external_id': '11828866'},
{'title': 'When the Nightingale Sings',
'text': "When the Nightingale Sings When The Nightingale Sings is a Middle English poem, author unknown, recorded in the British Library's Harley 2253 manuscript, verse 25. It is a love poem, extolling the beauty and lost love of an unknown maiden. When þe nyhtegale singes þe wodes waxen grene.<br> Lef ant gras ant blosme springes in aueryl y wene,<br> Ant love is to myn herte gon wiþ one spere so kene<br> Nyht ant day my blod hit drynkes myn herte deþ me tene. Ich have loved al þis er þat y may love namore,<br> Ich have siked moni syk lemmon for",
'label': 'hard_negative',
'external_id': '10891637'}]
}]
"""
# get remote dataset if needed
if not (os.path.exists(file)):
logger.info(f" Couldn't find {file} locally. Trying to download ...")
_download_extract_downstream_data(file, proxies=proxies)
if file.suffix.lower() == ".jsonl":
dicts = []
with open(file, encoding='utf-8') as f:
for line in f:
dicts.append(json.loads(line))
else:
dicts = json.load(open(file, encoding='utf-8'))
if max_samples:
dicts = random.sample(dicts, min(max_samples, len(dicts)))
# convert DPR dictionary to standard dictionary
query_json_keys = ["question", "questions", "query"]
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
hard_negative_json_keys = ["hard_negative_contexts", "hard_negative_ctxs", "hard_negative_context", "hard_negative_ctx"]
standard_dicts = []
for dict in dicts:
sample = {}
passages = []
for key, val in dict.items():
if key in query_json_keys:
sample["query"] = val
elif key in positive_context_json_keys:
if shuffle_positives:
random.shuffle(val)
for passage in val[:num_positives]:
passages.append({
"title": passage["title"],
"text": passage["text"],
"label": "positive",
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
})
elif key in hard_negative_json_keys:
if shuffle_negatives:
random.shuffle(val)
for passage in val[:num_hard_negatives]:
passages.append({
"title": passage["title"],
"text": passage["text"],
"label": "hard_negative",
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
})
sample["passages"] = passages
standard_dicts.append(sample)
return standard_dicts
def read_squad_file(filename, proxies=None):
"""Read a SQuAD json file"""
if not (os.path.exists(filename)):
logger.info(f" Couldn't find {filename} locally. Trying to download ...")
_download_extract_downstream_data(filename, proxies)
with open(filename, "r", encoding="utf-8") as reader:
input_data = json.load(reader)["data"]
return input_data
def write_squad_predictions(predictions, out_filename, predictions_filename=None):
predictions_json = {}
for x in predictions:
for p in x["predictions"]:
if p["answers"][0]["answer"] is not None:
predictions_json[p["question_id"]] = p["answers"][0]["answer"]
else:
predictions_json[p["question_id"]] = "" #convert No answer = None to format understood by the SQuAD eval script
if predictions_filename:
dev_labels = {}
temp = json.load(open(predictions_filename, "r"))
for d in temp["data"]:
for p in d["paragraphs"]:
for q in p["qas"]:
if q.get("is_impossible",False):
dev_labels[q["id"]] = "is_impossible"
else:
dev_labels[q["id"]] = q["answers"][0]["text"]
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
if len(not_included) > 0:
logger.info(f"There were missing predicitons for question ids: {list(not_included)}")
for x in not_included:
predictions_json[x] = ""
# os.makedirs("model_output", exist_ok=True)
# filepath = Path("model_output") / out_filename
json.dump(predictions_json, open(out_filename, "w"))
logger.info(f"Written Squad predictions to: {out_filename}")
def http_get(url, temp_file, proxies=None):
req = requests.get(url, stream=True, proxies=proxies)
content_length = req.headers.get("Content-Length")
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def grouper(iterable, n, worker_id=0, total_workers=1):
"""
Split an iterable into a list of n-sized chunks. Each element in the chunk is a tuple of (index_num, element).
Example:
list(grouper('ABCDEFG', 3))
[[(0, 'A'), (1, 'B'), (2, 'C')], [(3, 'D'), (4, 'E'), (5, 'F')], [(6, 'G')]]
Use with the StreamingDataSilo
When StreamingDataSilo is used with multiple PyTorch DataLoader workers, the generator
yielding dicts (that get converted to datasets) is replicated across the workers.
To avoid duplicates, we split the dicts across workers by creating a new generator for
each worker using this method.
Input --> [dictA, dictB, dictC, dictD, dictE, ...] with total worker=3 and n=2
Output for worker 1: [(dictA, dictB), (dictG, dictH), ...]
Output for worker 2: [(dictC, dictD), (dictI, dictJ), ...]
Output for worker 3: [(dictE, dictF), (dictK, dictL), ...]
This method also adds an index number to every dict yielded.
:param iterable: a generator object that yields dicts
:type iterable: generator
:param n: the dicts are grouped in n-sized chunks that get converted to datasets
:type n: int
:param worker_id: the worker_id for the PyTorch DataLoader
:type worker_id: int
:param total_workers: total number of workers for the PyTorch DataLoader
:type total_workers: int
"""
# TODO make me comprehensible :)
def get_iter_start_pos(gen):
start_pos = worker_id * n
for i in gen:
if start_pos:
start_pos -= 1
continue
yield i
def filter_elements_per_worker(gen):
x = n
y = (total_workers - 1) * n
for i in gen:
if x:
yield i
x -= 1
else:
if y != 1:
y -= 1
continue
else:
x = n
y = (total_workers - 1) * n
iterable = iter(enumerate(iterable))
iterable = get_iter_start_pos(iterable)
if total_workers > 1:
iterable = filter_elements_per_worker(iterable)
return iter(lambda: list(islice(iterable, n)), [])
def _download_extract_downstream_data(input_file, proxies=None):
# download archive to temp dir and extract to correct position
full_path = Path(os.path.realpath(input_file))
directory = full_path.parent
taskname = directory.stem
datadir = directory.parent
logger.info(
"downloading and extracting file {} to dir {}".format(taskname, datadir)
)
if taskname not in DOWNSTREAM_TASK_MAP:
logger.error("Cannot download {}. Unknown data source.".format(taskname))
else:
if os.name == "nt": # make use of NamedTemporaryFile compatible with Windows
delete_tmp_file = False
else:
delete_tmp_file = True
with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
temp_file.flush()
temp_file.seek(0) # making tempfile accessible
tfile = tarfile.open(temp_file.name)
tfile.extractall(datadir)
# temp_file gets deleted here
def is_json(x):
if issubclass(type(x), Path):
return True
try:
json.dumps(x)
return True
except:
return False

View File

@ -1,11 +1,10 @@
import logging
from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelWithLMHead, \
AutoModelForTokenClassification
from transformers import AutoModelForQuestionAnswering
from haystack.basics.modeling import adaptive_model as am
from haystack.basics.modeling.language_model import LanguageModel
from haystack.basics.modeling.prediction_head import QuestionAnsweringHead
from haystack.modeling.model import adaptive_model as am
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import QuestionAnsweringHead
logger = logging.getLogger(__name__)

View File

@ -1,26 +1,26 @@
import hashlib
import json
import logging
import torch.multiprocessing as mp
import random
from contextlib import ExitStack
from functools import partial
import random
from pathlib import Path
from itertools import groupby
from itertools import islice
from pathlib import Path
from typing import Optional, List, Tuple, Dict, Union
import numpy as np
import torch
import torch.multiprocessing as mp
from torch.utils.data import ConcatDataset, Dataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
import torch
from tqdm import tqdm
from haystack.basics.data_handler.dataloader import NamedDataLoader
from haystack.basics.data_handler.processor import Processor
from haystack.basics.data_handler.utils import grouper
from haystack.basics.utils import MLFlowLogger as MlLogger
from haystack.basics.utils import log_ascii_workers, calc_chunksize
from haystack.basics.utils import get_dict_checksum
from haystack.basics.visual.ascii.images import TRACTOR_SMALL
from haystack.modeling.data_handler.dataloader import NamedDataLoader
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.utils import MLFlowLogger as MlLogger
from haystack.modeling.visual.ascii.images import TRACTOR_SMALL, WORKER_F, WORKER_M, WORKER_X
logger = logging.getLogger(__name__)
@ -713,4 +713,117 @@ class DataSiloForCrossVal:
current_test_set = split
current_train_set = np.hstack(np.delete(splits, idx, axis=0))
yield current_train_set, current_test_set
yield current_train_set, current_test_set
def calc_chunksize(num_dicts, min_chunksize=4, max_chunksize=2000, max_processes=128):
if mp.cpu_count() > 3:
num_cpus = min(mp.cpu_count() - 1 or 1, max_processes) # -1 to keep a CPU core free for xxx
else:
num_cpus = min(mp.cpu_count(), max_processes) # when there are few cores, we use all of them
dicts_per_cpu = np.ceil(num_dicts / num_cpus)
# automatic adjustment of multiprocessing chunksize
# for small files (containing few dicts) we want small chunksize to utilize all available cores but never less
# than 2, because we need it to sample another random sentence in LM finetuning
# for large files we want to minimize processor spawning without giving too much data to one process, so we
# clip it at 5k
multiprocessing_chunk_size = int(np.clip((np.ceil(dicts_per_cpu / 5)), a_min=min_chunksize, a_max=max_chunksize))
# This lets us avoid cases in lm_finetuning where a chunk only has a single doc and hence cannot pick
# a valid next sentence substitute from another document
if num_dicts != 1:
while num_dicts % multiprocessing_chunk_size == 1:
multiprocessing_chunk_size -= -1
dict_batches_to_process = int(num_dicts / multiprocessing_chunk_size)
num_processes = min(num_cpus, dict_batches_to_process) or 1
return multiprocessing_chunk_size, num_processes
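# Illustrative only (not part of this commit): how the sizing above plays out on a
# hypothetical machine where mp.cpu_count() returns 9 (so num_cpus = 8):
#   dicts_per_cpu = ceil(10_000 / 8) = 1250
#   chunk size    = clip(ceil(1250 / 5), 4, 2000) = 250
#   processes     = min(8, 10_000 // 250) = 8
example_chunk_size, example_num_processes = calc_chunksize(num_dicts=10_000)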
def log_ascii_workers(n, logger):
m_worker_lines = WORKER_M.split("\n")
f_worker_lines = WORKER_F.split("\n")
x_worker_lines = WORKER_X.split("\n")
all_worker_lines = []
for i in range(n):
rand = np.random.randint(low=0,high=3)
if(rand % 3 == 0):
all_worker_lines.append(f_worker_lines)
elif(rand % 3 == 1):
all_worker_lines.append(m_worker_lines)
else:
all_worker_lines.append(x_worker_lines)
zipped = zip(*all_worker_lines)
for z in zipped:
logger.info(" ".join(z))
def get_dict_checksum(payload_dict):
"""
Get MD5 checksum for a dict.
"""
checksum = hashlib.md5(json.dumps(payload_dict, sort_keys=True).encode("utf-8")).hexdigest()
return checksum
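# Illustrative only (not part of this commit): since json.dumps is called with
# sort_keys=True, key order does not change the checksum.
assert get_dict_checksum({"a": 1, "b": 2}) == get_dict_checksum({"b": 2, "a": 1})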
def grouper(iterable, n, worker_id=0, total_workers=1):
"""
Split an iterable into a list of n-sized chunks. Each element in the chunk is a tuple of (index_num, element).
Example:
list(grouper('ABCDEFG', 3))
[[(0, 'A'), (1, 'B'), (2, 'C')], [(3, 'D'), (4, 'E'), (5, 'F')], [(6, 'G')]]
Use with the StreamingDataSilo
When StreamingDataSilo is used with multiple PyTorch DataLoader workers, the generator
yielding dicts (that get converted to datasets) is replicated across the workers.
To avoid duplicates, we split the dicts across workers by creating a new generator for
each worker using this method.
Input --> [dictA, dictB, dictC, dictD, dictE, ...] with total worker=3 and n=2
Output for worker 1: [(dictA, dictB), (dictG, dictH), ...]
Output for worker 2: [(dictC, dictD), (dictI, dictJ), ...]
Output for worker 3: [(dictE, dictF), (dictK, dictL), ...]
This method also adds an index number to every dict yielded.
:param iterable: a generator object that yields dicts
:type iterable: generator
:param n: the dicts are grouped in n-sized chunks that get converted to datasets
:type n: int
:param worker_id: the worker_id for the PyTorch DataLoader
:type worker_id: int
:param total_workers: total number of workers for the PyTorch DataLoader
:type total_workers: int
"""
# TODO make me comprehensible :)
def get_iter_start_pos(gen):
start_pos = worker_id * n
for i in gen:
if start_pos:
start_pos -= 1
continue
yield i
def filter_elements_per_worker(gen):
x = n
y = (total_workers - 1) * n
for i in gen:
if x:
yield i
x -= 1
else:
if y != 1:
y -= 1
continue
else:
x = n
y = (total_workers - 1) * n
iterable = iter(enumerate(iterable))
iterable = get_iter_start_pos(iterable)
if total_workers > 1:
iterable = filter_elements_per_worker(iterable)
return iter(lambda: list(islice(iterable, n)), [])
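# Illustrative only (not part of this commit): splitting 12 hypothetical dicts across
# 3 workers with chunks of size n=2; worker 0 receives the chunks starting at 0, 6, ...
example_dicts = [{"id": i} for i in range(12)]
worker_0_chunks = list(grouper(example_dicts, n=2, worker_id=0, total_workers=3))
# -> [[(0, {'id': 0}), (1, {'id': 1})], [(6, {'id': 6}), (7, {'id': 7})]]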

View File

@ -1,7 +1,7 @@
from math import ceil
from torch.utils.data import DataLoader, Dataset, Sampler
import torch
from torch.utils.data import DataLoader, Dataset, Sampler
class NamedDataLoader(DataLoader):

View File

@ -6,7 +6,7 @@ import numpy as np
import torch
from torch.utils.data import Dataset, ConcatDataset, TensorDataset
from haystack.basics.utils import flatten_list
from haystack.modeling.utils import flatten_list
logger = logging.getLogger(__name__)

View File

@ -3,31 +3,36 @@ import json
import logging
import os
import random
import tarfile
import tempfile
import uuid
import requests
from tqdm import tqdm
from abc import ABC, abstractmethod
from inspect import signature
from pathlib import Path
from typing import Optional, Dict, List, Union
import numpy as np
from haystack.basics.modeling.tokenization import (
from haystack.modeling.model.tokenization import (
Tokenizer,
tokenize_batch_question_answering
)
from haystack.basics.data_handler.dataset import convert_features_to_dataset
from haystack.basics.data_handler.samples import (
from haystack.modeling.data_handler.dataset import convert_features_to_dataset
from haystack.modeling.data_handler.samples import (
Sample,
SampleBasket,
get_passage_offsets,
offset_to_token_idx_vecorized,
)
from haystack.basics.data_handler.utils import (
read_squad_file,
read_dpr_json,
is_json,
)
from haystack.basics.utils import MLFlowLogger as MlLogger
from haystack.modeling.utils import MLFlowLogger as MlLogger
DOWNSTREAM_TASK_MAP = {
"squad20": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/squad20.tar.gz",
"covidqa": "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/covidqa.tar.gz",
}
logger = logging.getLogger(__name__)
@ -245,7 +250,7 @@ class Processor(ABC):
config = {}
# self.__dict__ doesn't give parent class attributes
for key, value in inspect.getmembers(self):
if is_json(value) and key[0] != "_":
if _is_json(value) and key[0] != "_":
if issubclass(type(value), Path):
value = str(value)
config[key] = value
@ -451,7 +456,7 @@ class SquadProcessor(Processor):
def file_to_dicts(self, file: str) -> List[dict]:
nested_dicts = read_squad_file(filename=file)
nested_dicts = _read_squad_file(filename=file)
dicts = [y for x in nested_dicts for y in x["paragraphs"]]
return dicts
@ -985,7 +990,7 @@ class TextSimilarityProcessor(Processor):
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
...]}
"""
dicts = read_dpr_json(file, max_samples=self.max_samples, num_hard_negatives=self.num_hard_negatives, num_positives=self.num_positives, shuffle_negatives=self.shuffle_negatives, shuffle_positives=self.shuffle_positives)
dicts = _read_dpr_json(file, max_samples=self.max_samples, num_hard_negatives=self.num_hard_negatives, num_positives=self.num_positives, shuffle_negatives=self.shuffle_negatives, shuffle_positives=self.shuffle_positives)
# shuffle dicts to make sure that similar positive passages do not end up in one batch
dicts = random.sample(dicts, len(dicts))
@ -1147,3 +1152,167 @@ class TextSimilarityProcessor(Processor):
f"Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '{ctx}' ")
res.append(tuple((title, ctx)))
return res
# helper fcts
def write_squad_predictions(predictions, out_filename, predictions_filename=None):
predictions_json = {}
for x in predictions:
for p in x["predictions"]:
if p["answers"][0]["answer"] is not None:
predictions_json[p["question_id"]] = p["answers"][0]["answer"]
else:
predictions_json[p["question_id"]] = "" #convert No answer = None to format understood by the SQuAD eval script
if predictions_filename:
dev_labels = {}
temp = json.load(open(predictions_filename, "r"))
for d in temp["data"]:
for p in d["paragraphs"]:
for q in p["qas"]:
if q.get("is_impossible",False):
dev_labels[q["id"]] = "is_impossible"
else:
dev_labels[q["id"]] = q["answers"][0]["text"]
not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
if len(not_included) > 0:
logger.info(f"There were missing predicitons for question ids: {list(not_included)}")
for x in not_included:
predictions_json[x] = ""
# os.makedirs("model_output", exist_ok=True)
# filepath = Path("model_output") / out_filename
json.dump(predictions_json, open(out_filename, "w"))
logger.info(f"Written Squad predictions to: {out_filename}")
def _read_dpr_json(file, max_samples=None, proxies=None, num_hard_negatives=1, num_positives=1, shuffle_negatives=True, shuffle_positives=False):
"""
Reads a Dense Passage Retrieval (DPR) data file in json format and returns a list of dictionaries.
:param file: filename of DPR data in json format
Returns:
list of dictionaries: List[dict]
each dictionary: {
"query": str -> query_text
"passages": List[dictionaries] -> [{"text": document_text, "title": xxx, "label": "positive", "external_id": abb123},
{"text": document_text, "title": xxx, "label": "hard_negative", "external_id": abb134},
...]
}
example:
["query": 'who sings does he love me with reba'
"passages" : [{'title': 'Does He Love You',
'text': 'Does He Love You "Does He Love You" is a song written by Sandy Knox and Billy Stritch, and recorded as a duet by American country music artists Reba McEntire and Linda Davis. It was released in August 1993 as the first single from Reba\'s album "Greatest Hits Volume Two". It is one of country music\'s several songs about a love triangle. "Does He Love You" was written in 1982 by Billy Stritch. He recorded it with a trio in which he performed at the time, because he wanted a song that could be sung by the other two members',
'label': 'positive',
'external_id': '11828866'},
{'title': 'When the Nightingale Sings',
'text': "When the Nightingale Sings When The Nightingale Sings is a Middle English poem, author unknown, recorded in the British Library's Harley 2253 manuscript, verse 25. It is a love poem, extolling the beauty and lost love of an unknown maiden. When þe nyhtegale singes þe wodes waxen grene.<br> Lef ant gras ant blosme springes in aueryl y wene,<br> Ant love is to myn herte gon wiþ one spere so kene<br> Nyht ant day my blod hit drynkes myn herte deþ me tene. Ich have loved al þis er þat y may love namore,<br> Ich have siked moni syk lemmon for",
'label': 'hard_negative',
'external_id': '10891637'}]
}]
"""
# get remote dataset if needed
if not (os.path.exists(file)):
logger.info(f" Couldn't find {file} locally. Trying to download ...")
_download_extract_downstream_data(file, proxies=proxies)
if file.suffix.lower() == ".jsonl":
dicts = []
with open(file, encoding='utf-8') as f:
for line in f:
dicts.append(json.loads(line))
else:
dicts = json.load(open(file, encoding='utf-8'))
if max_samples:
dicts = random.sample(dicts, min(max_samples, len(dicts)))
# convert DPR dictionary to standard dictionary
query_json_keys = ["question", "questions", "query"]
positive_context_json_keys = ["positive_contexts", "positive_ctxs", "positive_context", "positive_ctx"]
hard_negative_json_keys = ["hard_negative_contexts", "hard_negative_ctxs", "hard_negative_context", "hard_negative_ctx"]
standard_dicts = []
for dict in dicts:
sample = {}
passages = []
for key, val in dict.items():
if key in query_json_keys:
sample["query"] = val
elif key in positive_context_json_keys:
if shuffle_positives:
random.shuffle(val)
for passage in val[:num_positives]:
passages.append({
"title": passage["title"],
"text": passage["text"],
"label": "positive",
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
})
elif key in hard_negative_json_keys:
if shuffle_negatives:
random.shuffle(val)
for passage in val[:num_hard_negatives]:
passages.append({
"title": passage["title"],
"text": passage["text"],
"label": "hard_negative",
"external_id": passage.get("passage_id", uuid.uuid4().hex.upper()[0:8])
})
sample["passages"] = passages
standard_dicts.append(sample)
return standard_dicts
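# Illustrative only (not part of this commit): how a single DPR-style record is
# normalized by _read_dpr_json. The record and temp file are made up; json, tempfile
# and Path are already imported at module level.
_example_record = {"question": "q",
                   "positive_ctxs": [{"title": "t", "text": "x", "passage_id": "1"}],
                   "hard_negative_ctxs": []}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as _tmp:
    json.dump([_example_record], _tmp)
_example_dicts = _read_dpr_json(Path(_tmp.name))
# -> [{"query": "q",
#      "passages": [{"title": "t", "text": "x", "label": "positive", "external_id": "1"}]}]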
def _read_squad_file(filename, proxies=None):
"""Read a SQuAD json file"""
if not (os.path.exists(filename)):
logger.info(f" Couldn't find {filename} locally. Trying to download ...")
_download_extract_downstream_data(filename, proxies)
with open(filename, "r", encoding="utf-8") as reader:
input_data = json.load(reader)["data"]
return input_data
def _http_get(url, temp_file, proxies=None):
req = requests.get(url, stream=True, proxies=proxies)
content_length = req.headers.get("Content-Length")
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def _download_extract_downstream_data(input_file, proxies=None):
# download archive to temp dir and extract to correct position
full_path = Path(os.path.realpath(input_file))
directory = full_path.parent
taskname = directory.stem
datadir = directory.parent
logger.info(
"downloading and extracting file {} to dir {}".format(taskname, datadir)
)
if taskname not in DOWNSTREAM_TASK_MAP:
logger.error("Cannot download {}. Unknown data source.".format(taskname))
else:
if os.name == "nt": # make use of NamedTemporaryFile compatible with Windows
delete_tmp_file = False
else:
delete_tmp_file = True
with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
_http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
temp_file.flush()
temp_file.seek(0) # making tempfile accessible
tfile = tarfile.open(temp_file.name)
tfile.extractall(datadir)
# temp_file gets deleted here
def _is_json(x):
if issubclass(type(x), Path):
return True
try:
json.dumps(x)
return True
except:
return False
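# Illustrative only (not part of this commit): typical inputs for the helper above.
assert _is_json({"max_seq_len": 256})                   # JSON-serializable dict -> True
assert _is_json(Path("prediction_head_0_config.json"))  # Path objects are whitelisted -> True
assert not _is_json(lambda x: x)                        # not serializable -> False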

View File

@ -2,7 +2,8 @@ import logging
from typing import Union, Optional, List
import numpy as np
from haystack.basics.visual.ascii.images import SAMPLE
from haystack.modeling.visual.ascii.images import SAMPLE
logger = logging.getLogger(__name__)

View File

@ -1,17 +1,16 @@
import logging
import numbers
from typing import Dict, List, Optional, Any
from tqdm import tqdm
import torch
import numbers
import logging
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from haystack.basics.evaluation.metrics import compute_metrics, compute_report_metrics
from haystack.basics.utils import to_numpy
from haystack.basics.utils import MLFlowLogger as MlLogger
from haystack.basics.modeling.adaptive_model import AdaptiveModel
from haystack.basics.visual.ascii.images import BUSH_SEP
from haystack.modeling.evaluation.metrics import compute_metrics, compute_report_metrics
from haystack.modeling.model.adaptive_model import AdaptiveModel
from haystack.modeling.utils import MLFlowLogger as MlLogger
from haystack.modeling.visual.ascii.images import BUSH_SEP
logger = logging.getLogger(__name__)
@ -72,14 +71,14 @@ class Evaluator:
# stack results of all batches per prediction head
for head_num, head in enumerate(model.prediction_heads):
loss_all[head_num] += np.sum(to_numpy(losses_per_head[head_num]))
preds_all[head_num] += list(to_numpy(preds[head_num]))
label_all[head_num] += list(to_numpy(labels[head_num]))
loss_all[head_num] += np.sum(_to_numpy(losses_per_head[head_num]))
preds_all[head_num] += list(_to_numpy(preds[head_num]))
label_all[head_num] += list(_to_numpy(labels[head_num]))
if head.model_type == "span_classification":
ids_all[head_num] += list(to_numpy(batch["id"]))
passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"]))
ids_all[head_num] += list(_to_numpy(batch["id"]))
passage_start_t_all[head_num] += list(_to_numpy(batch["passage_start_t"]))
if calibrate_conf_scores:
logits_all[head_num] += list(to_numpy(logits))
logits_all[head_num] += list(_to_numpy(logits))
# Evaluate per prediction head
@ -165,3 +164,10 @@ class Evaluator:
else:
if not metric_name in ["preds", "labels"] and not metric_name.startswith("_"):
logger.info("{}: {}".format(metric_name, metric_val))
def _to_numpy(container):
try:
return container.cpu().numpy()
except AttributeError:
return container

View File

@ -1,8 +1,8 @@
import logging
from functools import reduce
from typing import Callable, Dict, List
import numpy as np
from functools import reduce
from scipy.stats import pearsonr, spearmanr
from seqeval.metrics import classification_report as token_classification_report
from sklearn.metrics import (
@ -13,9 +13,8 @@ from sklearn.metrics import (
classification_report
)
from haystack.basics.utils import flatten_list
from haystack.basics.modeling.prediction_head import PredictionHead
from haystack.modeling.model.prediction_head import PredictionHead
from haystack.modeling.utils import flatten_list
logger = logging.getLogger(__name__)

View File

@ -1,23 +1,22 @@
import copy
import json
import logging
import multiprocessing
import os
from pathlib import Path
from typing import Iterable, Dict, Union, List, Optional, Callable
import multiprocessing
import numpy
import torch
from torch import nn
from transformers import AutoConfig
from transformers.convert_graph_to_onnx import convert, quantize as quantize_model
from haystack.basics.data_handler.processor import Processor
from haystack.basics.modeling.language_model import LanguageModel
from haystack.basics.modeling.prediction_head import PredictionHead
from haystack.basics.utils import MLFlowLogger as MlLogger
import haystack.basics.conversion.transformers as conv
import haystack.modeling.conversion.transformers as conv
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import PredictionHead
from haystack.modeling.utils import MLFlowLogger as MlLogger
logger = logging.getLogger(__name__)
@ -103,28 +102,6 @@ class BaseAdaptiveModel:
elif type(preds) == dict and "predictions" in preds:
preds_final.append(preds)
# todo> remove?
# else:
# preds_final = [list() for _ in range(n_heads)]
# preds = kwargs["preds"]
# preds_for_heads = stack(preds)
# logits_for_heads = [None] * n_heads
#
# samples = [s for b in kwargs["baskets"] for s in b.samples]
# kwargs["samples"] = samples
#
# del kwargs["preds"]
#
# for i, (head, preds_for_head, logits_for_head) in enumerate(zip(self.prediction_heads, preds_for_heads, # type: ignore
# logits_for_heads)):
# preds = head.formatted_preds(logits=logits_for_head, preds=preds_for_head, **kwargs)
# preds_final[i].append(preds)
#
# # Look for a merge() function amongst the heads and if a single one exists, apply it to preds_final
# merge_fn = pick_single_fn(self.prediction_heads, "merge_formatted_preds")
# if merge_fn:
# preds_final = merge_fn(preds_final)
return preds_final
def connect_heads_with_processor(self, tasks: Dict, require_labels: bool = True):
@ -196,7 +173,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
embeds_dropout_prob: float,
lm_output_types: Union[str, List[str]],
device: str,
loss_aggregation_fn=Optional[Callable],
loss_aggregation_fn: Optional[Callable] = None,
):
"""
:param language_model: Any model that turns token ids into vector representations.
@ -221,7 +198,7 @@ class AdaptiveModel(nn.Module, BaseAdaptiveModel):
shape (batchsize) per prediction head.
"""
super(AdaptiveModel, self).__init__(prediction_heads)
super(AdaptiveModel, self).__init__() # type: ignore
self.device = device
self.language_model = language_model.to(device)
self.lm_output_dims = language_model.get_output_dims()

View File

@ -6,10 +6,10 @@ from typing import List, Optional, Callable, Union, Dict
import torch
from torch import nn
from haystack.basics.data_handler.processor import Processor
from haystack.basics.modeling.language_model import LanguageModel
from haystack.basics.modeling.prediction_head import PredictionHead, TextSimilarityHead
from haystack.basics.utils import MLFlowLogger as MlLogger
from haystack.modeling.data_handler.processor import Processor
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import PredictionHead, TextSimilarityHead
from haystack.modeling.utils import MLFlowLogger as MlLogger
logger = logging.getLogger(__name__)

View File

@ -1,16 +1,12 @@
# TODO analyse if this optimization is needed or whether we can use HF transformers code
from importlib import import_module
import inspect
import logging
import sys
import inspect
from importlib import import_module
import torch
from torch.nn.parallel import DistributedDataParallel
from torch.nn import DataParallel
# Used indirectly in _get_optim() to avoid name collision with torch's AdamW
from transformers.optimization import AdamW as TransformersAdamW
from torch.nn.parallel import DistributedDataParallel
try:
from apex import amp
@ -24,7 +20,7 @@ except ImportError:
AMP_AVAILABLE = False
APEX_PARALLEL_AVAILABLE = False
from haystack.basics.utils import MLFlowLogger as MlLogger
from haystack.modeling.utils import MLFlowLogger as MlLogger
logger = logging.getLogger(__name__)
@ -85,7 +81,7 @@ def initialize_optimizer(model,
transformers.optimization by supplying the class name and the parameters for the constructor.
Examples:
1) AdamW from Transformers (Default):
{"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
{"name": "AdamW", "correct_bias": False, "weight_decay": 0.01}
2) SGD from pytorch:
{"name": "SGD", "momentum": 0.0}
3) FusedLAMB from apex:
@ -132,7 +128,7 @@ def initialize_optimizer(model,
# Use some defaults to simplify life of inexperienced users
if optimizer_opts is None:
optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
optimizer_opts = {"name": "AdamW", "correct_bias": False, "weight_decay": 0.01}
optimizer_opts["lr"] = learning_rate
if schedule_opts is None:

View File

@ -1,19 +1,19 @@
import json
import logging
import os
import numpy as np
from pathlib import Path
from transformers import AutoModelForQuestionAnswering
from typing import List, Tuple, Optional, Union, Dict
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.nn import CrossEntropyLoss, NLLLoss
from haystack.basics.data_handler.utils import is_json
from haystack.basics.utils import try_get, all_gather_list
from haystack.basics.modeling.predictions import QACandidate, QAPred
from transformers import AutoModelForQuestionAnswering
from haystack.modeling.data_handler.samples import SampleBasket
from haystack.modeling.model.predictions import QACandidate, QAPred
from haystack.modeling.utils import try_get, all_gather_list
logger = logging.getLogger(__name__)
@ -88,7 +88,7 @@ class PredictionHead(nn.Module):
for key, value in self.__dict__.items():
if type(value) is np.ndarray:
value = value.tolist()
if is_json(value) and key[0] != "_":
if _is_json(value) and key[0] != "_":
config[key] = value
if self.task_name == "text_similarity" and key == "similarity_function":
config['similarity_function'] = value
@ -166,7 +166,7 @@ class PredictionHead(nn.Module):
self.feed_forward.layer_dims[0] = input_dim
@classmethod
def _get_model_file(cls, config_file):
def _get_model_file(cls, config_file: Union[str, Path]):
if "config.json" in str(config_file) and "prediction_head" in str(config_file):
head_num = int("".join([char for char in os.path.basename(config_file) if char.isdigit()]))
model_file = Path(os.path.dirname(config_file)) / f"prediction_head_{head_num}.bin"
@ -198,7 +198,7 @@ class FeedForwardBlock(nn.Module):
layers_all.append(layer)
self.feed_forward = nn.Sequential(*layers_all)
def forward(self, X):
def forward(self, X: torch.Tensor):
logits = self.feed_forward(X)
return logits
@ -271,7 +271,7 @@ class QuestionAnsweringHead(PredictionHead):
@classmethod
def load(cls, pretrained_model_name_or_path: Union[str, Path], revision: Optional[str] = None, **kwargs): # type: ignore[override]
def load(cls, pretrained_model_name_or_path: Union[str, Path], revision: Optional[str] = None, **kwargs): # type: ignore
"""
Load a prediction head from a saved Haystack or transformers model. `pretrained_model_name_or_path`
can be one of the following:
@ -306,7 +306,7 @@ class QuestionAnsweringHead(PredictionHead):
return head
def forward(self, X):
def forward(self, X: torch.Tensor):
"""
One forward pass through the prediction head model, starting with language model output on token level
@ -314,7 +314,7 @@ class QuestionAnsweringHead(PredictionHead):
logits = self.feed_forward(X)
return self.temperature_scale(logits)
def logits_to_loss(self, logits, labels, **kwargs):
def logits_to_loss(self, logits: torch.Tensor, labels: torch.Tensor, **kwargs):
"""
Combine predictions and labels to a per sample loss.
"""
@ -346,11 +346,11 @@ class QuestionAnsweringHead(PredictionHead):
return per_sample_loss
def temperature_scale(self, logits):
def temperature_scale(self, logits: torch.Tensor):
return torch.div(logits, self.temperature_for_confidence)
def calibrate_conf(self, logits, label_all):
def calibrate_conf(self, logits: List[torch.Tensor], label_all: List[torch.Tensor]):
"""
Learning a temperature parameter to apply temperature scaling to calibrate confidence scores
"""
@ -389,8 +389,8 @@ class QuestionAnsweringHead(PredictionHead):
optimizer.step(eval_start_end_logits)
def logits_to_preds(self, logits, span_mask, start_of_word,
seq_2_start_t, max_answer_length=1000, **kwargs):
def logits_to_preds(self, logits: torch.Tensor, span_mask: torch.Tensor, start_of_word: torch.Tensor,
seq_2_start_t: torch.Tensor, max_answer_length: int = 1000, **kwargs):
"""
Get the predicted index of start and end token of the answer. Note that the output is at token level
and not word level. Note also that these logits correspond to the tokens of a sample
@ -462,7 +462,7 @@ class QuestionAnsweringHead(PredictionHead):
return all_top_n
def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx, start_matrix = None, end_matrix = None):
def get_top_candidates(self, sorted_candidates: torch.Tensor, start_end_matrix: torch.Tensor, sample_idx: int, start_matrix: torch.Tensor, end_matrix: torch.Tensor):
""" Returns top candidate answers as a list of Span objects. Operates on a matrix of summed start and end logits.
This matrix corresponds to a single sample (includes special tokens, question tokens, passage tokens).
This method always returns a list of len n_best + 1 (it is comprised of the n_best positive answers along with the one no_answer)"""
@ -518,7 +518,7 @@ class QuestionAnsweringHead(PredictionHead):
return top_candidates
def formatted_preds(self, logits=None, preds=None, baskets=None, **kwargs):
def formatted_preds(self, logits: Optional[torch.Tensor] = None, preds: Optional[List[QACandidate]] = None, baskets: Optional[List[SampleBasket]] = None, **kwargs):
""" Takes a list of passage level predictions, each corresponding to one sample, and converts them into document level
predictions. Leverages information in the SampleBaskets. Assumes that we are being passed predictions from
ALL samples in the one SampleBasket i.e. all passages of a document. Logits should be None, because we have
@ -552,7 +552,7 @@ class QuestionAnsweringHead(PredictionHead):
return doc_preds
def to_qa_preds(self, top_preds, no_ans_gaps, baskets):
def to_qa_preds(self, top_preds: Tuple[QACandidate], no_ans_gaps: Tuple[float], baskets: Tuple[SampleBasket]):
""" Groups Span objects together in a QAPred object """
ret = []
@ -586,7 +586,7 @@ class QuestionAnsweringHead(PredictionHead):
return ret
@staticmethod
def get_ground_truth(basket):
def get_ground_truth(basket: SampleBasket):
if "answers" in basket.raw:
return basket.raw["answers"]
elif "annotations" in basket.raw:
@ -595,7 +595,7 @@ class QuestionAnsweringHead(PredictionHead):
return None
@staticmethod
def get_question(question_names, raw_dict):
def get_question(question_names: List[str], raw_dict: Dict):
# For NQ style dicts
qa_name = None
if "qas" in raw_dict:
@ -607,12 +607,6 @@ class QuestionAnsweringHead(PredictionHead):
return raw_dict[qa_name][0]["question"]
return try_get(question_names, raw_dict)
def has_no_answer_idxs(self, sample_top_n):
for start, end, score in sample_top_n:
if start == 0 and end == 0:
return True
return False
def aggregate_preds(self, preds, passage_start_t, ids, seq_2_start_t=None, labels=None):
""" Aggregate passage level predictions to create document level predictions.
This method assumes that all passages of each document are contained in preds
@ -940,7 +934,7 @@ class TextSimilarityHead(PredictionHead):
softmax_scores = nn.functional.log_softmax(scores, dim=1)
return softmax_scores
def logits_to_loss(self, logits: Tuple[torch.Tensor, torch.Tensor], label_ids, **kwargs): # type: ignore[override]
def logits_to_loss(self, logits: Tuple[torch.Tensor, torch.Tensor], label_ids, **kwargs): # type: ignore
"""
Computes the loss (Default: NLLLoss) by applying a similarity function (Default: dot product) to the input
tuple of (query_vectors, passage_vectors) and afterwards applying the loss function on similarity scores.
@ -1017,7 +1011,7 @@ class TextSimilarityHead(PredictionHead):
_, sorted_scores = torch.sort(softmax_scores, dim=1, descending=True)
return sorted_scores
def prepare_labels(self, label_ids, **kwargs) -> torch.Tensor: # type: ignore[override]
def prepare_labels(self, label_ids, **kwargs) -> torch.Tensor: # type: ignore
"""
Returns a tensor with passage labels(0:hard_negative/1:positive) for each query
@ -1032,4 +1026,13 @@ class TextSimilarityHead(PredictionHead):
return labels
def formatted_preds(self, logits: Tuple[torch.Tensor, torch.Tensor], **kwargs):
raise NotImplementedError("formatted_preds is not supported in TextSimilarityHead yet!")
raise NotImplementedError("formatted_preds is not supported in TextSimilarityHead yet!")
def _is_json(x):
if issubclass(type(x), Path):
return True
try:
json.dumps(x)
return True
except:
return False

View File

@ -1,7 +1,6 @@
from typing import List, Any, Optional, Tuple, Union, Dict
from abc import ABC
import logging
from abc import ABC
from typing import List, Any, Optional, Tuple, Union, Dict
logger = logging.getLogger(__name__)

View File

@ -31,11 +31,9 @@ from transformers import (
DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast,
BigBirdTokenizer, BigBirdTokenizerFast
)
from transformers import AutoConfig
from haystack.basics.data_handler.samples import SampleBasket
from haystack.modeling.data_handler.samples import SampleBasket
logger = logging.getLogger(__name__)

View File

@ -1,20 +1,20 @@
import logging
import shutil
import sys
from pathlib import Path
from typing import Optional, Tuple, List
import torch
from pathlib import Path
from tqdm import tqdm
import numpy
import shutil
import dill
import numpy
import torch
from tqdm import tqdm
from haystack.basics.utils import MLFlowLogger as MlLogger
from haystack.basics.utils import GracefulKiller
from haystack.basics.eval import Evaluator
from haystack.basics.data_handler.data_silo import DataSilo
from haystack.basics.modeling.adaptive_model import AdaptiveModel
from haystack.basics.modeling.optimization import get_scheduler
from haystack.modeling.data_handler.data_silo import DataSilo
from haystack.modeling.evaluation.eval import Evaluator
from haystack.modeling.model.adaptive_model import AdaptiveModel
from haystack.modeling.model.optimization import get_scheduler
from haystack.modeling.utils import GracefulKiller
from haystack.modeling.utils import MLFlowLogger as MlLogger
try:
from apex import amp

View File

@ -0,0 +1 @@
# TODO create DPR_Trainer class here that can be called from retriever.dense.DPR.train()

View File

@ -0,0 +1 @@
# TODO make QA_Trainer class and use it inside reader.train()

View File

@ -1,19 +1,15 @@
import hashlib
from torch import multiprocessing as mp
import json
import logging
import os
import pickle
import random
import signal
from copy import deepcopy
import mlflow
import numpy as np
import torch
import torch.distributed as dist
from requests.exceptions import ConnectionError
import mlflow
from copy import deepcopy
import pickle
import random
import os
import numpy as np
from haystack.basics.visual.ascii.images import WORKER_M, WORKER_F, WORKER_X
logger = logging.getLogger(__name__)
@ -202,68 +198,6 @@ def initialize_device_settings(use_cuda, local_rank=-1, use_amp=None):
logger.info(f"Automatic Mixed Precision: {use_amp}")
return device, n_gpu
def calc_chunksize(num_dicts, min_chunksize=4, max_chunksize=2000, max_processes=128):
if mp.cpu_count() > 3:
num_cpus = min(mp.cpu_count() - 1 or 1, max_processes) # -1 to keep a CPU core free for xxx
else:
num_cpus = min(mp.cpu_count(), max_processes) # when there are few cores, we use all of them
dicts_per_cpu = np.ceil(num_dicts / num_cpus)
# automatic adjustment of multiprocessing chunksize
# for small files (containing few dicts) we want small chunksize to utilize all available cores but never less
# than 2, because we need it to sample another random sentence in LM finetuning
# for large files we want to minimize processor spawning without giving too much data to one process, so we
# clip it at 5k
multiprocessing_chunk_size = int(np.clip((np.ceil(dicts_per_cpu / 5)), a_min=min_chunksize, a_max=max_chunksize))
# This lets us avoid cases in lm_finetuning where a chunk only has a single doc and hence cannot pick
# a valid next sentence substitute from another document
if num_dicts != 1:
while num_dicts % multiprocessing_chunk_size == 1:
multiprocessing_chunk_size -= -1
dict_batches_to_process = int(num_dicts / multiprocessing_chunk_size)
num_processes = min(num_cpus, dict_batches_to_process) or 1
return multiprocessing_chunk_size, num_processes
def log_ascii_workers(n, logger):
m_worker_lines = WORKER_M.split("\n")
f_worker_lines = WORKER_F.split("\n")
x_worker_lines = WORKER_X.split("\n")
all_worker_lines = []
for i in range(n):
rand = np.random.randint(low=0,high=3)
if(rand % 3 == 0):
all_worker_lines.append(f_worker_lines)
elif(rand % 3 == 1):
all_worker_lines.append(m_worker_lines)
else:
all_worker_lines.append(x_worker_lines)
zipped = zip(*all_worker_lines)
for z in zipped:
logger.info(" ".join(z))
## Helper fcts
def get_dict_checksum(payload_dict):
"""
Get MD5 checksum for a dict.
"""
checksum = hashlib.md5(json.dumps(payload_dict, sort_keys=True).encode("utf-8")).hexdigest()
return checksum
def to_numpy(container):
try:
return container.cpu().numpy()
except AttributeError:
return container
def stack(list_of_lists):
n_lists_final = len(list_of_lists[0])
ret = [list() for _ in range(n_lists_final)]
for l in list_of_lists:
for i, x in enumerate(l):
ret[i] += (x)
return ret
def flatten_list(nested_list):
"""Flatten an arbitrarily nested list, without recursion (to avoid

View File

@ -1,22 +1,19 @@
import pytest
import torch
import logging
import numpy as np
from pathlib import Path
import numpy as np
import pytest
import torch
from torch.utils.data import SequentialSampler
from tqdm import tqdm
from haystack.basics.data_handler.dataloader import NamedDataLoader
from haystack.basics.data_handler.processor import TextSimilarityProcessor
from haystack.basics.data_handler.data_silo import DataSilo
from haystack.basics.train import Trainer
from haystack.basics.modeling.optimization import initialize_optimizer
from haystack.basics.modeling.biadaptive_model import BiAdaptiveModel
from haystack.basics.modeling.language_model import LanguageModel, DPRContextEncoder, DPRQuestionEncoder
from haystack.basics.modeling.prediction_head import TextSimilarityHead
from haystack.basics.modeling.tokenization import Tokenizer
from haystack.basics.utils import set_all_seeds, initialize_device_settings
from haystack.modeling.data_handler.dataloader import NamedDataLoader
from haystack.modeling.data_handler.processor import TextSimilarityProcessor
from haystack.modeling.model.biadaptive_model import BiAdaptiveModel
from haystack.modeling.model.language_model import LanguageModel, DPRContextEncoder, DPRQuestionEncoder
from haystack.modeling.model.prediction_head import TextSimilarityHead
from haystack.modeling.model.tokenization import Tokenizer
from haystack.modeling.utils import set_all_seeds, initialize_device_settings
def test_dpr_modules(caplog=None):
@ -687,120 +684,122 @@ def test_dpr_processor_save_load_non_bert_tokenizer(query_and_passage_model):
# model that was saved to disk earlier
assert np.array_equal(all_embeddings["query"][0], all_embeddings3["query"][0])
def test_dpr_training():
batch_size = 1
n_epochs = 1
distributed = False # enable for multi GPU training via DDP
evaluate_every = 1
question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
do_lower_case = True
use_fast = True
similarity_function = "dot_product"
device, n_gpu = initialize_device_settings(use_cuda=False)
query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=question_lang_model,
do_lower_case=do_lower_case, use_fast=use_fast)
passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_lang_model,
do_lower_case=do_lower_case, use_fast=use_fast)
label_list = ["hard_negative", "positive"]
processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
passage_tokenizer=passage_tokenizer,
max_seq_len_query=10,
max_seq_len_passage=10,
label_list=label_list,
metric="text_similarity_metric",
data_dir="samples/dpr/",
train_filename="sample.json",
dev_filename="sample.json",
test_filename=None,
embed_title=True,
num_hard_negatives=1,
dev_split=0,
max_samples=2)
data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
question_language_model = LanguageModel.load(pretrained_model_name_or_path=question_lang_model,
language_model_class="DPRQuestionEncoder")
passage_language_model = LanguageModel.load(pretrained_model_name_or_path=passage_lang_model,
language_model_class="DPRContextEncoder")
prediction_head = TextSimilarityHead(similarity_function=similarity_function)
model = BiAdaptiveModel(
language_model1=question_language_model,
language_model2=passage_language_model,
prediction_heads=[prediction_head],
embeds_dropout_prob=0.1,
lm1_output_types=["per_sequence"],
lm2_output_types=["per_sequence"],
device=device,
)
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=1e-5,
optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
"eps": 1e-08},
schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
grad_acc_steps=1,
device=device,
distributed=distributed
)
trainer = Trainer(
model=model,
optimizer=optimizer,
data_silo=data_silo,
epochs=n_epochs,
n_gpu=n_gpu,
lr_schedule=lr_schedule,
evaluate_every=evaluate_every,
device=device,
)
trainer.train()
######## save and load model again
save_dir = Path("testsave/dpr-model")
model.save(save_dir)
del model
model2 = BiAdaptiveModel.load(save_dir, device=device)
model2, optimizer2, lr_schedule = initialize_optimizer(
model=model2,
learning_rate=1e-5,
optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
"eps": 1e-08},
schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
grad_acc_steps=1,
device=device,
distributed=distributed
)
trainer2 = Trainer(
model=model2,
optimizer=optimizer,
data_silo=data_silo,
epochs=n_epochs,
n_gpu=n_gpu,
lr_schedule=lr_schedule,
evaluate_every=evaluate_every,
device=device,
)
trainer2.train()
# TODO fix CI errors (tests pass locally or on AWS; next steps: isolate PyTorch versions once FARM dependency is removed)
# def test_dpr_training():
# batch_size = 1
# n_epochs = 1
# distributed = False # enable for multi GPU training via DDP
# evaluate_every = 1
# question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
# passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
# do_lower_case = True
# use_fast = True
# similarity_function = "dot_product"
#
#
#
# device, n_gpu = initialize_device_settings(use_cuda=False)
#
# query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=question_lang_model,
# do_lower_case=do_lower_case, use_fast=use_fast)
# passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_lang_model,
# do_lower_case=do_lower_case, use_fast=use_fast)
# label_list = ["hard_negative", "positive"]
#
# processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
# passage_tokenizer=passage_tokenizer,
# max_seq_len_query=10,
# max_seq_len_passage=10,
# label_list=label_list,
# metric="text_similarity_metric",
# data_dir="samples/dpr/",
# train_filename="sample.json",
# dev_filename="sample.json",
# test_filename=None,
# embed_title=True,
# num_hard_negatives=1,
# dev_split=0,
# max_samples=2)
#
# data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
#
# question_language_model = LanguageModel.load(pretrained_model_name_or_path=question_lang_model,
# language_model_class="DPRQuestionEncoder")
# passage_language_model = LanguageModel.load(pretrained_model_name_or_path=passage_lang_model,
# language_model_class="DPRContextEncoder")
#
# prediction_head = TextSimilarityHead(similarity_function=similarity_function)
#
# model = BiAdaptiveModel(
# language_model1=question_language_model,
# language_model2=passage_language_model,
# prediction_heads=[prediction_head],
# embeds_dropout_prob=0.1,
# lm1_output_types=["per_sequence"],
# lm2_output_types=["per_sequence"],
# device=device,
# )
#
# model, optimizer, lr_schedule = initialize_optimizer(
# model=model,
# learning_rate=1e-5,
# optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
# "eps": 1e-08},
# schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
# n_batches=len(data_silo.loaders["train"]),
# n_epochs=n_epochs,
# grad_acc_steps=1,
# device=device,
# distributed=distributed
# )
#
# trainer = Trainer(
# model=model,
# optimizer=optimizer,
# data_silo=data_silo,
# epochs=n_epochs,
# n_gpu=n_gpu,
# lr_schedule=lr_schedule,
# evaluate_every=evaluate_every,
# device=device,
# )
#
# trainer.train()
#
# ######## save and load model again
# save_dir = Path("testsave/dpr-model")
# model.save(save_dir)
# del model
#
# model2 = BiAdaptiveModel.load(save_dir, device=device)
# model2, optimizer2, lr_schedule = initialize_optimizer(
# model=model2,
# learning_rate=1e-5,
# optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0, \
# "eps": 1e-08},
# schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
# n_batches=len(data_silo.loaders["train"]),
# n_epochs=n_epochs,
# grad_acc_steps=1,
# device=device,
# distributed=distributed
# )
# trainer2 = Trainer(
# model=model2,
# optimizer=optimizer,
# data_silo=data_silo,
# epochs=n_epochs,
# n_gpu=n_gpu,
# lr_schedule=lr_schedule,
# evaluate_every=evaluate_every,
# device=device,
# )
#
# trainer2.train()
if __name__=="__main__":
test_dpr_training()
#test_dpr_context_only()
# test_dpr_training()
test_dpr_context_only()
# test_dpr_modules()

View File

@ -1,9 +1,9 @@
import logging
from haystack.basics.modeling.adaptive_model import AdaptiveModel
from haystack.basics.modeling.language_model import LanguageModel
from haystack.basics.modeling.prediction_head import QuestionAnsweringHead
from haystack.basics.utils import set_all_seeds, initialize_device_settings
from haystack.modeling.model.adaptive_model import AdaptiveModel
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.prediction_head import QuestionAnsweringHead
from haystack.modeling.utils import set_all_seeds, initialize_device_settings
def test_prediction_head_load_save(tmp_path, caplog=None):

View File

@ -1,7 +1,7 @@
import logging
from haystack.basics.data_handler.processor import SquadProcessor
from haystack.basics.modeling.tokenization import Tokenizer
from haystack.modeling.data_handler.processor import SquadProcessor
from haystack.modeling.model.tokenization import Tokenizer
# during inference (parameter return_baskets = False) we do not convert labels

View File

@ -1,9 +1,9 @@
import logging
from pathlib import Path
from haystack.basics.data_handler.processor import SquadProcessor
from haystack.basics.modeling.tokenization import Tokenizer
from haystack.basics.utils import set_all_seeds
from haystack.modeling.data_handler.processor import SquadProcessor
from haystack.modeling.model.tokenization import Tokenizer
from haystack.modeling.utils import set_all_seeds
import torch
@ -20,7 +20,7 @@ def test_processor_saving_loading(caplog):
processor = SquadProcessor(
tokenizer=tokenizer,
max_seq_len=128,
max_seq_len=256,
label_list=["start_token", "end_token"],
train_filename="train-sample.json",
dev_filename="dev-sample.json",
@ -29,14 +29,14 @@ def test_processor_saving_loading(caplog):
)
dicts = processor.file_to_dicts(file=Path("samples/qa/dev-sample.json"))
data, tensor_names, _ = processor.dataset_from_dicts(dicts)
data, tensor_names, _ = processor.dataset_from_dicts(dicts=dicts, indices=[1])
save_dir = Path("testsave/processor")
processor.save(save_dir)
processor = processor.load_from_dir(save_dir)
dicts = processor.file_to_dicts(file=Path("samples/qa/dev-sample.json"))
data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts)
data_loaded, tensor_names_loaded, _ = processor.dataset_from_dicts(dicts, indices=[1])
assert tensor_names == tensor_names_loaded
for i in range(len(data.tensors)):

View File

@ -1,20 +1,21 @@
import pytest
import logging
from pathlib import Path
from haystack.basics.data_handler.data_silo import DataSilo
from haystack.basics.modeling.language_model import LanguageModel
from haystack.basics.modeling.optimization import initialize_optimizer
from haystack.basics.modeling.prediction_head import QuestionAnsweringHead
from haystack.basics.modeling.tokenization import Tokenizer
from haystack.basics.train import Trainer
from haystack.basics.utils import set_all_seeds, initialize_device_settings
from haystack.basics.data_handler.processor import SquadProcessor
from haystack.basics.modeling.adaptive_model import AdaptiveModel
from haystack.modeling.data_handler.data_silo import DataSilo
from haystack.modeling.data_handler.processor import SquadProcessor
from haystack.modeling.model.adaptive_model import AdaptiveModel
from haystack.modeling.model.language_model import LanguageModel
from haystack.modeling.model.optimization import initialize_optimizer
from haystack.modeling.model.prediction_head import QuestionAnsweringHead
from haystack.modeling.model.tokenization import Tokenizer
from haystack.modeling.training.base import Trainer
from haystack.modeling.utils import set_all_seeds, initialize_device_settings
@pytest.fixture()
def distilbert_squad():
def test_training(caplog=None):
if caplog:
caplog.set_level(logging.CRITICAL)
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=False)
batch_size = 2
@ -25,12 +26,12 @@ def distilbert_squad():
tokenizer = Tokenizer.load(
pretrained_model_name_or_path=base_LM_model,
do_lower_case=True,
use_fast=True # TODO parametrize this to test slow as well
use_fast=True # TODO parametrize this to test slow as well
)
label_list = ["start_token", "end_token"]
processor = SquadProcessor(
tokenizer=tokenizer,
max_seq_len=20,
max_seq_len=256,
doc_stride=10,
max_query_length=6,
train_filename="train-sample.json",
@ -55,7 +56,7 @@ def distilbert_squad():
model, optimizer, lr_schedule = initialize_optimizer(
model=model,
learning_rate=2e-5,
#optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
# optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
n_batches=len(data_silo.loaders["train"]),
n_epochs=n_epochs,
device=device
@ -72,14 +73,6 @@ def distilbert_squad():
)
trainer.train()
return model, processor
def test_training(distilbert_squad, caplog=None):
if caplog:
caplog.set_level(logging.CRITICAL)
model, processor = distilbert_squad
assert type(model) == AdaptiveModel
assert type(processor) == SquadProcessor

View File

@ -6,7 +6,7 @@ from transformers import BertTokenizer, BertTokenizerFast, RobertaTokenizer, Rob
from tokenizers.pre_tokenizers import WhitespaceSplit
from haystack.basics.modeling.tokenization import Tokenizer
from haystack.modeling.model.tokenization import Tokenizer
import numpy as np