diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index 61400ffb8..5871fc404 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -508,7 +508,7 @@ class FARMReader(BaseReader): # Create DataLoader that can be passed to the Evaluator tic = perf_counter() indices = range(len(farm_input)) - dataset, tensor_names = self.inferencer.processor.dataset_from_dicts(farm_input, indices=indices) + dataset, tensor_names, problematic_ids = self.inferencer.processor.dataset_from_dicts(farm_input, indices=indices) data_loader = NamedDataLoader(dataset=dataset, batch_size=self.inferencer.batch_size, tensor_names=tensor_names) evaluator = Evaluator(data_loader=data_loader, tasks=self.inferencer.processor.tasks, device=device) diff --git a/haystack/retriever/dense.py b/haystack/retriever/dense.py index 89e33939f..127decacc 100644 --- a/haystack/retriever/dense.py +++ b/haystack/retriever/dense.py @@ -179,7 +179,7 @@ class DensePassageRetriever(BaseRetriever): """ - dataset, tensor_names, baskets = self.processor.dataset_from_dicts( + dataset, tensor_names, problematic_ids, baskets = self.processor.dataset_from_dicts( dicts, indices=[i for i in range(len(dicts))], return_baskets=True ) diff --git a/haystack/summarizer/transformers.py b/haystack/summarizer/transformers.py index 9e7a237ca..329468d2a 100644 --- a/haystack/summarizer/transformers.py +++ b/haystack/summarizer/transformers.py @@ -2,6 +2,8 @@ import logging from typing import Any, Dict, List, Optional from transformers import pipeline +from transformers.models.auto.modeling_auto import AutoModelForSeq2SeqLM +from transformers import AutoTokenizer from haystack import Document from haystack.summarizer.base import BaseSummarizer @@ -73,7 +75,11 @@ class TransformersSummarizer(BaseSummarizer): into a single text. This separator appears between those subsequent docs. """ - self.summarizer = pipeline("summarization", model=model_name_or_path, tokenizer=tokenizer, device=use_gpu) + # TODO AutoModelForSeq2SeqLM is only necessary with transformers==4.1.1, with newer versions use the pipeline directly + if tokenizer is None: + tokenizer = model_name_or_path + model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_name_or_path) + self.summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=use_gpu) self.max_length = max_length self.min_length = min_length self.clean_up_tokenization_spaces = clean_up_tokenization_spaces diff --git a/requirements.txt b/requirements.txt index a279ab8a3..83a77c027 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -farm==0.5.0 +farm==0.6.2 --find-links=https://download.pytorch.org/whl/torch_stable.html fastapi uvicorn diff --git a/test/test_dpr_retriever.py b/test/test_dpr_retriever.py index 9433ed4dc..40f7ab4d5 100644 --- a/test/test_dpr_retriever.py +++ b/test/test_dpr_retriever.py @@ -113,6 +113,6 @@ def test_dpr_saving_and_loading(retriever, document_store): assert loaded_retriever.query_tokenizer.do_lower_case == True assert loaded_retriever.passage_tokenizer.vocab_size == 30522 assert loaded_retriever.query_tokenizer.vocab_size == 30522 - assert loaded_retriever.passage_tokenizer.max_len == 512 - assert loaded_retriever.query_tokenizer.max_len == 512 + assert loaded_retriever.passage_tokenizer.model_max_length == 512 + assert loaded_retriever.query_tokenizer.model_max_length == 512