Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-12-13 15:57:24 +00:00)
LFQA: Remove InferenceProcessor dependency (#1559)
commit 74d052277d (parent 3539e6b041)
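
In short: _RetribertEmbeddingEncoder previously wrapped its tokenizer in an InferenceProcessor just to turn raw texts into a model-ready dataset. The diff below removes that dependency: the tokenizer now lives on the instance, and a new dataset_from_dicts helper tokenizes with the transformers tokenizer directly and builds the dataset via convert_features_to_dataset.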
@@ -6,6 +6,7 @@ from torch.nn import DataParallel
 import numpy as np
 from pathlib import Path
 
+from haystack.modeling.data_handler.dataset import convert_features_to_dataset
 from haystack.modeling.utils import initialize_device_settings
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, AutoModel
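
For context, convert_features_to_dataset (newly imported above) turns a list of per-sample feature dicts into a torch dataset plus the ordered tensor names, as the new code at the bottom of this diff relies on. A minimal sketch of how it is used; the token id values here are made up:

from haystack.modeling.data_handler.dataset import convert_features_to_dataset

# Two pre-tokenized samples; the id values are illustrative only.
features = [
    {"input_ids": [101, 7592, 102], "padding_mask": [1, 1, 1], "segment_ids": [0, 0, 0]},
    {"input_ids": [101, 2088, 102], "padding_mask": [1, 1, 1], "segment_ids": [0, 0, 0]},
]
dataset, tensor_names = convert_features_to_dataset(features=features)
print(tensor_names)  # e.g. ['input_ids', 'padding_mask', 'segment_ids']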
@@ -19,7 +20,7 @@ from haystack.modeling.model.tokenization import Tokenizer
 from haystack.modeling.model.language_model import LanguageModel
 from haystack.modeling.model.biadaptive_model import BiAdaptiveModel
 from haystack.modeling.model.prediction_head import TextSimilarityHead
-from haystack.modeling.data_handler.processor import TextSimilarityProcessor, InferenceProcessor
+from haystack.modeling.data_handler.processor import TextSimilarityProcessor
 from haystack.modeling.data_handler.data_silo import DataSilo
 from haystack.modeling.data_handler.dataloader import NamedDataLoader
 from haystack.modeling.model.optimization import initialize_optimizer
@@ -693,11 +694,8 @@ class _RetribertEmbeddingEncoder(_EmbeddingEncoder):
         else:
             self.device = torch.device("cpu")
 
-        embedding_tokenizer = AutoTokenizer.from_pretrained(retriever.embedding_model,
-                                                            use_fast_tokenizers=True)
+        self.embedding_tokenizer = AutoTokenizer.from_pretrained(retriever.embedding_model)
         self.embedding_model = AutoModel.from_pretrained(retriever.embedding_model).to(self.device)
-        self.processor = InferenceProcessor(tokenizer=embedding_tokenizer,
-                                            max_seq_len=embedding_tokenizer.max_len_single_sentence)
 
     def embed_queries(self, texts: List[str]) -> List[np.ndarray]:
 
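
With the InferenceProcessor gone, the constructor loads tokenizer and model straight from transformers. A minimal standalone sketch of the same pattern (the checkpoint name is illustrative; AutoTokenizer returns a fast tokenizer by default, which is why the explicit use_fast_tokenizers flag could be dropped):

import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "yjernite/retribert-base-uncased"  # illustrative checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_name)  # fast tokenizer by default
model = AutoModel.from_pretrained(model_name).to(device)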
@@ -735,8 +733,29 @@ class _RetribertEmbeddingEncoder(_EmbeddingEncoder):
 
     def _create_dataloader(self, text_to_encode: List[dict]) -> NamedDataLoader:
 
-        dataset, tensor_names, _ = self.processor.dataset_from_dicts(text_to_encode,
-                                                                     indices=[i for i in range(len(text_to_encode))])
+        dataset, tensor_names = self.dataset_from_dicts(text_to_encode)
         dataloader = NamedDataLoader(dataset=dataset, sampler=SequentialSampler(dataset),
                                      batch_size=32, tensor_names=tensor_names)
         return dataloader
 
+    def dataset_from_dicts(self, dicts: List[dict]):
+        texts = [x["text"] for x in dicts]
+        tokenized_batch = self.embedding_tokenizer(
+            texts,
+            return_token_type_ids=True,
+            return_attention_mask=True,
+            truncation=True,
+            padding=True
+        )
+
+        features_flat = []
+        for input_ids, segment_ids, padding_mask in zip(
+            tokenized_batch["input_ids"], tokenized_batch["token_type_ids"], tokenized_batch["attention_mask"]
+        ):
+            feat_dict = {"input_ids": input_ids,
+                         "padding_mask": padding_mask,
+                         "segment_ids": segment_ids}
+            features_flat.append(feat_dict)
+
+        dataset, tensornames = convert_features_to_dataset(features=features_flat)
+        return dataset, tensornames
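
The added dataset_from_dicts is the whole replacement for the InferenceProcessor: tokenize with the instance tokenizer, rename transformers' outputs to the tensor names the Haystack modeling stack expects (attention_mask becomes padding_mask, token_type_ids becomes segment_ids), then convert the features. A self-contained sketch of the same flow, with an illustrative checkpoint:

from torch.utils.data import SequentialSampler
from transformers import AutoTokenizer
from haystack.modeling.data_handler.dataset import convert_features_to_dataset
from haystack.modeling.data_handler.dataloader import NamedDataLoader

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative
batch = tokenizer(
    ["what is long-form question answering?", "how does a dense retriever work?"],
    return_token_type_ids=True,
    return_attention_mask=True,
    truncation=True,
    padding=True,
)

# Rename transformers' outputs to the names the modeling stack expects.
features = [
    {"input_ids": ids, "padding_mask": mask, "segment_ids": segs}
    for ids, segs, mask in zip(
        batch["input_ids"], batch["token_type_ids"], batch["attention_mask"]
    )
]

dataset, tensor_names = convert_features_to_dataset(features=features)
dataloader = NamedDataLoader(dataset=dataset, sampler=SequentialSampler(dataset),
                             batch_size=32, tensor_names=tensor_names)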