diff --git a/haystack/retriever/dpr_utils.py b/haystack/retriever/dpr_utils.py deleted file mode 100644 index 86a66949c..000000000 --- a/haystack/retriever/dpr_utils.py +++ /dev/null @@ -1,726 +0,0 @@ -#!/usr/bin/env python3 -# Utilility functions and classes required for DensePassageRetriever -# -# Building upon the code (https://github.com/facebookresearch/DPR) published by Facebook research under Creative Commons License (https://github.com/facebookresearch/DPR/blob/master/LICENSE) -# It is based on the following research work: -# Karpukhin, Vladimir, et al. "Dense Passage Retrieval for Open-Domain Question Answering." arXiv preprint arXiv:2004.04906 (2020). -# (https://arxiv.org/abs/2004.04906) - -import logging -from typing import Tuple, Union, List - -import gzip -import re - -import torch -from torch import nn, Tensor - -from transformers.modeling_bert import BertModel, BertConfig -from transformers.file_utils import add_start_docstrings_to_callable -from transformers.modeling_utils import PreTrainedModel -from transformers.file_utils import add_start_docstrings -from transformers.tokenization_bert import BertTokenizer, BertTokenizerFast - -import logging -from dataclasses import dataclass -from typing import Optional, Tuple, Union - - - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - -_CONFIG_FOR_DOC = "DPRConfig" - -DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/dpr-ctx_encoder-single-nq-base", -] -DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/dpr-question_encoder-single-nq-base", -] -DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/dpr-reader-single-nq-base", -] -# CLASSES -############ -# file_utils -############ - -class ModelOutput: - """ - Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows preprocessor by integer or slice (like - a tuple) or strings (like a dictionnary) that will ignore the ``None`` attributes. - """ - - def to_tuple(self): - """ - Converts :obj:`self` to a tuple. - - Return: A tuple containing all non-:obj:`None` attributes of the :obj:`self`. - """ - return tuple(getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None) - - def to_dict(self): - """ - Converts :obj:`self` to a Python dictionary. - - Return: A dictionary containing all non-:obj:`None` attributes of the :obj:`self`. - """ - return {f: getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None} - - def __getitem__(self, i): - return self.to_dict()[i] if isinstance(i, str) else self.to_tuple()[i] - - def __len__(self): - return len(self.to_tuple()) - -RETURN_INTRODUCTION = r""" - Returns: - :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)` (if ``return_tuple=True`` is passed or when ``config.return_tuple=True``) comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs: -""" - -def _prepare_output_docstrings(output_type, config_class): - """ - Prepares the return part of the docstring using `output_type`. 
- """ - docstrings = output_type.__doc__ - - # Remove the head of the docstring to keep the list of args only - lines = docstrings.split("\n") - i = 0 - while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None: - i += 1 - if i < len(lines): - docstrings = "\n".join(lines[(i + 1) :]) - - # Add the return introduction - full_output_type = f"{output_type.__module__}.{output_type.__name__}" - intro = RETURN_INTRODUCTION.format(full_output_type=full_output_type, config_class=config_class) - return intro + docstrings - -def replace_return_docstrings(output_type=None, config_class=None): - def docstring_decorator(fn): - docstrings = fn.__doc__ - lines = docstrings.split("\n") - i = 0 - while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None: - i += 1 - if i < len(lines): - lines[i] = _prepare_output_docstrings(output_type, config_class) - docstrings = "\n".join(lines) - else: - raise ValueError( - f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, current docstring is:\n{docstrings}" - ) - fn.__doc__ = docstrings - return fn - - return docstring_decorator - -########### -# modeling_outputs -########### -@dataclass -class BaseModelOutputWithPooling(ModelOutput): - """ - Base class for model's outputs that also contains a pooling of the last hidden states. - - Args: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during pretraining. - - This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - - last_hidden_state: torch.FloatTensor - pooler_output: torch.FloatTensor - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -########### -#tokenization_dpr -########### - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} - -CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "facebook/dpr-ctx_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - } -} -QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "facebook/dpr-question_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - } -} -READER_PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "facebook/dpr-reader-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - } -} - -CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/dpr-ctx_encoder-single-nq-base": 512, -} -QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/dpr-question_encoder-single-nq-base": 512, -} -READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/dpr-reader-single-nq-base": 512, -} - - -CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION = { - "facebook/dpr-ctx_encoder-single-nq-base": {"do_lower_case": True}, -} -QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION = { - "facebook/dpr-question_encoder-single-nq-base": {"do_lower_case": True}, -} -READER_PRETRAINED_INIT_CONFIGURATION = { - "facebook/dpr-reader-single-nq-base": {"do_lower_case": True}, -} - - -class DPRContextEncoderTokenizer(BertTokenizer): - r""" - Constructs a DPRContextEncoderTokenizer. - - :class:`~transformers.DPRContextEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end - tokenization: punctuation splitting + wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION - - -class DPRContextEncoderTokenizerFast(BertTokenizerFast): - r""" - Constructs a "Fast" DPRContextEncoderTokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end - tokenization: punctuation splitting + wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION - - -class DPRQuestionEncoderTokenizer(BertTokenizer): - r""" - Constructs a DPRQuestionEncoderTokenizer. - - :class:`~transformers.DPRQuestionEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end - tokenization: punctuation splitting + wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning - parameters. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION - - -class DPRQuestionEncoderTokenizerFast(BertTokenizerFast): - r""" - Constructs a "Fast" DPRQuestionEncoderTokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end - tokenization: punctuation splitting + wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION - - - - -########## -# configuration_dpr -########## - - -DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/dpr-ctx_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/dpr-ctx_encoder-single-nq-base/config.json", - "facebook/dpr-question_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/dpr-question_encoder-single-nq-base/config.json", - "facebook/dpr-reader-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/dpr-reader-single-nq-base/config.json", -} - - -class DPRConfig(BertConfig): - r""" - :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a - `DPRModel`. - - This is the configuration class to store the configuration of a `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`. - It is used to instantiate the components of the DPR model. - - Args: - projection_dim (:obj:`int`, optional, defaults to 0): - Dimension of the projection for the context and question encoders. - If it is set to zero (default), then no projection is done. - """ - model_type = "dpr" - - def __init__(self, projection_dim: int = 0, **kwargs): # projection of the encoders, 0 for no projection - super().__init__(**kwargs) - self.projection_dim = projection_dim - -########## -# Outputs -########## - - -@dataclass -class DPRContextEncoderOutput(ModelOutput): - """ - Class for outputs of :class:`~transformers.DPRQuestionEncoder`. - - Args: - pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``): - The DPR encoder outputs the `pooler_output` that corresponds to the context representation. - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer. This output is to be used to embed contexts for - nearest neighbors queries with questions embeddings. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - pooler_output: torch.FloatTensor - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - -@dataclass -class DPRQuestionEncoderOutput(ModelOutput): - """ - Class for outputs of :class:`~transformers.DPRQuestionEncoder`. - - Args: - pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``): - The DPR encoder outputs the `pooler_output` that corresponds to the question representation. - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer. This output is to be used to embed questions for - nearest neighbors queries with context embeddings. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - pooler_output: torch.FloatTensor - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class DPRReaderOutput(ModelOutput): - """ - Class for outputs of :class:`~transformers.DPRQuestionEncoder`. - - Args: - start_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, sequence_length)``): - Logits of the start index of the span for each passage. - end_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, sequence_length)``): - Logits of the end index of the span for each passage. - relevance_logits: (:obj:`torch.FloatTensor`` of shape ``(n_passages, )``): - Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage - to answer the question, compared to all the other passages. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. 
- - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - start_logits: torch.FloatTensor - end_logits: torch.FloatTensor - relevance_logits: torch.FloatTensor - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - -################## -# PreTrainedModel -################## - -class DPREncoder(PreTrainedModel): - - base_model_prefix = "bert_model" - - def __init__(self, config: DPRConfig): - super().__init__(config) - self.bert_model = BertModel(config) - assert self.bert_model.config.hidden_size > 0, "Encoder hidden_size can't be zero" - self.projection_dim = config.projection_dim - if self.projection_dim > 0: - self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim) - self.init_weights() - - def forward( - self, - input_ids: Tensor, - attention_mask: Optional[Tensor] = None, - token_type_ids: Optional[Tensor] = None, - inputs_embeds: Optional[Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_tuple: bool = True, - ) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]: - outputs = self.bert_model( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - sequence_output, pooled_output = outputs[:2] - pooled_output = sequence_output[:, 0, :] - if self.projection_dim > 0: - pooled_output = self.encode_proj(pooled_output) - - if return_tuple: - return (sequence_output, pooled_output) + outputs[2:] - - return BaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - @property - def embeddings_size(self) -> int: - if self.projection_dim > 0: - return self.encode_proj.out_features - return self.bert_model.config.hidden_size - - def init_weights(self): - self.bert_model.init_weights() - if self.projection_dim > 0: - self.encode_proj.apply(self.bert_model._init_weights) - - -class DPRPretrainedContextEncoder(PreTrainedModel): - """ An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. - """ - - config_class = DPRConfig - load_tf_weights = None - base_model_prefix = "ctx_encoder" - - def init_weights(self): - self.ctx_encoder.init_weights() - - -class DPRPretrainedQuestionEncoder(PreTrainedModel): - """ An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. - """ - - config_class = DPRConfig - load_tf_weights = None - base_model_prefix = "question_encoder" - - def init_weights(self): - self.question_encoder.init_weights() - - -DPR_START_DOCSTRING = r""" - - This model is a PyTorch `torch.nn.Module `_ sub-class. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general - usage and behavior. - - Parameters: - config (:class:`~transformers.DPRConfig`): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. 
-""" - - -DPR_ENCODERS_INPUTS_DOCSTRING = r""" - Args: - input_ids: (:obj:``torch.LongTensor`` of shape ``(batch_size, sequence_length)``): - Indices of input sequence tokens in the vocabulary. - To match pre-training, DPR input sequence should be formatted with [CLS] and [SEP] tokens as follows: - - (a) For sequence pairs (for a pair title+text for example): - - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` - - (b) For single sequences (for a question for example): - - ``tokens: [CLS] the dog is hairy . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0`` - - DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on - the right rather than the left. - - Indices can be obtained using :class:`transformers.DPRTokenizer`. - See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - attention_mask: (:obj:``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - token_type_ids: (:obj:``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - corresponds to a `sentence B` token - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail. 
-""" - - -@add_start_docstrings( - "The bare DPRContextEncoder transformer outputting pooler outputs as context representations.", - DPR_START_DOCSTRING, -) -class DPRContextEncoder(DPRPretrainedContextEncoder): - def __init__(self, config: DPRConfig): - super().__init__(config) - self.config = config - self.ctx_encoder = DPREncoder(config) - self.init_weights() - - @add_start_docstrings_to_callable(DPR_ENCODERS_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - token_type_ids: Optional[Tensor] = None, - inputs_embeds: Optional[Tensor] = None, - output_attentions=None, - output_hidden_states=None, - return_tuple=True, - ) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]: - r""" - Return: - - Examples:: - - from transformers import DPRContextEncoder, DPRContextEncoderTokenizer - tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') - model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') - input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] - embeddings = model(input_ids)[0] # the embeddings of the given context. - - """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - device = input_ids.device - elif inputs_embeds is not None: - input_shape = torch.Size(inputs_embeds.size()[:-1]) - device = inputs_embeds.device - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if attention_mask is None: - attention_mask = ( - torch.ones(input_shape, device=device) - if input_ids is None - else (input_ids != self.config.pad_token_id) - ) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - outputs = self.ctx_encoder( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if return_tuple: - return outputs[1:] - return DPRContextEncoderOutput( - pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions - ) - - -@add_start_docstrings( - "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.", - DPR_START_DOCSTRING, -) -class DPRQuestionEncoder(DPRPretrainedQuestionEncoder): - def __init__(self, config: DPRConfig): - super().__init__(config) - self.config = config - self.question_encoder = DPREncoder(config) - self.init_weights() - - @add_start_docstrings_to_callable(DPR_ENCODERS_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - token_type_ids: Optional[Tensor] = None, - inputs_embeds: Optional[Tensor] = None, - output_attentions=None, - 
output_hidden_states=None, - return_tuple=True, - ) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]: - r""" - Return: - - Examples:: - - from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer - tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base') - model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base') - input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] - embeddings = model(input_ids)[0] # the embeddings of the given question. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - device = input_ids.device - elif inputs_embeds is not None: - input_shape = torch.Size(inputs_embeds.size()[:-1]) - device = inputs_embeds.device - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if attention_mask is None: - attention_mask = ( - torch.ones(input_shape, device=device) - if input_ids is None - else (input_ids != self.config.pad_token_id) - ) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - outputs = self.question_encoder( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_tuple=return_tuple, - ) - - if return_tuple: - return outputs[1:] - return DPRQuestionEncoderOutput( - pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions - ) - - -# UTILS -def move_to_device(sample, device): - if len(sample) == 0: - return {} - - def _move_to_device(maybe_tensor, device): - if torch.is_tensor(maybe_tensor): - return maybe_tensor.to(device) - elif isinstance(maybe_tensor, dict): - return { - key: _move_to_device(value, device) - for key, value in maybe_tensor.items() - } - elif isinstance(maybe_tensor, list): - return [_move_to_device(x, device) for x in maybe_tensor] - elif isinstance(maybe_tensor, tuple): - return [_move_to_device(x, device) for x in maybe_tensor] - else: - return maybe_tensor - - return _move_to_device(sample, device) - -def unpack(gzip_file: str, out_file: str): - print('Uncompressing ', gzip_file) - input = gzip.GzipFile(gzip_file, 'rb') - s = input.read() - input.close() - output = open(out_file, 'wb') - output.write(s) - output.close() - print('Saved to ', out_file) \ No newline at end of file
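
The module deleted above mostly vendored the Hugging Face DPR encoder classes (DPRConfig, DPRContextEncoder, DPRQuestionEncoder and their tokenizers) for use by DensePassageRetriever. A rough sketch of the encode-and-score flow those classes provide is shown below; the checkpoint names and the toy input are taken from the usage examples embedded in the docstrings above, while the final dot-product scoring step is an illustrative assumption (it mirrors how DPR ranks passages against a question, but is not implemented in this file).

    # Sketch: embed a question and a context with the DPR encoders from transformers,
    # then score the pair by inner product (higher score = more relevant passage).
    import torch
    from transformers import (
        DPRContextEncoder, DPRContextEncoderTokenizer,
        DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    )

    ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

    # Each encoder returns its pooler output first: one dense vector per input sequence.
    ctx_ids = ctx_tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
    q_ids = q_tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
    ctx_emb = ctx_encoder(ctx_ids)[0]   # context embedding, shape (batch, hidden)
    q_emb = q_encoder(q_ids)[0]         # question embedding, shape (batch, hidden)

    # Assumed scoring step: DPR retrieval ranks passages by question-context similarity.
    scores = torch.matmul(q_emb, ctx_emb.T)

In practice the question embedding is compared against many pre-computed context embeddings at once, so the matmul above generalizes to a (1, hidden) x (hidden, n_passages) product whose row gives the retrieval scores for all candidate passages.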