mirror of https://github.com/deepset-ai/haystack.git (synced 2026-01-08 04:56:45 +00:00)
remove dpr_utils.py
This commit is contained in: parent 06243dbda4, commit 4e46d9d176
@ -1,726 +0,0 @@
#!/usr/bin/env python3
# Utility functions and classes required for DensePassageRetriever
#
# Building upon the code (https://github.com/facebookresearch/DPR) published by Facebook Research under a Creative Commons License (https://github.com/facebookresearch/DPR/blob/master/LICENSE)
# It is based on the following research work:
# Karpukhin, Vladimir, et al. "Dense Passage Retrieval for Open-Domain Question Answering." arXiv preprint arXiv:2004.04906 (2020).
# (https://arxiv.org/abs/2004.04906)

import gzip
import logging
import re
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
from torch import nn, Tensor

from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
from transformers.modeling_bert import BertModel, BertConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_bert import BertTokenizer, BertTokenizerFast


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

_CONFIG_FOR_DOC = "DPRConfig"

DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-ctx_encoder-single-nq-base",
]
DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-question_encoder-single-nq-base",
]
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dpr-reader-single-nq-base",
]
# CLASSES
############
# file_utils
############


class ModelOutput:
    """
    Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
    a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes.
    """

    def to_tuple(self):
        """
        Converts :obj:`self` to a tuple.

        Return: A tuple containing all non-:obj:`None` attributes of the :obj:`self`.
        """
        return tuple(getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None)

    def to_dict(self):
        """
        Converts :obj:`self` to a Python dictionary.

        Return: A dictionary containing all non-:obj:`None` attributes of the :obj:`self`.
        """
        return {f: getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None}

    def __getitem__(self, i):
        return self.to_dict()[i] if isinstance(i, str) else self.to_tuple()[i]

    def __len__(self):
        return len(self.to_tuple())

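
# Illustrative sketch (not part of the original file): how a ``ModelOutput`` subclass
# behaves once decorated with ``@dataclass``, as the output classes further below are.
# The example class and function names are hypothetical.
def _example_model_output_usage():
    @dataclass
    class _ExampleOutput(ModelOutput):
        pooler_output: torch.FloatTensor
        hidden_states: Optional[Tuple[torch.FloatTensor]] = None

    out = _ExampleOutput(pooler_output=torch.zeros(2, 8))
    assert out["pooler_output"] is out[0]  # string and integer indexing return the same tensor
    assert len(out) == 1  # the None ``hidden_states`` field is skipped by to_tuple()/to_dict()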

RETURN_INTRODUCTION = r"""
    Returns:
        :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)` (if ``return_tuple=True`` is passed or when ``config.return_tuple=True``) comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs:
"""


def _prepare_output_docstrings(output_type, config_class):
    """
    Prepares the return part of the docstring using `output_type`.
    """
    docstrings = output_type.__doc__

    # Remove the head of the docstring to keep the list of args only
    lines = docstrings.split("\n")
    i = 0
    while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None:
        i += 1
    if i < len(lines):
        docstrings = "\n".join(lines[(i + 1) :])

    # Add the return introduction
    full_output_type = f"{output_type.__module__}.{output_type.__name__}"
    intro = RETURN_INTRODUCTION.format(full_output_type=full_output_type, config_class=config_class)
    return intro + docstrings


def replace_return_docstrings(output_type=None, config_class=None):
    def docstring_decorator(fn):
        docstrings = fn.__doc__
        lines = docstrings.split("\n")
        i = 0
        while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None:
            i += 1
        if i < len(lines):
            lines[i] = _prepare_output_docstrings(output_type, config_class)
            docstrings = "\n".join(lines)
        else:
            raise ValueError(
                f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, current docstring is:\n{docstrings}"
            )
        fn.__doc__ = docstrings
        return fn

    return docstring_decorator

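
# Illustrative sketch (not part of the original file): ``replace_return_docstrings``
# rewrites an empty ``Return:`` placeholder in a method's docstring with the documented
# fields of the given output class, which is how the ``forward`` methods below are
# decorated. ``_DummyOutput`` and ``dummy_forward`` are hypothetical names.
def _example_replace_return_docstrings():
    @dataclass
    class _DummyOutput(ModelOutput):
        """
        Args:
            pooler_output (:obj:`torch.FloatTensor`):
                A dummy output field.
        """

        pooler_output: torch.FloatTensor

    @replace_return_docstrings(output_type=_DummyOutput, config_class=_CONFIG_FOR_DOC)
    def dummy_forward():
        """
        A dummy forward pass.

        Return:
        """

    # The placeholder line has been replaced by the generated "Returns:" section.
    assert "Returns:" in dummy_forward.__doc__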

###########
# modeling_outputs
###########
@dataclass
class BaseModelOutputWithPooling(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during pretraining.

            This output is usually *not* a good summary of the semantic content of the input;
            you're often better off averaging or pooling the sequence of hidden-states for the
            whole input sequence.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor
    pooler_output: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


###########
# tokenization_dpr
###########

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-ctx_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
    }
}
QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-question_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
    }
}
READER_PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/dpr-reader-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
    }
}

CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-ctx_encoder-single-nq-base": 512,
}
QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-question_encoder-single-nq-base": 512,
}
READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/dpr-reader-single-nq-base": 512,
}

CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-ctx_encoder-single-nq-base": {"do_lower_case": True},
}
QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-question_encoder-single-nq-base": {"do_lower_case": True},
}
READER_PRETRAINED_INIT_CONFIGURATION = {
    "facebook/dpr-reader-single-nq-base": {"do_lower_case": True},
}


class DPRContextEncoderTokenizer(BertTokenizer):
    r"""
    Constructs a DPRContextEncoderTokenizer.

    :class:`~transformers.DPRContextEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
    tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
    parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION


class DPRContextEncoderTokenizerFast(BertTokenizerFast):
    r"""
    Constructs a "Fast" DPRContextEncoderTokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
    tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
    parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION


class DPRQuestionEncoderTokenizer(BertTokenizer):
    r"""
    Constructs a DPRQuestionEncoderTokenizer.

    :class:`~transformers.DPRQuestionEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
    tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
    parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION


class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
    r"""
    Constructs a "Fast" DPRQuestionEncoderTokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
    tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
    parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION

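
# Illustrative sketch (not part of the original file): the tokenizer classes above are
# plain BERT tokenizers registered under DPR names, so they are used exactly like
# BertTokenizer. Loading from the model hub requires network access; this is only a
# usage sketch and the function name is hypothetical.
def _example_tokenizer_usage():
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
    batch = ctx_tokenizer("Jacksonville is a city in Florida.", return_tensors="pt")
    return batch["input_ids"], batch["attention_mask"]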

##########
# configuration_dpr
##########

DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/dpr-ctx_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/dpr-ctx_encoder-single-nq-base/config.json",
    "facebook/dpr-question_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/dpr-question_encoder-single-nq-base/config.json",
    "facebook/dpr-reader-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/dpr-reader-single-nq-base/config.json",
}


class DPRConfig(BertConfig):
    r"""
    :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a
    `DPRContextEncoder`, `DPRQuestionEncoder`, or a `DPRReader`.
    It is used to instantiate the components of the DPR model.

    Args:
        projection_dim (:obj:`int`, optional, defaults to 0):
            Dimension of the projection for the context and question encoders.
            If it is set to zero (default), then no projection is done.
    """
    model_type = "dpr"

    def __init__(self, projection_dim: int = 0, **kwargs):  # projection of the encoders, 0 for no projection
        super().__init__(**kwargs)
        self.projection_dim = projection_dim

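
# Illustrative sketch (not part of the original file): DPRConfig extends BertConfig, so
# any BERT configuration option can be passed alongside ``projection_dim``. The values
# below are arbitrary and the function name is hypothetical.
def _example_dpr_config_usage():
    config = DPRConfig(projection_dim=128, hidden_size=768, num_hidden_layers=12)
    assert config.projection_dim == 128  # extra projection on top of the 768-dim BERT output
    return config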

##########
# Outputs
##########


@dataclass
class DPRContextEncoderOutput(ModelOutput):
    """
    Class for outputs of :class:`~transformers.DPRContextEncoder`.

    Args:
        pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``):
            The DPR encoder outputs the `pooler_output` that corresponds to the context representation.
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer. This output is to be used to embed contexts for
            nearest-neighbor queries with question embeddings.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    pooler_output: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class DPRQuestionEncoderOutput(ModelOutput):
    """
    Class for outputs of :class:`~transformers.DPRQuestionEncoder`.

    Args:
        pooler_output: (:obj:``torch.FloatTensor`` of shape ``(batch_size, embeddings_size)``):
            The DPR encoder outputs the `pooler_output` that corresponds to the question representation.
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer. This output is to be used to embed questions for
            nearest-neighbor queries with context embeddings.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    pooler_output: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class DPRReaderOutput(ModelOutput):
    """
    Class for outputs of :class:`~transformers.DPRReader`.

    Args:
        start_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, sequence_length)``):
            Logits of the start index of the span for each passage.
        end_logits: (:obj:``torch.FloatTensor`` of shape ``(n_passages, sequence_length)``):
            Logits of the end index of the span for each passage.
        relevance_logits: (:obj:`torch.FloatTensor`` of shape ``(n_passages, )``):
            Outputs of the QA classifier of the DPRReader that correspond to the scores of each passage
            to answer the question, compared to all the other passages.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    start_logits: torch.FloatTensor
    end_logits: torch.FloatTensor
    relevance_logits: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


##################
# PreTrainedModel
##################

class DPREncoder(PreTrainedModel):

    base_model_prefix = "bert_model"

    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.bert_model = BertModel(config)
        assert self.bert_model.config.hidden_size > 0, "Encoder hidden_size can't be zero"
        self.projection_dim = config.projection_dim
        if self.projection_dim > 0:
            self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim)
        self.init_weights()

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_tuple: bool = True,
    ) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]:
        outputs = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output, pooled_output = outputs[:2]
        pooled_output = sequence_output[:, 0, :]
        if self.projection_dim > 0:
            pooled_output = self.encode_proj(pooled_output)

        if return_tuple:
            return (sequence_output, pooled_output) + outputs[2:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @property
    def embeddings_size(self) -> int:
        if self.projection_dim > 0:
            return self.encode_proj.out_features
        return self.bert_model.config.hidden_size

    def init_weights(self):
        self.bert_model.init_weights()
        if self.projection_dim > 0:
            self.encode_proj.apply(self.bert_model._init_weights)

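
# Illustrative sketch (not part of the original file): DPREncoder wraps a randomly
# initialized BertModel and, when ``projection_dim`` is set, projects the [CLS] hidden
# state down to that size, which is what ``embeddings_size`` reports. The tiny
# configuration values and the function name are arbitrary, chosen only to keep the
# example fast.
def _example_dpr_encoder_usage():
    config = DPRConfig(
        projection_dim=32, hidden_size=64, num_hidden_layers=2, num_attention_heads=2, intermediate_size=128
    )
    encoder = DPREncoder(config)
    assert encoder.embeddings_size == 32  # projected size, not the 64-dim hidden size
    input_ids = torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])  # a toy, already-tokenized sequence
    sequence_output, pooled_output = encoder(input_ids)[:2]
    return pooled_output  # shape: (1, 32)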

class DPRPretrainedContextEncoder(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "ctx_encoder"

    def init_weights(self):
        self.ctx_encoder.init_weights()


class DPRPretrainedQuestionEncoder(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = DPRConfig
    load_tf_weights = None
    base_model_prefix = "question_encoder"

    def init_weights(self):
        self.question_encoder.init_weights()


DPR_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.DPRConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""


DPR_ENCODERS_INPUTS_DOCSTRING = r"""
    Args:
        input_ids: (:obj:``torch.LongTensor`` of shape ``(batch_size, sequence_length)``):
            Indices of input sequence tokens in the vocabulary.
            To match pre-training, DPR input sequences should be formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs (for a pair title+text for example):

                ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``

                ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``

            (b) For single sequences (for a question for example):

                ``tokens: [CLS] the dog is hairy . [SEP]``

                ``token_type_ids: 0 0 0 0 0 0 0``

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.

            Indices can be obtained using :class:`transformers.DPRTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        attention_mask: (:obj:``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        token_type_ids: (:obj:``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
"""


@add_start_docstrings(
    "The bare DPRContextEncoder transformer outputting pooler outputs as context representations.",
    DPR_START_DOCSTRING,
)
class DPRContextEncoder(DPRPretrainedContextEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.ctx_encoder = DPREncoder(config)
        self.init_weights()

    @add_start_docstrings_to_callable(DPR_ENCODERS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions=None,
        output_hidden_states=None,
        return_tuple=True,
    ) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
        r"""
        Return:

        Examples::

            from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
            tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
            model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
            input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
            embeddings = model(input_ids)[0]  # the embeddings of the given context.

        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = torch.Size(inputs_embeds.size()[:-1])
            device = inputs_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        outputs = self.ctx_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        if return_tuple:
            return outputs[1:]
        return DPRContextEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


@add_start_docstrings(
    "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.",
    DPR_START_DOCSTRING,
)
class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.question_encoder = DPREncoder(config)
        self.init_weights()

    @add_start_docstrings_to_callable(DPR_ENCODERS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        token_type_ids: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions=None,
        output_hidden_states=None,
        return_tuple=True,
    ) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
        r"""
        Return:

        Examples::

            from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
            tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
            model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
            input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
            embeddings = model(input_ids)[0]  # the embeddings of the given question.

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = torch.Size(inputs_embeds.size()[:-1])
            device = inputs_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if attention_mask is None:
            attention_mask = (
                torch.ones(input_shape, device=device)
                if input_ids is None
                else (input_ids != self.config.pad_token_id)
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        outputs = self.question_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_tuple=return_tuple,
        )

        if return_tuple:
            return outputs[1:]
        return DPRQuestionEncoderOutput(
            pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


# UTILS
def move_to_device(sample, device):
    if len(sample) == 0:
        return {}

    def _move_to_device(maybe_tensor, device):
        if torch.is_tensor(maybe_tensor):
            return maybe_tensor.to(device)
        elif isinstance(maybe_tensor, dict):
            return {
                key: _move_to_device(value, device)
                for key, value in maybe_tensor.items()
            }
        elif isinstance(maybe_tensor, list):
            return [_move_to_device(x, device) for x in maybe_tensor]
        elif isinstance(maybe_tensor, tuple):
            # preserve the container type instead of silently converting tuples to lists
            return tuple(_move_to_device(x, device) for x in maybe_tensor)
        else:
            return maybe_tensor

    return _move_to_device(sample, device)

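
# Illustrative sketch (not part of the original file): move_to_device walks nested
# dicts, lists, and tuples and moves every tensor it finds, leaving other values
# untouched. The batch layout and function name below are arbitrary and only
# demonstrate the recursion.
def _example_move_to_device_usage():
    batch = {
        "input_ids": torch.ones(2, 6, dtype=torch.long),
        "meta": {"ids": [torch.tensor([1]), torch.tensor([2])], "source": "toy"},
    }
    moved = move_to_device(batch, torch.device("cpu"))
    assert moved["meta"]["source"] == "toy"  # non-tensor leaves are returned unchanged
    return moved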
def unpack(gzip_file: str, out_file: str):
    print('Uncompressing ', gzip_file)
    with gzip.GzipFile(gzip_file, 'rb') as gz_in:
        data = gz_in.read()
    with open(out_file, 'wb') as f_out:
        f_out.write(data)
    print('Saved to ', out_file)
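
# Illustrative sketch (not part of the original file): unpack simply inflates a .gz
# archive to a plain file on disk. The file paths and function name are hypothetical.
def _example_unpack_usage():
    unpack("downloads/psgs_w100.tsv.gz", "downloads/psgs_w100.tsv")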