refactor: adopt token instead of use_auth_token in HF components (#6040)

* move embedding backends

* use token in Sentence Transformers embeddings

* more compact token handling

* token parameter in reader

* add token to ranker

* release note

* add test for reader
Authored by Stefano Fiorucci on 2023-10-17 16:32:13 +02:00, committed by GitHub
parent 4e4af99a5e
commit 21d894d85a
9 changed files with 161 additions and 46 deletions


@@ -17,7 +17,7 @@ class SentenceTransformersDocumentEmbedder:
self,
model_name_or_path: str = "sentence-transformers/all-mpnet-base-v2",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Union[bool, str, None] = None,
prefix: str = "",
suffix: str = "",
batch_size: int = 32,
@@ -33,7 +33,7 @@ class SentenceTransformersDocumentEmbedder:
such as ``'sentence-transformers/all-mpnet-base-v2'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each Document text before embedding.
@@ -48,7 +48,7 @@ class SentenceTransformersDocumentEmbedder:
self.model_name_or_path = model_name_or_path
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.prefix = prefix
self.suffix = suffix
self.batch_size = batch_size
@@ -71,7 +71,7 @@ class SentenceTransformersDocumentEmbedder:
self,
model_name_or_path=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
prefix=self.prefix,
suffix=self.suffix,
batch_size=self.batch_size,
@@ -94,7 +94,7 @@ class SentenceTransformersDocumentEmbedder:
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.token
)
@component.output_types(documents=List[Document])

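For context, a minimal usage sketch of the renamed parameter (the import path and the warm_up() method name are assumptions based on the preview API; the model id is a hypothetical private repo):

    from haystack.preview.components.embedders.sentence_transformers_document_embedder import (
        SentenceTransformersDocumentEmbedder,
    )

    # token=True reuses the token saved by `transformers-cli login` (stored in ~/.huggingface);
    # a string value is treated as an explicit API token instead.
    embedder = SentenceTransformersDocumentEmbedder(
        model_name_or_path="my-org/private-model",  # hypothetical private model
        token=True,
    )
    embedder.warm_up()  # instantiates the backend, forwarding the token to Sentence Transformers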

@@ -16,7 +16,7 @@ class SentenceTransformersTextEmbedder:
self,
model_name_or_path: str = "sentence-transformers/all-mpnet-base-v2",
device: Optional[str] = None,
use_auth_token: Union[bool, str, None] = None,
token: Union[bool, str, None] = None,
prefix: str = "",
suffix: str = "",
batch_size: int = 32,
@@ -30,7 +30,7 @@ class SentenceTransformersTextEmbedder:
such as ``'sentence-transformers/all-mpnet-base-v2'``.
:param device: Device (like 'cuda' / 'cpu') that should be used for computation.
Defaults to CPU.
:param use_auth_token: The API token used to download private models from Hugging Face.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param prefix: A string to add to the beginning of each text.
@@ -43,7 +43,7 @@ class SentenceTransformersTextEmbedder:
self.model_name_or_path = model_name_or_path
# TODO: remove device parameter and use Haystack's device management once migrated
self.device = device or "cpu"
self.use_auth_token = use_auth_token
self.token = token
self.prefix = prefix
self.suffix = suffix
self.batch_size = batch_size
@@ -64,7 +64,7 @@ class SentenceTransformersTextEmbedder:
self,
model_name_or_path=self.model_name_or_path,
device=self.device,
use_auth_token=self.use_auth_token,
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
prefix=self.prefix,
suffix=self.suffix,
batch_size=self.batch_size,
@@ -85,7 +85,7 @@ class SentenceTransformersTextEmbedder:
"""
if not hasattr(self, "embedding_backend"):
self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.use_auth_token
model_name_or_path=self.model_name_or_path, device=self.device, use_auth_token=self.token
)
@component.output_types(embedding=List[float])

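As a quick usage note: per the `@component.output_types(embedding=List[float])` decorator above, the text embedder emits one vector per input string. A sketch, assuming the import path and the `text` run parameter (both mirror the document embedder):

    from haystack.preview.components.embedders.sentence_transformers_text_embedder import (
        SentenceTransformersTextEmbedder,
    )

    embedder = SentenceTransformersTextEmbedder(token=True)  # reuse the locally stored HF token
    embedder.warm_up()
    result = embedder.run(text="What is the capital of France?")
    assert len(result["embedding"]) == 768  # all-mpnet-base-v2 returns 768-dimensional vectors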

@@ -36,15 +36,19 @@ class SimilarityRanker:
def __init__(
self,
model_name_or_path: Union[str, Path] = "cross-encoder/ms-marco-MiniLM-L-6-v2",
top_k: int = 10,
device: str = "cpu",
token: Union[bool, str, None] = None,
top_k: int = 10,
):
"""
Creates an instance of SimilarityRanker.
:param model_name_or_path: Path to a pre-trained sentence-transformers model.
:param top_k: The maximum number of documents to return per query.
:param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param top_k: The maximum number of documents to return per query.
"""
torch_and_transformers_import.check()
@@ -53,6 +57,7 @@ class SimilarityRanker:
raise ValueError(f"top_k must be > 0, but got {top_k}")
self.top_k = top_k
self.device = device
self.token = token
self.model = None
self.tokenizer = None
@@ -67,16 +72,22 @@ class SimilarityRanker:
Warm up the model and tokenizer used in scoring the documents.
"""
if self.model_name_or_path and not self.model:
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path, token=self.token)
self.model = self.model.to(self.device)
self.model.eval()
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, token=self.token)
def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""
return default_to_dict(self, top_k=self.top_k, device=self.device, model_name_or_path=self.model_name_or_path)
return default_to_dict(
self,
device=self.device,
model_name_or_path=self.model_name_or_path,
token=self.token if not isinstance(self.token, str) else None, # don't serialize valid tokens
top_k=self.top_k,
)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "SimilarityRanker":

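The ranker gains the same parameter; a short sketch using the import path that appears in the ranker tests below (the token string is a placeholder, not a real secret):

    from haystack.preview.components.rankers.similarity import SimilarityRanker

    ranker = SimilarityRanker(token="hf_xxx")  # placeholder secret
    ranker.warm_up()  # forwards the token to AutoModelForSequenceClassification and AutoTokenizer
    assert ranker.to_dict()["init_parameters"]["token"] is None  # string tokens are masked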

@@ -25,6 +25,7 @@ class ExtractiveReader:
self,
model_name_or_path: Union[Path, str] = "deepset/roberta-base-squad2-distilled",
device: Optional[str] = None,
token: Union[bool, str, None] = None,
top_k: int = 20,
confidence_threshold: Optional[float] = None,
max_seq_length: int = 384,
@@ -40,6 +41,9 @@ class ExtractiveReader:
Can either be a path to a folder containing the model files or an identifier for the HF hub
Default: `'deepset/roberta-base-squad2-distilled'`
:param device: Pytorch device string. Uses GPU by default if available
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param top_k: Number of answers to return per query.
It is required even if confidence_threshold is set. Defaults to 20.
:param confidence_threshold: Answers with a confidence score below this value will not be returned
@@ -58,6 +62,7 @@ class ExtractiveReader:
self.model_name_or_path = str(model_name_or_path)
self.model = None
self.device = device
self.token = token
self.max_seq_length = max_seq_length
self.top_k = top_k
self.confidence_threshold = confidence_threshold
@@ -81,6 +86,7 @@ class ExtractiveReader:
self,
model_name_or_path=self.model_name_or_path,
device=self.device,
token=self.token if not isinstance(self.token, str) else None,
max_seq_length=self.max_seq_length,
top_k=self.top_k,
confidence_threshold=self.confidence_threshold,
@@ -104,8 +110,10 @@ class ExtractiveReader:
self.device = self.device or "cuda:0"
else:
self.device = self.device or "cpu:0"
self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name_or_path).to(self.device)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_name_or_path, token=self.token).to(
self.device
)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, token=self.token)
def _flatten_documents(
self, queries: List[str], documents: List[List[Document]]


@@ -0,0 +1,8 @@
---
preview:
  - |
    Adopt Hugging Face `token` instead of the deprecated `use_auth_token`.
    Add this parameter to `ExtractiveReader` and `SimilarityRanker` to allow
    loading private models.
    Proper handling of `token` during serialization: if it is a string (a possible valid token),
    it is not serialized.
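A practical consequence of this rule, sketched with the reader (the module path is taken from the test patches below; "my-model" is a placeholder):

    from haystack.preview.components.readers.extractive import ExtractiveReader

    reader = ExtractiveReader("my-model", token="secret-token")
    restored = ExtractiveReader.from_dict(reader.to_dict())
    # the secret never reaches the serialized dict, so callers must supply it again after loading
    assert restored.token is None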


@@ -14,7 +14,7 @@ class TestSentenceTransformersDocumentEmbedder:
embedder = SentenceTransformersDocumentEmbedder(model_name_or_path="model")
assert embedder.model_name_or_path == "model"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token is None
assert embedder.prefix == ""
assert embedder.suffix == ""
assert embedder.batch_size == 32
@@ -28,7 +28,7 @@ class TestSentenceTransformersDocumentEmbedder:
embedder = SentenceTransformersDocumentEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token=True,
token=True,
prefix="prefix",
suffix="suffix",
batch_size=64,
@@ -39,7 +39,7 @@ class TestSentenceTransformersDocumentEmbedder:
)
assert embedder.model_name_or_path == "model"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token is True
assert embedder.prefix == "prefix"
assert embedder.suffix == "suffix"
assert embedder.batch_size == 64
@@ -57,7 +57,7 @@ class TestSentenceTransformersDocumentEmbedder:
"init_parameters": {
"model_name_or_path": "model",
"device": "cpu",
"use_auth_token": None,
"token": None,
"prefix": "",
"suffix": "",
"batch_size": 32,
@@ -73,7 +73,7 @@ class TestSentenceTransformersDocumentEmbedder:
component = SentenceTransformersDocumentEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token="the-token",
token="the-token",
prefix="prefix",
suffix="suffix",
batch_size=64,
@@ -83,12 +83,13 @@ class TestSentenceTransformersDocumentEmbedder:
embedding_separator=" - ",
)
data = component.to_dict()
assert data == {
"type": "SentenceTransformersDocumentEmbedder",
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": "the-token",
"token": None, # the token is not serialized
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
@@ -106,7 +107,7 @@ class TestSentenceTransformersDocumentEmbedder:
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": "the-token",
"token": None,
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
@@ -119,7 +120,7 @@ class TestSentenceTransformersDocumentEmbedder:
component = SentenceTransformersDocumentEmbedder.from_dict(data)
assert component.model_name_or_path == "model"
assert component.device == "cuda"
assert component.use_auth_token == "the-token"
assert component.token is None
assert component.prefix == "prefix"
assert component.suffix == "suffix"
assert component.batch_size == 64


@@ -12,7 +12,7 @@ class TestSentenceTransformersTextEmbedder:
embedder = SentenceTransformersTextEmbedder(model_name_or_path="model")
assert embedder.model_name_or_path == "model"
assert embedder.device == "cpu"
assert embedder.use_auth_token is None
assert embedder.token is None
assert embedder.prefix == ""
assert embedder.suffix == ""
assert embedder.batch_size == 32
@@ -24,7 +24,7 @@ class TestSentenceTransformersTextEmbedder:
embedder = SentenceTransformersTextEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token=True,
token=True,
prefix="prefix",
suffix="suffix",
batch_size=64,
@@ -33,7 +33,7 @@ class TestSentenceTransformersTextEmbedder:
)
assert embedder.model_name_or_path == "model"
assert embedder.device == "cuda"
assert embedder.use_auth_token is True
assert embedder.token is True
assert embedder.prefix == "prefix"
assert embedder.suffix == "suffix"
assert embedder.batch_size == 64
@@ -49,7 +49,7 @@ class TestSentenceTransformersTextEmbedder:
"init_parameters": {
"model_name_or_path": "model",
"device": "cpu",
"use_auth_token": None,
"token": None,
"prefix": "",
"suffix": "",
"batch_size": 32,
@@ -63,7 +63,7 @@ class TestSentenceTransformersTextEmbedder:
component = SentenceTransformersTextEmbedder(
model_name_or_path="model",
device="cuda",
use_auth_token=True,
token=True,
prefix="prefix",
suffix="suffix",
batch_size=64,
@@ -76,7 +76,7 @@ class TestSentenceTransformersTextEmbedder:
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": True,
"token": True,
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
@@ -85,6 +85,24 @@ class TestSentenceTransformersTextEmbedder:
},
}
@pytest.mark.unit
def test_to_dict_not_serialize_token(self):
component = SentenceTransformersTextEmbedder(model_name_or_path="model", token="awesome-token")
data = component.to_dict()
assert data == {
"type": "SentenceTransformersTextEmbedder",
"init_parameters": {
"model_name_or_path": "model",
"device": "cpu",
"token": None,
"prefix": "",
"suffix": "",
"batch_size": 32,
"progress_bar": True,
"normalize_embeddings": False,
},
}
@pytest.mark.unit
def test_from_dict(self):
data = {
@@ -92,7 +110,7 @@ class TestSentenceTransformersTextEmbedder:
"init_parameters": {
"model_name_or_path": "model",
"device": "cuda",
"use_auth_token": True,
"token": True,
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
@@ -103,7 +121,7 @@ class TestSentenceTransformersTextEmbedder:
component = SentenceTransformersTextEmbedder.from_dict(data)
assert component.model_name_or_path == "model"
assert component.device == "cuda"
assert component.use_auth_token is True
assert component.token is True
assert component.prefix == "prefix"
assert component.suffix == "suffix"
assert component.batch_size == 64


@@ -7,19 +7,6 @@ from haystack.preview.components.rankers.similarity import SimilarityRanker
class TestSimilarityRanker:
@pytest.mark.unit
def test_to_dict(self):
component = SimilarityRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2")
data = component.to_dict()
assert data == {
"type": "SimilarityRanker",
"init_parameters": {
"device": "cpu",
"top_k": 10,
"model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2",
},
}
@pytest.mark.unit
def test_to_dict_with_custom_init_parameters(self):
component = SimilarityRanker()
data = component.to_dict()
assert data == {
@@ -28,6 +15,21 @@ class TestSimilarityRanker:
"device": "cpu",
"top_k": 10,
"model_name_or_path": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"token": None,
},
}
@pytest.mark.unit
def test_to_dict_with_custom_init_parameters(self):
component = SimilarityRanker(model_name_or_path="my_model", device="cuda", token="my_token", top_k=5)
data = component.to_dict()
assert data == {
"type": "SimilarityRanker",
"init_parameters": {
"device": "cuda",
"model_name_or_path": "my_model",
"token": None, # we don't serialize valid tokens,
"top_k": 5,
},
}


@@ -87,6 +87,62 @@ example_documents = [
] * 2
@pytest.mark.unit
def test_to_dict():
component = ExtractiveReader("my-model", token="secret-token")
data = component.to_dict()
assert data == {
"type": "ExtractiveReader",
"init_parameters": {
"model_name_or_path": "my-model",
"device": None,
"token": None, # don't serialize valid tokens
"top_k": 20,
"confidence_threshold": None,
"max_seq_length": 384,
"stride": 128,
"max_batch_size": None,
"answers_per_seq": None,
"no_answer": True,
"calibration_factor": 0.1,
},
}
@pytest.mark.unit
def test_from_dict():
data = {
"type": "ExtractiveReader",
"init_parameters": {
"model_name_or_path": "my-model",
"device": "cpu",
"token": None,
"top_k": 30,
"confidence_threshold": 0.5,
"max_seq_length": 300,
"stride": 100,
"max_batch_size": 20,
"answers_per_seq": 5,
"no_answer": False,
"calibration_factor": 0.5,
},
}
component = ExtractiveReader.from_dict(data)
assert component.model_name_or_path == "my-model"
assert component.device == "cpu"
assert component.token is None
assert component.top_k == 30
assert component.confidence_threshold == 0.5
assert component.max_seq_length == 300
assert component.stride == 100
assert component.max_batch_size == 20
assert component.answers_per_seq == 5
assert component.no_answer is False
assert component.calibration_factor == 0.5
@pytest.mark.unit
def test_output(mock_reader: ExtractiveReader):
answers = mock_reader.run(example_queries[0], example_documents[0], top_k=3)[
@@ -209,6 +265,17 @@ def test_nest_answers(mock_reader: ExtractiveReader):
assert no_answer.probability == pytest.approx(expected_no_answer)
@pytest.mark.unit
@patch("haystack.preview.components.readers.extractive.AutoTokenizer.from_pretrained")
@patch("haystack.preview.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
def test_warm_up_use_hf_token(mocked_automodel, mocked_autotokenizer):
reader = ExtractiveReader("deepset/roberta-base-squad2", token="fake-token")
reader.warm_up()
mocked_automodel.assert_called_once_with("deepset/roberta-base-squad2", token="fake-token")
mocked_autotokenizer.assert_called_once_with("deepset/roberta-base-squad2", token="fake-token")
@pytest.mark.integration
def test_t5():
reader = ExtractiveReader("TARUNBHATT/flan-t5-small-finetuned-squad")