From 2a4e6a1de2f909d3425a65a5e93c858ab2786146 Mon Sep 17 00:00:00 2001
From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
Date: Thu, 15 Feb 2024 16:05:43 +0100
Subject: [PATCH] refactor: Refactor `SASEvaluator` (#6998)

* Remove preprocessing from SASEvaluator and add warm_up method

* Update docstrings
---
 haystack/components/eval/sas_evaluator.py  | 104 +++++------
 test/components/eval/test_sas_evaluator.py | 196 ++++-----------------
 2 files changed, 85 insertions(+), 215 deletions(-)

diff --git a/haystack/components/eval/sas_evaluator.py b/haystack/components/eval/sas_evaluator.py
index 8b4c30352..2c755922b 100644
--- a/haystack/components/eval/sas_evaluator.py
+++ b/haystack/components/eval/sas_evaluator.py
@@ -7,9 +7,7 @@
 from haystack.lazy_imports import LazyImport
 from haystack.utils import ComponentDevice, expit
 from haystack.utils.auth import Secret, deserialize_secrets_inplace
 
-from .preprocess import _preprocess_text
-
-with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>=2.2.0\"'") as metrics_import:
+with LazyImport(message="Run 'pip install scikit-learn \"sentence-transformers>=2.2.0\"'") as sas_import:
     from sentence_transformers import CrossEncoder, SentenceTransformer, util
     from transformers import AutoConfig
@@ -22,17 +20,11 @@ class SASEvaluator:
     The SAS is computed using a pre-trained model from the Hugging Face model hub. The model can be either a
     Bi-Encoder or a Cross-Encoder. The choice of the model is based on the `model` parameter.
-    The default model is `sentence-transformers/paraphrase-multilingual-mpnet-base-v2`.
     """
 
     def __init__(
         self,
-        labels: List[str],
         model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
-        regexes_to_ignore: Optional[List[str]] = None,
-        ignore_case: bool = False,
-        ignore_punctuation: bool = False,
-        ignore_numbers: bool = False,
         batch_size: int = 32,
         device: Optional[ComponentDevice] = None,
         token: Secret = Secret.from_env_var("HF_API_TOKEN", strict=False),
@@ -40,42 +32,25 @@ class SASEvaluator:
         """
         Creates a new instance of SASEvaluator.
 
-        :param labels: The list of expected answers.
         :param model: SentenceTransformers semantic textual similarity model, should be path or string
             pointing to a downloadable model.
-        :param regexes_to_ignore: A list of regular expressions. If provided, it removes substrings
-            matching these regular expressions from both predictions and labels before comparison. Defaults to None.
-        :param ignore_case: If True, performs case-insensitive comparison. Defaults to False.
-        :param ignore_punctuation: If True, removes punctuation from both predictions and labels before
-            comparison. Defaults to False.
-        :param ignore_numbers: If True, removes numerical digits from both predictions and labels
-            before comparison. Defaults to False.
         :param batch_size: Number of prediction-label pairs to encode at once.
         :param device: The device on which the model is loaded. If `None`, the default device is
            automatically selected.
         :param token: The Hugging Face token for HTTP bearer authorization.
            You can find your HF token at https://huggingface.co/settings/tokens.
         """
-        metrics_import.check()
+        sas_import.check()
 
-        self._labels = labels
         self._model = model
-        self._regexes_to_ignore = regexes_to_ignore
-        self._ignore_case = ignore_case
-        self._ignore_punctuation = ignore_punctuation
-        self._ignore_numbers = ignore_numbers
         self._batch_size = batch_size
         self._device = device
         self._token = token
+        self._similarity_model = None
 
     def to_dict(self) -> Dict[str, Any]:
         return default_to_dict(
             self,
-            labels=self._labels,
-            regexes_to_ignore=self._regexes_to_ignore,
-            ignore_case=self._ignore_case,
-            ignore_punctuation=self._ignore_punctuation,
-            ignore_numbers=self._ignore_numbers,
             model=self._model,
             batch_size=self._batch_size,
             device=self._device.to_dict() if self._device else None,
@@ -89,42 +64,54 @@ class SASEvaluator:
             data["init_parameters"]["device"] = ComponentDevice.from_dict(device)
         return default_from_dict(cls, data)
 
+    def warm_up(self):
+        """
+        Load the model used for evaluation.
+        """
+        token = self._token.resolve_value() if self._token else None
+        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
+        cross_encoder_used = False
+        if config.architectures:
+            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
+        device = ComponentDevice.resolve_device(self._device).to_torch_str()
+        # Based on the Model string we can load either Bi-Encoders or Cross Encoders.
+        # Similarity computation changes for both approaches
+        if cross_encoder_used:
+            self._similarity_model = CrossEncoder(
+                self._model,
+                device=device,
+                tokenizer_args={"use_auth_token": token},
+                automodel_args={"use_auth_token": token},
+            )
+        else:
+            self._similarity_model = SentenceTransformer(self._model, device=device, use_auth_token=token)
+
     @component.output_types(sas=float, scores=List[float])
-    def run(self, predictions: List[str]) -> Dict[str, Any]:
-        if len(predictions) != len(self._labels):
+    def run(self, labels: List[str], predictions: List[str]) -> Dict[str, Any]:
+        """
+        Run the SASEvaluator to compute the Semantic Answer Similarity (SAS) between a list of predictions
+        and a list of labels. Both must be lists of strings of the same length.
+
+        :param labels: List of labels against which the predictions are compared.
+        :param predictions: List of predictions.
+        :returns: A dictionary with the following outputs:
+            * `sas` - Cumulative SAS score for the entire dataset.
+            * `scores` - A list of similarity scores for each prediction-label pair.
+        """
+        if len(labels) != len(predictions):
             raise ValueError("The number of predictions and labels must be the same.")
         if len(predictions) == 0:
             return {"sas": 0.0, "scores": [0.0]}
 
-        token = self._token.resolve_value() if self._token else None
+        if not self._similarity_model:
+            msg = "The model has not been initialized. Call warm_up() before running the evaluator."
+            raise RuntimeError(msg)
 
-        predictions = _preprocess_text(
-            predictions, self._regexes_to_ignore, self._ignore_case, self._ignore_punctuation, self._ignore_numbers
-        )
-        labels = _preprocess_text(
-            self._labels, self._regexes_to_ignore, self._ignore_case, self._ignore_punctuation, self._ignore_numbers
-        )
-        config = AutoConfig.from_pretrained(self._model, use_auth_token=token)
-        cross_encoder_used = False
-        if config.architectures:
-            cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures)
-
-        device = ComponentDevice.resolve_device(self._device)
-
-        # Based on the Model string we can load either Bi-Encoders or Cross Encoders.
- # Similarity computation changes for both approaches - - if cross_encoder_used: + if isinstance(self._similarity_model, CrossEncoder): # For Cross Encoders we create a list of pairs of predictions and labels - similarity_model = CrossEncoder( - self._model, - device=device.to_torch_str(), - tokenizer_args={"use_auth_token": token}, - automodel_args={"use_auth_token": token}, - ) sentence_pairs = [[pred, label] for pred, label in zip(predictions, labels)] - similarity_scores = similarity_model.predict( + similarity_scores = self._similarity_model.predict( sentence_pairs, batch_size=self._batch_size, convert_to_numpy=True ) @@ -138,11 +125,12 @@ class SASEvaluator: else: # For Bi-encoders we create embeddings separately for predictions and labels - similarity_model = SentenceTransformer(self._model, device=device.to_torch_str(), use_auth_token=token) - predictions_embeddings = similarity_model.encode( + predictions_embeddings = self._similarity_model.encode( predictions, batch_size=self._batch_size, convert_to_tensor=True ) - label_embeddings = similarity_model.encode(labels, batch_size=self._batch_size, convert_to_tensor=True) + label_embeddings = self._similarity_model.encode( + labels, batch_size=self._batch_size, convert_to_tensor=True + ) # Compute cosine-similarities scores = util.cos_sim(predictions_embeddings, label_embeddings) diff --git a/test/components/eval/test_sas_evaluator.py b/test/components/eval/test_sas_evaluator.py index 0a7811bfe..bee28046b 100644 --- a/test/components/eval/test_sas_evaluator.py +++ b/test/components/eval/test_sas_evaluator.py @@ -7,14 +7,8 @@ from haystack.utils.device import ComponentDevice class TestSASEvaluator: def test_init_default(self, monkeypatch): monkeypatch.setenv("HF_API_TOKEN", "fake-token") - labels = ["label1", "label2", "label3"] - evaluator = SASEvaluator(labels=labels) + evaluator = SASEvaluator() - assert evaluator._labels == labels - assert evaluator._regexes_to_ignore is None - assert evaluator._ignore_case is False - assert evaluator._ignore_punctuation is False - assert evaluator._ignore_numbers is False assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" assert evaluator._batch_size == 32 assert evaluator._device is None @@ -23,17 +17,11 @@ class TestSASEvaluator: def test_to_dict(self, monkeypatch): monkeypatch.setenv("HF_API_TOKEN", "fake-token") - labels = ["label1", "label2", "label3"] - evaluator = SASEvaluator(labels=labels, device=ComponentDevice.from_str("cuda:0")) + evaluator = SASEvaluator(device=ComponentDevice.from_str("cuda:0")) expected_dict = { "type": "haystack.components.eval.sas_evaluator.SASEvaluator", "init_parameters": { - "labels": labels, - "regexes_to_ignore": None, - "ignore_case": False, - "ignore_punctuation": False, - "ignore_numbers": False, "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "batch_size": 32, "device": {"type": "single", "device": "cuda:0"}, @@ -48,11 +36,6 @@ class TestSASEvaluator: { "type": "haystack.components.eval.sas_evaluator.SASEvaluator", "init_parameters": { - "labels": ["label1", "label2", "label3"], - "regexes_to_ignore": None, - "ignore_case": False, - "ignore_punctuation": False, - "ignore_numbers": False, "model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "batch_size": 32, "device": {"type": "single", "device": "cuda:0"}, @@ -61,54 +44,62 @@ class TestSASEvaluator: } ) - assert evaluator._labels == ["label1", "label2", "label3"] - assert evaluator._regexes_to_ignore is None - assert 
evaluator._ignore_case is False - assert evaluator._ignore_punctuation is False - assert evaluator._ignore_numbers is False assert evaluator._model == "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" assert evaluator._batch_size == 32 assert evaluator._device.to_torch_str() == "cuda:0" assert evaluator._token.resolve_value() == "fake-token" - @pytest.mark.integration def test_run_with_empty_inputs(self): - evaluator = SASEvaluator(labels=[]) - result = evaluator.run(predictions=[]) + evaluator = SASEvaluator() + result = evaluator.run(labels=[], predictions=[]) assert len(result) == 2 assert result["sas"] == 0.0 assert result["scores"] == [0.0] - @pytest.mark.integration def test_run_with_different_lengths(self): + evaluator = SASEvaluator() labels = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", ] - evaluator = SASEvaluator(labels=labels) - predictions = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", ] with pytest.raises(ValueError): - evaluator.run(predictions) + evaluator.run(labels=labels, predictions=predictions) - @pytest.mark.integration - def test_run_with_matching_predictions(self): + def test_run_not_warmed_up(self): + evaluator = SASEvaluator() labels = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", ] - evaluator = SASEvaluator(labels=labels) predictions = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", ] - result = evaluator.run(predictions=predictions) + with pytest.raises(RuntimeError): + evaluator.run(labels=labels, predictions=predictions) + + @pytest.mark.integration + def test_run_with_matching_predictions(self): + evaluator = SASEvaluator() + labels = [ + "A construction budget of US $2.3 billion", + "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", + "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", + ] + predictions = [ + "A construction budget of US $2.3 billion", + "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", + "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", + ] + evaluator.warm_up() + result = evaluator.run(labels=labels, predictions=predictions) assert len(result) == 2 assert result["sas"] == pytest.approx(1.0) @@ -116,177 +107,68 @@ class TestSASEvaluator: @pytest.mark.integration def test_run_with_single_prediction(self): - labels = ["US $2.3 billion"] - evaluator = SASEvaluator(labels=labels) + evaluator = SASEvaluator() - result = evaluator.run(predictions=["A construction budget of US $2.3 billion"]) + labels = ["US $2.3 billion"] + evaluator.warm_up() + result = evaluator.run(labels=labels, predictions=["A construction budget of US $2.3 billion"]) assert len(result) == 2 assert result["sas"] == pytest.approx(0.689089, abs=1e-5) assert result["scores"] == pytest.approx([0.689089], abs=1e-5) @pytest.mark.integration def test_run_with_mismatched_predictions(self): + evaluator = SASEvaluator() labels = [ "US $2.3 billion", "Paris's cultural magnificence is 
symbolized by the Eiffel Tower", "Japan was transformed into a modernized world power after the Meiji Restoration.", ] - evaluator = SASEvaluator(labels=labels) predictions = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", ] - result = evaluator.run(predictions=predictions) + evaluator.warm_up() + result = evaluator.run(labels=labels, predictions=predictions) assert len(result) == 2 assert result["sas"] == pytest.approx(0.8227189) assert result["scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5) - @pytest.mark.integration - def test_run_with_ignore_case(self): - labels = [ - "A construction budget of US $2.3 BILLION", - "The EIFFEL TOWER, completed in 1889, symbolizes Paris's cultural magnificence.", - "The MEIJI RESTORATION in 1868 transformed Japan into a modernized world power.", - ] - evaluator = SASEvaluator(labels=labels, ignore_case=True) - predictions = [ - "A construction budget of US $2.3 billion", - "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", - ] - result = evaluator.run(predictions=predictions) - assert len(result) == 2 - assert result["sas"] == pytest.approx(1.0) - assert result["scores"] == pytest.approx([1.0, 1.0, 1.0]) - - @pytest.mark.integration - def test_run_with_ignore_punctuation(self): - labels = [ - "A construction budget of US $2.3 billion", - "The Eiffel Tower completed in 1889 symbolizes Paris's cultural magnificence", - "The Meiji Restoration in 1868 transformed Japan into a modernized world power", - ] - evaluator = SASEvaluator(labels=labels, ignore_punctuation=True) - predictions = [ - "A construction budget of US $2.3 billion", - "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.", - ] - result = evaluator.run(predictions=predictions) - assert len(result) == 2 - assert result["sas"] == pytest.approx(1.0) - assert result["scores"] == pytest.approx([1.0, 1.0, 1.0]) - - @pytest.mark.integration - def test_run_with_ignore_numbers(self): - labels = [ - "A construction budget of US $10.3 billion", - "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.", - ] - evaluator = SASEvaluator(labels=labels, ignore_numbers=True) - predictions = [ - "A construction budget of US $2.3 billion", - "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.", - ] - result = evaluator.run(predictions=predictions) - assert result["sas"] == pytest.approx(1.0) - assert result["scores"] == pytest.approx([1.0, 1.0, 1.0]) - - @pytest.mark.integration - def test_run_with_regex_to_ignore(self): - labels = [ - "A construction budget of US $10.3 billion", - "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.", - ] - evaluator = SASEvaluator(labels=labels, regexes_to_ignore=[r"\d+"]) - predictions = [ - "A construction budget of US $2.3 billion", - "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, 
in 1868, transformed Japan into a modernized world power.", - ] - result = evaluator.run(predictions=predictions) - assert len(result) == 2 - assert result["sas"] == pytest.approx(1.0) - assert result["scores"] == pytest.approx([1.0, 1.0, 1.0]) - - @pytest.mark.integration - def test_run_with_multiple_regex_to_ignore(self): - labels = [ - "A construction budget of US $10.3 billion", - "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.", - ] - evaluator = SASEvaluator(labels=labels, regexes_to_ignore=[r"\d+", r"[^\w\s]"]) - predictions = [ - "A construction budget of US $2.3 billion", - "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.", - ] - result = evaluator.run(predictions=predictions) - assert len(result) == 2 - assert result["sas"] == pytest.approx(1.0) - assert result["scores"] == pytest.approx([1.0, 1.0, 1.0]) - - @pytest.mark.integration - def test_run_with_multiple_ignore_parameters(self): - labels = [ - "A construction budget of US $10.3 billion", - "The Eiffel Tower, completed in 2005, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1989, transformed Japan into a modernized world power.", - ] - evaluator = SASEvaluator( - labels=labels, - ignore_numbers=True, - ignore_punctuation=True, - ignore_case=True, - regexes_to_ignore=[r"[^\w\s\d]+"], - ) - predictions = [ - "A construction budget of US $2.3 billion", - "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", - "The Meiji Restoration, in 1868, transformed Japan into a modernized world power.", - ] - result = evaluator.run(predictions=predictions) - assert len(result) == 2 - assert result["sas"] == pytest.approx(1.0) - assert result["scores"] == pytest.approx([1.0, 1.0, 1.0]) - @pytest.mark.integration def test_run_with_bi_encoder_model(self): + evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2") labels = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", ] - evaluator = SASEvaluator(labels=labels, model="sentence-transformers/all-mpnet-base-v2") predictions = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", ] - result = evaluator.run(predictions=predictions) + evaluator.warm_up() + result = evaluator.run(labels=labels, predictions=predictions) assert len(result) == 2 assert result["sas"] == pytest.approx(1.0) assert result["scores"] == pytest.approx([1.0, 1.0, 1.0]) @pytest.mark.integration def test_run_with_cross_encoder_model(self): + evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2") labels = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a modernized world power.", ] - evaluator = SASEvaluator(labels=labels, model="cross-encoder/ms-marco-MiniLM-L-6-v2") predictions = [ "A construction budget of US $2.3 billion", "The Eiffel Tower, completed in 1889, symbolizes Paris's cultural magnificence.", "The Meiji Restoration in 1868 transformed Japan into a 
modernized world power.", ] - result = evaluator.run(predictions=predictions) + evaluator.warm_up() + result = evaluator.run(labels=labels, predictions=predictions) assert len(result) == 2 assert result["sas"] == pytest.approx(0.999967, abs=1e-5) assert result["scores"] == pytest.approx([0.9999765157699585, 0.999968409538269, 0.9999572038650513], abs=1e-5)
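
A short note for reviewers: below is a minimal usage sketch of the refactored API introduced by this patch (labels move from `__init__` to `run()`, and the model is now loaded lazily via the new `warm_up()` method). It assumes `scikit-learn` and `sentence-transformers>=2.2.0` are installed and that the default model can be downloaded from the Hugging Face Hub; the snippet is illustrative only and is not part of the diff.

```python
from haystack.components.eval.sas_evaluator import SASEvaluator

# Construct without labels; the default model is the Bi-Encoder
# sentence-transformers/paraphrase-multilingual-mpnet-base-v2.
# A Cross-Encoder such as cross-encoder/ms-marco-MiniLM-L-6-v2 can be
# passed via the `model` parameter instead.
evaluator = SASEvaluator()

# warm_up() loads the Bi-Encoder or Cross-Encoder once; calling run()
# before warm_up() now raises a RuntimeError.
evaluator.warm_up()

result = evaluator.run(
    labels=["A construction budget of US $2.3 billion"],
    predictions=["A construction budget of US $2.3 billion"],
)
print(result["sas"])     # cumulative SAS score for the whole dataset
print(result["scores"])  # similarity score for each prediction-label pair
```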