From 44e2b1beed5d83f95facddb09a4e8caa2bd349a6 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Thu, 11 Aug 2022 16:45:03 +0200 Subject: [PATCH] Resolving issue 2853: no answer logic in FARMReader (#2856) * Update FARMReader.eval_on_file to be consistent with FARMReader.eval * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/reader.md | 15 ++++++++------- haystack/modeling/evaluation/eval.py | 12 +++++++----- haystack/nodes/reader/farm.py | 25 ++++++++++++++++--------- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index 334e26e30..7627affa3 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -88,12 +88,14 @@ want to debug the Language Model, you might need to disable multiprocessing! Can be helpful to disable in production deployments to keep the logs clean. - `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered. The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. -- `use_confidence_scores`: Sets the type of score that is returned with every predicted answer. +- `use_confidence_scores`: Determines the type of score that is used for ranking a predicted answer. `True` => a scaled confidence / relevance score between [0, 1]. This score can also be further calibrated on your dataset via self.eval() -(see https://haystack.deepset.ai/components/reader#confidence-scores) . +(see https://haystack.deepset.ai/components/reader#confidence-scores). `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit from the model for the predicted span. +Using confidence scores can change the ranking of no_answer compared to using the +unscaled raw scores. - `confidence_threshold`: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default. - `proxies`: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'} - `local_files_only`: Whether to force checking for local files only (and forbid downloads) @@ -427,7 +429,7 @@ Dict containing query and answers #### FARMReader.eval\_on\_file ```python -def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None) +def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False) ``` Performs evaluation on a SQuAD-formatted file. @@ -444,13 +446,14 @@ Returns a dict containing the following metrics: - `device`: The device on which the tensors should be processed. Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") or use the Reader's device by default. +- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores. 
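A minimal usage sketch of the extended `eval_on_file` signature documented above; the model name and the SQuAD-formatted data paths are placeholders, assuming a local dev set under `data/squad20`:

```python
from haystack.nodes import FARMReader

# Placeholder model; any extractive QA model from the Hugging Face hub works similarly.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_confidence_scores=True)

# calibrate_conf_scores=True additionally tunes the temperature used to scale
# the confidence scores on the given SQuAD-formatted file (placeholder paths).
metrics = reader.eval_on_file(
    data_dir="data/squad20",
    test_filename="dev-v2.0.json",
    calibrate_conf_scores=True,
)
print(metrics)
```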
#### FARMReader.eval ```python -def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False, use_no_answer_legacy_confidence=False) +def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False) ``` Performs evaluation on evaluation documents in the DocumentStore. @@ -469,9 +472,7 @@ or use the Reader's device by default. - `label_index`: Index/Table name where labeled questions are stored - `doc_index`: Index/Table name where documents that are used for evaluation are stored - `label_origin`: Field name where the gold labels are stored -- `calibrate_conf_scores`: Whether to calibrate the temperature for temperature scaling of the confidence scores -- `use_no_answer_legacy_confidence`: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence. -Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default). +- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores. diff --git a/haystack/modeling/evaluation/eval.py b/haystack/modeling/evaluation/eval.py index 028227462..7ff9ee966 100644 --- a/haystack/modeling/evaluation/eval.py +++ b/haystack/modeling/evaluation/eval.py @@ -38,18 +38,20 @@ class Evaluator: model: AdaptiveModel, return_preds_and_labels: bool = False, calibrate_conf_scores: bool = False, - use_confidence_scores_for_ranking=True, - use_no_answer_legacy_confidence=False, + use_confidence_scores_for_ranking: bool = True, + use_no_answer_legacy_confidence: bool = False, ) -> List[Dict]: """ Performs evaluation on a given model. :param model: The model on which to perform evaluation :param return_preds_and_labels: Whether to add preds and labels in the returned dicts of the - :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores + :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores. :param use_confidence_scores_for_ranking: Whether to sort answers by confidence score (normalized between 0 and 1)(default) or by standard score (unbounded). - :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence. - Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default). + :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference + between the best overall answer confidence and the no_answer gap confidence. + Otherwise, we use the no_answer score normalized to a range of [0,1] by + an expit function (default). :return: all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics and reports generated during evaluation. 
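As a purely illustrative aside on the default no_answer behaviour described in the docstring above: an expit (logistic sigmoid) maps an unbounded raw score into [0, 1] while preserving ranking. The snippet only demonstrates that mapping; the sample values and any temperature or offset applied internally are assumptions.

```python
from scipy.special import expit  # logistic sigmoid: expit(x) = 1 / (1 + exp(-x))

# Made-up unbounded raw scores (sums of start/end logits) ...
raw_scores = [-6.0, -1.0, 0.0, 2.5, 7.0]

# ... squashed into [0, 1]; the mapping is monotonic, so the relative ranking is preserved.
confidences = [expit(s) for s in raw_scores]
print([round(c, 3) for c in confidences])  # ~[0.002, 0.269, 0.5, 0.924, 0.999]
```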
""" diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index f3bb3f179..06c1a49de 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -108,12 +108,14 @@ class FARMReader(BaseReader): Can be helpful to disable in production deployments to keep the logs clean. :param duplicate_filtering: Answers are filtered based on their position. Both start and end position of the answers are considered. The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. - :param use_confidence_scores: Sets the type of score that is returned with every predicted answer. + :param use_confidence_scores: Determines the type of score that is used for ranking a predicted answer. `True` => a scaled confidence / relevance score between [0, 1]. This score can also be further calibrated on your dataset via self.eval() - (see https://haystack.deepset.ai/components/reader#confidence-scores) . + (see https://haystack.deepset.ai/components/reader#confidence-scores). `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit from the model for the predicted span. + Using confidence scores can change the ranking of no_answer compared to using the + unscaled raw scores. :param confidence_threshold: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default. :param proxies: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'} :param local_files_only: Whether to force checking for local files only (and forbid downloads) @@ -873,7 +875,11 @@ class FARMReader(BaseReader): return result def eval_on_file( - self, data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None + self, + data_dir: Union[Path, str], + test_filename: str, + device: Optional[Union[str, torch.device]] = None, + calibrate_conf_scores: bool = False, ): """ Performs evaluation on a SQuAD-formatted file. @@ -887,6 +893,7 @@ class FARMReader(BaseReader): :param device: The device on which the tensors should be processed. Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") or use the Reader's device by default. + :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores. """ logger.warning( "FARMReader.eval_on_file() uses a slightly different evaluation approach than `Pipeline.eval()`:\n" @@ -918,7 +925,11 @@ class FARMReader(BaseReader): evaluator = Evaluator(data_loader=data_loader, tasks=eval_processor.tasks, device=device) - eval_results = evaluator.eval(self.inferencer.model) + eval_results = evaluator.eval( + self.inferencer.model, + calibrate_conf_scores=calibrate_conf_scores, + use_confidence_scores_for_ranking=self.use_confidence_scores, + ) results = { "EM": eval_results[0]["EM"] * 100, "f1": eval_results[0]["f1"] * 100, @@ -945,7 +956,6 @@ class FARMReader(BaseReader): doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False, - use_no_answer_legacy_confidence=False, ): """ Performs evaluation on evaluation documents in the DocumentStore. 
@@ -961,9 +971,7 @@ class FARMReader(BaseReader): :param label_index: Index/Table name where labeled questions are stored :param doc_index: Index/Table name where documents that are used for evaluation are stored :param label_origin: Field name where the gold labels are stored - :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores - :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence. - Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default). + :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores. """ logger.warning( "FARMReader.eval() uses a slightly different evaluation approach than `Pipeline.eval()`:\n" @@ -1082,7 +1090,6 @@ class FARMReader(BaseReader): self.inferencer.model, calibrate_conf_scores=calibrate_conf_scores, use_confidence_scores_for_ranking=self.use_confidence_scores, - use_no_answer_legacy_confidence=use_no_answer_legacy_confidence, ) toc = perf_counter() reader_time = toc - tic
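Finally, a minimal sketch of the document-store-based `FARMReader.eval` with the simplified signature; the in-memory store, index names, and the SQuAD file used to populate the evaluation data are assumptions for illustration.

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader

document_store = InMemoryDocumentStore()
# Placeholder SQuAD-formatted file; writes gold labels and documents into the indices used below.
document_store.add_eval_data(
    filename="data/squad20/dev-v2.0.json",
    doc_index="eval_document",
    label_index="label",
)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
metrics = reader.eval(
    document_store=document_store,
    label_index="label",
    doc_index="eval_document",
    calibrate_conf_scores=True,  # also calibrate the temperature for the confidence scores
)
print(metrics)
```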