From 44e2b1beed5d83f95facddb09a4e8caa2bd349a6 Mon Sep 17 00:00:00 2001 From: Sebastian Date: Thu, 11 Aug 2022 16:45:03 +0200 Subject: [PATCH] Resolving issue 2853: no answer logic in FARMReader (#2856) * Update FARMReader.eval_on_file to be consistent with FARMReader.eval * Update Documentation & Code Style Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/reader.md | 15 ++++++++------- haystack/modeling/evaluation/eval.py | 12 +++++++----- haystack/nodes/reader/farm.py | 25 ++++++++++++++++--------- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index 334e26e30..7627affa3 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -88,12 +88,14 @@ want to debug the Language Model, you might need to disable multiprocessing! Can be helpful to disable in production deployments to keep the logs clean. - `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered. The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. -- `use_confidence_scores`: Sets the type of score that is returned with every predicted answer. +- `use_confidence_scores`: Determines the type of score that is used for ranking a predicted answer. `True` => a scaled confidence / relevance score between [0, 1]. This score can also be further calibrated on your dataset via self.eval() -(see https://haystack.deepset.ai/components/reader#confidence-scores) . +(see https://haystack.deepset.ai/components/reader#confidence-scores). `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit from the model for the predicted span. +Using confidence scores can change the ranking of no_answer compared to using the +unscaled raw scores. - `confidence_threshold`: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default. - `proxies`: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'} - `local_files_only`: Whether to force checking for local files only (and forbid downloads) @@ -427,7 +429,7 @@ Dict containing query and answers #### FARMReader.eval\_on\_file ```python -def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None) +def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False) ``` Performs evaluation on a SQuAD-formatted file. @@ -444,13 +446,14 @@ Returns a dict containing the following metrics: - `device`: The device on which the tensors should be processed. Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") or use the Reader's device by default. +- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores. 
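A minimal usage sketch of the extended `eval_on_file` signature documented above; the model name and the SQuAD-formatted data paths are placeholders, assuming a local dev set under `data/squad20`:

```python
from haystack.nodes import FARMReader

# Placeholder model; any extractive QA model from the Hugging Face hub works similarly.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_confidence_scores=True)

# calibrate_conf_scores=True additionally tunes the temperature used to scale
# the confidence scores on the given SQuAD-formatted file (placeholder paths).
metrics = reader.eval_on_file(
    data_dir="data/squad20",
    test_filename="dev-v2.0.json",
    calibrate_conf_scores=True,
)
print(metrics)
```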
#### FARMReader.eval ```python -def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False, use_no_answer_legacy_confidence=False) +def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False) ``` Performs evaluation on evaluation documents in the DocumentStore. @@ -469,9 +472,7 @@ or use the Reader's device by default. - `label_index`: Index/Table name where labeled questions are stored - `doc_index`: Index/Table name where documents that are used for evaluation are stored - `label_origin`: Field name where the gold labels are stored -- `calibrate_conf_scores`: Whether to calibrate the temperature for temperature scaling of the confidence scores -- `use_no_answer_legacy_confidence`: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence. -Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default). +- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores. diff --git a/haystack/modeling/evaluation/eval.py b/haystack/modeling/evaluation/eval.py index 028227462..7ff9ee966 100644 --- a/haystack/modeling/evaluation/eval.py +++ b/haystack/modeling/evaluation/eval.py @@ -38,18 +38,20 @@ class Evaluator: model: AdaptiveModel, return_preds_and_labels: bool = False, calibrate_conf_scores: bool = False, - use_confidence_scores_for_ranking=True, - use_no_answer_legacy_confidence=False, + use_confidence_scores_for_ranking: bool = True, + use_no_answer_legacy_confidence: bool = False, ) -> List[Dict]: """ Performs evaluation on a given model. :param model: The model on which to perform evaluation :param return_preds_and_labels: Whether to add preds and labels in the returned dicts of the - :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores + :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores. :param use_confidence_scores_for_ranking: Whether to sort answers by confidence score (normalized between 0 and 1)(default) or by standard score (unbounded). - :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence. - Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default). + :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference + between the best overall answer confidence and the no_answer gap confidence. + Otherwise, we use the no_answer score normalized to a range of [0,1] by + an expit function (default). :return: all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics and reports generated during evaluation. 
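As a purely illustrative aside on the default no_answer behaviour described in the docstring above: an expit (logistic sigmoid) maps an unbounded raw score into [0, 1] while preserving ranking. The snippet only demonstrates that mapping; the sample values and any temperature or offset applied internally are assumptions.

```python
from scipy.special import expit  # logistic sigmoid: expit(x) = 1 / (1 + exp(-x))

# Made-up unbounded raw scores (sums of start/end logits) ...
raw_scores = [-6.0, -1.0, 0.0, 2.5, 7.0]

# ... squashed into [0, 1]; the mapping is monotonic, so the relative ranking is preserved.
confidences = [expit(s) for s in raw_scores]
print([round(c, 3) for c in confidences])  # ~[0.002, 0.269, 0.5, 0.924, 0.999]
```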
""" diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index f3bb3f179..06c1a49de 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -108,12 +108,14 @@ class FARMReader(BaseReader): Can be helpful to disable in production deployments to keep the logs clean. :param duplicate_filtering: Answers are filtered based on their position. Both start and end position of the answers are considered. The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. - :param use_confidence_scores: Sets the type of score that is returned with every predicted answer. + :param use_confidence_scores: Determines the type of score that is used for ranking a predicted answer. `True` => a scaled confidence / relevance score between [0, 1]. This score can also be further calibrated on your dataset via self.eval() - (see https://haystack.deepset.ai/components/reader#confidence-scores) . + (see https://haystack.deepset.ai/components/reader#confidence-scores). `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit from the model for the predicted span. + Using confidence scores can change the ranking of no_answer compared to using the + unscaled raw scores. :param confidence_threshold: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default. :param proxies: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'} :param local_files_only: Whether to force checking for local files only (and forbid downloads) @@ -873,7 +875,11 @@ class FARMReader(BaseReader): return result def eval_on_file( - self, data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None + self, + data_dir: Union[Path, str], + test_filename: str, + device: Optional[Union[str, torch.device]] = None, + calibrate_conf_scores: bool = False, ): """ Performs evaluation on a SQuAD-formatted file. @@ -887,6 +893,7 @@ class FARMReader(BaseReader): :param device: The device on which the tensors should be processed. Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") or use the Reader's device by default. + :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores. """ logger.warning( "FARMReader.eval_on_file() uses a slightly different evaluation approach than `Pipeline.eval()`:\n" @@ -918,7 +925,11 @@ class FARMReader(BaseReader): evaluator = Evaluator(data_loader=data_loader, tasks=eval_processor.tasks, device=device) - eval_results = evaluator.eval(self.inferencer.model) + eval_results = evaluator.eval( + self.inferencer.model, + calibrate_conf_scores=calibrate_conf_scores, + use_confidence_scores_for_ranking=self.use_confidence_scores, + ) results = { "EM": eval_results[0]["EM"] * 100, "f1": eval_results[0]["f1"] * 100, @@ -945,7 +956,6 @@ class FARMReader(BaseReader): doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False, - use_no_answer_legacy_confidence=False, ): """ Performs evaluation on evaluation documents in the DocumentStore. 
@@ -961,9 +971,7 @@ class FARMReader(BaseReader): :param label_index: Index/Table name where labeled questions are stored :param doc_index: Index/Table name where documents that are used for evaluation are stored :param label_origin: Field name where the gold labels are stored - :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores - :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence. - Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default). + :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores. """ logger.warning( "FARMReader.eval() uses a slightly different evaluation approach than `Pipeline.eval()`:\n" @@ -1082,7 +1090,6 @@ class FARMReader(BaseReader): self.inferencer.model, calibrate_conf_scores=calibrate_conf_scores, use_confidence_scores_for_ranking=self.use_confidence_scores, - use_no_answer_legacy_confidence=use_no_answer_legacy_confidence, ) toc = perf_counter() reader_time = toc - tic
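Finally, a minimal sketch of the document-store-based `FARMReader.eval` with the simplified signature; the in-memory store, index names, and the SQuAD file used to populate the evaluation data are assumptions for illustration.

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader

document_store = InMemoryDocumentStore()
# Placeholder SQuAD-formatted file; writes gold labels and documents into the indices used below.
document_store.add_eval_data(
    filename="data/squad20/dev-v2.0.json",
    doc_index="eval_document",
    label_index="label",
)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
metrics = reader.eval(
    document_store=document_store,
    label_index="label",
    doc_index="eval_document",
    calibrate_conf_scores=True,  # also calibrate the temperature for the confidence scores
)
print(metrics)
```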