mirror of https://github.com/deepset-ai/haystack.git
synced 2025-08-23 07:58:36 +00:00

Resolving issue 2853: no answer logic in FARMReader (#2856)

* Update FARMReader.eval_on_file to be consistent with FARMReader.eval
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

parent fc8ecbf20c
commit 44e2b1beed
@@ -88,12 +88,14 @@ want to debug the Language Model, you might need to disable multiprocessing!
 Can be helpful to disable in production deployments to keep the logs clean.
 - `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered.
 The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
-- `use_confidence_scores`: Sets the type of score that is returned with every predicted answer.
+- `use_confidence_scores`: Determines the type of score that is used for ranking a predicted answer.
 `True` => a scaled confidence / relevance score between [0, 1].
 This score can also be further calibrated on your dataset via self.eval()
-(see https://haystack.deepset.ai/components/reader#confidence-scores) .
+(see https://haystack.deepset.ai/components/reader#confidence-scores).
 `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit
 from the model for the predicted span.
+Using confidence scores can change the ranking of no_answer compared to using the
+unscaled raw scores.
 - `confidence_threshold`: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default.
 - `proxies`: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'}
 - `local_files_only`: Whether to force checking for local files only (and forbid downloads)
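For reference, a minimal sketch of how the parameters documented above are passed to the reader; the values and the model checkpoint are purely illustrative:

```python
from haystack.nodes import FARMReader

# Illustrative values only; "deepset/roberta-base-squad2" is the usual example checkpoint.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_confidence_scores=True,   # rank answers (incl. no_answer) by scaled [0, 1] confidence
    confidence_threshold=0.3,     # drop predictions below this confidence; disabled by default
    duplicate_filtering=0,        # remove exact positional duplicates only; -1 turns filtering off
)
```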
@@ -427,7 +429,7 @@ Dict containing query and answers
 #### FARMReader.eval\_on\_file
 
 ```python
-def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None)
+def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False)
 ```
 
 Performs evaluation on a SQuAD-formatted file.
@@ -444,13 +446,14 @@ Returns a dict containing the following metrics:
 - `device`: The device on which the tensors should be processed.
 Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
 or use the Reader's device by default.
+- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores.
 
 <a id="farm.FARMReader.eval"></a>
 
 #### FARMReader.eval
 
 ```python
-def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False, use_no_answer_legacy_confidence=False)
+def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False)
 ```
 
 Performs evaluation on evaluation documents in the DocumentStore.
@@ -469,9 +472,7 @@ or use the Reader's device by default.
 - `label_index`: Index/Table name where labeled questions are stored
 - `doc_index`: Index/Table name where documents that are used for evaluation are stored
 - `label_origin`: Field name where the gold labels are stored
-- `calibrate_conf_scores`: Whether to calibrate the temperature for temperature scaling of the confidence scores
-- `use_no_answer_legacy_confidence`: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
-Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
+- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores.
 
 <a id="farm.FARMReader.calibrate_confidence_scores"></a>
 
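With this change, `eval_on_file` exposes the same `calibrate_conf_scores` flag as `eval`. A hedged usage sketch; paths, index names, and the in-memory store are placeholders, and labels still have to be written to the store before calling `eval`:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_confidence_scores=True)

# Evaluate on a SQuAD-formatted file and calibrate the confidence-scaling temperature on it.
file_metrics = reader.eval_on_file(
    data_dir="data/squad20",        # placeholder directory
    test_filename="dev-v2.0.json",  # placeholder SQuAD-format file
    calibrate_conf_scores=True,
)

# Evaluate against labels in a DocumentStore; the signature now mirrors eval_on_file
# (no separate use_no_answer_legacy_confidence argument anymore).
document_store = InMemoryDocumentStore()
# ... write evaluation documents and labels into doc_index / label_index first ...
store_metrics = reader.eval(
    document_store=document_store,
    label_index="label",
    doc_index="eval_document",
    calibrate_conf_scores=True,
)
print(file_metrics, store_metrics)
```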
@@ -38,18 +38,20 @@ class Evaluator:
         model: AdaptiveModel,
         return_preds_and_labels: bool = False,
         calibrate_conf_scores: bool = False,
-        use_confidence_scores_for_ranking=True,
-        use_no_answer_legacy_confidence=False,
+        use_confidence_scores_for_ranking: bool = True,
+        use_no_answer_legacy_confidence: bool = False,
     ) -> List[Dict]:
         """
         Performs evaluation on a given model.
 
         :param model: The model on which to perform evaluation
         :param return_preds_and_labels: Whether to add preds and labels in the returned dicts of the
-        :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
+        :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores.
         :param use_confidence_scores_for_ranking: Whether to sort answers by confidence score (normalized between 0 and 1)(default) or by standard score (unbounded).
-        :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
-        Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
+        :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference
+                                                between the best overall answer confidence and the no_answer gap confidence.
+                                                Otherwise, we use the no_answer score normalized to a range of [0,1] by
+                                                an expit function (default).
         :return: all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                  and reports generated during evaluation.
         """
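The docstring above describes the default (non-legacy) no_answer confidence as the raw no_answer score squashed into [0, 1] by an expit function. A minimal illustration of that mapping, assuming a hypothetical scaling divisor (Haystack applies its own, possibly calibrated, temperature):

```python
from scipy.special import expit  # expit(x) = 1 / (1 + exp(-x))

def no_answer_confidence(no_answer_score: float, scale: float = 8.0) -> float:
    """Map an unbounded no_answer score to [0, 1] via logistic squashing.

    `scale` is an illustrative divisor, not the value used in the Haystack source.
    """
    return float(expit(no_answer_score / scale))

print(no_answer_confidence(-4.2))  # low confidence that the question is unanswerable
print(no_answer_confidence(6.0))   # high confidence in no_answer
```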
@@ -108,12 +108,14 @@ class FARMReader(BaseReader):
         Can be helpful to disable in production deployments to keep the logs clean.
         :param duplicate_filtering: Answers are filtered based on their position. Both start and end position of the answers are considered.
                                     The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
-        :param use_confidence_scores: Sets the type of score that is returned with every predicted answer.
+        :param use_confidence_scores: Determines the type of score that is used for ranking a predicted answer.
                                       `True` => a scaled confidence / relevance score between [0, 1].
                                       This score can also be further calibrated on your dataset via self.eval()
-                                      (see https://haystack.deepset.ai/components/reader#confidence-scores) .
+                                      (see https://haystack.deepset.ai/components/reader#confidence-scores).
                                       `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit
                                       from the model for the predicted span.
+                                      Using confidence scores can change the ranking of no_answer compared to using the
+                                      unscaled raw scores.
         :param confidence_threshold: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default.
         :param proxies: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'}
         :param local_files_only: Whether to force checking for local files only (and forbid downloads)
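The `duplicate_filtering` behaviour documented above (drop answers whose start and end positions both lie within the given distance of a better answer) can be sketched roughly as below; this is an illustration of the idea, not the actual Haystack implementation:

```python
from typing import List, Tuple

def filter_positional_duplicates(
    spans: List[Tuple[int, int]], duplicate_filtering: int
) -> List[Tuple[int, int]]:
    """Keep a span only if no already-kept span has both start and end within
    `duplicate_filtering` positions; -1 disables filtering, 0 removes exact duplicates.
    Spans are assumed to be ordered best-first."""
    if duplicate_filtering == -1:
        return spans
    kept: List[Tuple[int, int]] = []
    for start, end in spans:
        is_duplicate = any(
            abs(start - s) <= duplicate_filtering and abs(end - e) <= duplicate_filtering
            for s, e in kept
        )
        if not is_duplicate:
            kept.append((start, end))
    return kept

# Exact duplicates and near-duplicates within 2 positions collapse to one span:
print(filter_positional_duplicates([(10, 20), (10, 20), (11, 21), (40, 55)], duplicate_filtering=2))
# -> [(10, 20), (40, 55)]
```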
@@ -873,7 +875,11 @@ class FARMReader(BaseReader):
         return result
 
     def eval_on_file(
-        self, data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None
+        self,
+        data_dir: Union[Path, str],
+        test_filename: str,
+        device: Optional[Union[str, torch.device]] = None,
+        calibrate_conf_scores: bool = False,
     ):
         """
         Performs evaluation on a SQuAD-formatted file.
@@ -887,6 +893,7 @@ class FARMReader(BaseReader):
         :param device: The device on which the tensors should be processed.
                        Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
                        or use the Reader's device by default.
+        :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores.
         """
         logger.warning(
             "FARMReader.eval_on_file() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
@@ -918,7 +925,11 @@ class FARMReader(BaseReader):
 
         evaluator = Evaluator(data_loader=data_loader, tasks=eval_processor.tasks, device=device)
 
-        eval_results = evaluator.eval(self.inferencer.model)
+        eval_results = evaluator.eval(
+            self.inferencer.model,
+            calibrate_conf_scores=calibrate_conf_scores,
+            use_confidence_scores_for_ranking=self.use_confidence_scores,
+        )
         results = {
             "EM": eval_results[0]["EM"] * 100,
             "f1": eval_results[0]["f1"] * 100,
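`eval_on_file` now forwards `calibrate_conf_scores` to `Evaluator.eval`, so an evaluation run can also fit the temperature used to scale confidences. For background, a generic temperature-scaling sketch using a crude grid search; this stands in for the idea only and is not Haystack's internal calibration code:

```python
import numpy as np
from scipy.special import expit

def calibrated_confidence(logit: float, temperature: float) -> float:
    """Confidence after temperature scaling: squash (logit / T) into [0, 1]."""
    return float(expit(logit / temperature))

def fit_temperature(logits: np.ndarray, correct: np.ndarray,
                    candidates=np.linspace(0.5, 10, 96)) -> float:
    """Pick the temperature that minimizes negative log-likelihood on held-out
    (logit, is_correct) pairs - a grid search standing in for the gradient-based
    fit a real implementation would use."""
    def nll(t: float) -> float:
        p = np.clip(expit(logits / t), 1e-9, 1 - 1e-9)
        return float(-(correct * np.log(p) + (1 - correct) * np.log(1 - p)).mean())
    return float(min(candidates, key=nll))

# Toy data: over-confident raw logits paired with whether the prediction was right.
logits = np.array([9.0, 7.5, 6.0, -2.0, 8.0, -5.0])
correct = np.array([1, 0, 1, 0, 1, 0])
T = fit_temperature(logits, correct)
print(T, calibrated_confidence(7.5, T))
```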
@@ -945,7 +956,6 @@ class FARMReader(BaseReader):
         doc_index: str = "eval_document",
         label_origin: str = "gold-label",
         calibrate_conf_scores: bool = False,
-        use_no_answer_legacy_confidence=False,
     ):
         """
         Performs evaluation on evaluation documents in the DocumentStore.
@@ -961,9 +971,7 @@ class FARMReader(BaseReader):
         :param label_index: Index/Table name where labeled questions are stored
         :param doc_index: Index/Table name where documents that are used for evaluation are stored
         :param label_origin: Field name where the gold labels are stored
-        :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
-        :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
-        Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
+        :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores.
         """
         logger.warning(
             "FARMReader.eval() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
@@ -1082,7 +1090,6 @@ class FARMReader(BaseReader):
             self.inferencer.model,
             calibrate_conf_scores=calibrate_conf_scores,
             use_confidence_scores_for_ranking=self.use_confidence_scores,
-            use_no_answer_legacy_confidence=use_no_answer_legacy_confidence,
         )
         toc = perf_counter()
         reader_time = toc - tic