From 25a6eaae05918f43de0781cb34a45d5a78459261 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Tue, 12 Dec 2023 15:12:28 +0100
Subject: [PATCH] feat!: Rename ExtractiveReader's `confidence_threshold` to
 `score_threshold` (#6532)

* rename to score_threshold

* Update haystack/components/readers/extractive.py

Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>

---------

Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
---
 haystack/components/readers/extractive.py     | 41 ++++++++++---------
 ...ame-confidence-score-3ba604b8f9bd0454.yaml |  5 +++
 test/components/readers/test_extractive.py    |  4 +-
 3 files changed, 28 insertions(+), 22 deletions(-)
 create mode 100644 releasenotes/notes/rename-confidence-score-3ba604b8f9bd0454.yaml

diff --git a/haystack/components/readers/extractive.py b/haystack/components/readers/extractive.py
index deb400050..3f78cc573 100644
--- a/haystack/components/readers/extractive.py
+++ b/haystack/components/readers/extractive.py
@@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
 class ExtractiveReader:
     """
     A component that locates and extract answers to a given query from Documents. It's used for performing extractive
-    QA. The Reader assigns a probability score to every possible answer span independently of other answer spans.
+    QA. The Reader assigns a score to every possible answer span independently of other answer spans.
     This fixes a common issue of other implementations which make comparisons across documents harder by normalizing
     each document's answers independently.
 
@@ -42,7 +42,7 @@ class ExtractiveReader:
         device: Optional[str] = None,
         token: Union[bool, str, None] = None,
         top_k: int = 20,
-        confidence_threshold: Optional[float] = None,
+        score_threshold: Optional[float] = None,
         max_seq_length: int = 384,
         stride: int = 128,
         max_batch_size: Optional[int] = None,
@@ -61,9 +61,9 @@ class ExtractiveReader:
             If this parameter is set to `True`, then the token generated when running
             `transformers-cli login` (stored in ~/.huggingface) is used.
         :param top_k: Number of answers to return per query.
-            It is required even if confidence_threshold is set. Defaults to 20.
+            It is required even if score_threshold is set. Defaults to 20.
             An additional answer with no text is returned if no_answer is set to True (default).
-        :param confidence_threshold: Returns only answers with the probability score above this threshold.
+        :param score_threshold: Returns only answers with the probability score above this threshold.
         :param max_seq_length: Maximum number of tokens.
             If a sequence exceeds it, the sequence is split.
             Default: 384
@@ -72,8 +72,9 @@ class ExtractiveReader:
         :param max_batch_size: Maximum number of samples that are fed through the model at the same time.
         :param answers_per_seq: Number of answer candidates to consider per sequence.
             This is relevant when a Document was split into multiple sequences because of max_seq_length.
-        :param no_answer: Whether to return no answer scores.
-        :param calibration_factor: Factor used for calibrating probability scores.
+        :param no_answer: Whether to return an additional `no answer` with an empty text and a score representing the
+            probability that the other top_k answers are incorrect.
+        :param calibration_factor: Factor used for calibrating probabilities.
         :param model_kwargs: Additional keyword arguments passed to `AutoModelForQuestionAnswering.from_pretrained`
             when loading the model specified in `model_name_or_path`. For details on what kwargs you can pass,
             see the model's documentation.
@@ -85,7 +86,7 @@ class ExtractiveReader:
         self.token = token
         self.max_seq_length = max_seq_length
         self.top_k = top_k
-        self.confidence_threshold = confidence_threshold
+        self.score_threshold = score_threshold
         self.stride = stride
         self.max_batch_size = max_batch_size
         self.answers_per_seq = answers_per_seq
@@ -110,7 +111,7 @@ class ExtractiveReader:
             token=self.token if not isinstance(self.token, str) else None,
             max_seq_length=self.max_seq_length,
             top_k=self.top_k,
-            confidence_threshold=self.confidence_threshold,
+            score_threshold=self.score_threshold,
             stride=self.stride,
             max_batch_size=self.max_batch_size,
             answers_per_seq=self.answers_per_seq,
@@ -202,8 +203,8 @@ class ExtractiveReader:
         encodings: List["Encoding"],
     ) -> Tuple[List[List[int]], List[List[int]], "torch.Tensor"]:
         """
-        Turn start and end logits into probability scores for each answer span. Unlike most other
-        implementations, it doesn't normalize the scores to make them easier to compare across different
+        Turn start and end logits into probabilities for each answer span. Unlike most other
+        implementations, it doesn't normalize the scores in each split to make them easier to compare across different
         splits. Returns the top k answer spans.
         """
         mask = sequence_ids == 1  # Only keep tokens from the context (should ignore special tokens)
@@ -262,14 +263,14 @@ class ExtractiveReader:
         queries: List[str],
         answers_per_seq: int,
         top_k: Optional[int],
-        confidence_threshold: Optional[float],
+        score_threshold: Optional[float],
         query_ids: List[int],
         document_ids: List[int],
         no_answer: bool,
     ) -> List[List[ExtractedAnswer]]:
         """
-        Reconstructs the nested structure that existed before flattening. Also computes a no answer probability.
-        This probability is different from most other implementations because it does not consider the no answer
+        Reconstructs the nested structure that existed before flattening. Also computes a no answer score.
+        This score is different from most other implementations because it does not consider the no answer
         logit introduced with SQuAD 2. Instead, it just computes the probability that the answer does not exist
         in the top k or top p.
         """
@@ -307,8 +308,8 @@ class ExtractiveReader:
                 )
                 current_answers.append(answer_)
             current_answers = sorted(current_answers, key=lambda answer: answer.score, reverse=True)
-            if confidence_threshold is not None:
-                current_answers = [answer for answer in current_answers if answer.score >= confidence_threshold]
+            if score_threshold is not None:
+                current_answers = [answer for answer in current_answers if answer.score >= score_threshold]
             nested_answers.append(current_answers)
 
         return nested_answers
@@ -319,7 +320,7 @@ class ExtractiveReader:
         query: str,
         documents: List[Document],
         top_k: Optional[int] = None,
-        confidence_threshold: Optional[float] = None,
+        score_threshold: Optional[float] = None,
         max_seq_length: Optional[int] = None,
         stride: Optional[int] = None,
         max_batch_size: Optional[int] = None,
@@ -333,9 +334,9 @@ class ExtractiveReader:
         :param documents: List of Documents in which you want to search for an answer to the query.
         :param top_k: The maximum number of answers to return.
             An additional answer is returned if no_answer is set to True (default).
-        :param confidence_threshold:
+        :param score_threshold:
         :return: List of ExtractedAnswers sorted by (desc.) answer score.
-        :param confidence_threshold: Returns only answers with the probability score above this threshold.
+        :param score_threshold: Returns only answers with the score above this threshold.
         :param max_seq_length: Maximum number of tokens.
             If a sequence exceeds it, the sequence is split.
             Default: 384
@@ -352,7 +353,7 @@ class ExtractiveReader:
             raise ComponentError("The component was not warmed up. Run 'warm_up()' before calling 'run()'.")
 
         top_k = top_k or self.top_k
-        confidence_threshold = confidence_threshold or self.confidence_threshold
+        score_threshold = score_threshold or self.score_threshold
         max_seq_length = max_seq_length or self.max_seq_length
         stride = stride or self.stride
         max_batch_size = max_batch_size or self.max_batch_size
@@ -400,7 +401,7 @@ class ExtractiveReader:
             queries,
             answers_per_seq,
             top_k,
-            confidence_threshold,
+            score_threshold,
             query_ids,
             document_ids,
             no_answer,
diff --git a/releasenotes/notes/rename-confidence-score-3ba604b8f9bd0454.yaml b/releasenotes/notes/rename-confidence-score-3ba604b8f9bd0454.yaml
new file mode 100644
index 000000000..4b449fad3
--- /dev/null
+++ b/releasenotes/notes/rename-confidence-score-3ba604b8f9bd0454.yaml
@@ -0,0 +1,5 @@
+---
+enhancements:
+  - |
+    Renamed the confidence_threshold parameter of the ExtractiveReader to score_threshold as ExtractedAnswers have a score and this is what the threshold is for.
+    For consistency, the term confidence is not mentioned anymore in favor of score.
diff --git a/test/components/readers/test_extractive.py b/test/components/readers/test_extractive.py
index 2ced6678e..69b51f5f9 100644
--- a/test/components/readers/test_extractive.py
+++ b/test/components/readers/test_extractive.py
@@ -98,7 +98,7 @@ def test_to_dict():
             "device": None,
             "token": None,  # don't serialize valid tokens
             "top_k": 20,
-            "confidence_threshold": None,
+            "score_threshold": None,
             "max_seq_length": 384,
             "stride": 128,
             "max_batch_size": None,
@@ -121,7 +121,7 @@ def test_to_dict_empty_model_kwargs():
             "device": None,
             "token": None,  # don't serialize valid tokens
             "top_k": 20,
-            "confidence_threshold": None,
+            "score_threshold": None,
             "max_seq_length": 384,
             "stride": 128,
             "max_batch_size": None,