Add no answer handling and sort no answer into positive predictions

timoeller 2020-02-21 18:27:53 +01:00
parent 840b368732
commit ef9b99c3cc
2 changed files with 31 additions and 31 deletions

View File

@@ -45,19 +45,18 @@ class FARMReader:
:param batch_size: Number of samples the model receives in one batch for inference
:param use_gpu: Whether to use GPU (if available)
:param n_candidates_per_passage: How many candidate answers are extracted per text sequence that the model can process at once (depends on `max_seq_len`).
Note: This is not the number of "final answers" you will receive
(see `top_k` in FARMReader.predict() or Finder.get_answers() for that)
# TODO adjust farm. n_cand = 2 returns no answer + highest positive answer
# should return no answer + 2 best positive answers
# drawback: answers from a single paragraph might be very similar in text and score
# we need to have more varied answers (by excluding overlapping answers?)
Note: - This is not the number of "final answers" you will receive
(see `top_k` in FARMReader.predict() or Finder.get_answers() for that)
- FARM includes no_answer in the sorted list of predictions
"""
self.inferencer = Inferencer.load(model_name_or_path, batch_size=batch_size, gpu=use_gpu, task_type="question_answering")
self.inferencer.model.prediction_heads[0].context_window_size = context_window_size
self.inferencer.model.prediction_heads[0].no_ans_threshold = no_ans_boost # TODO adjust naming and concept in FARM
self.no_ans_boost = no_ans_boost
self.inferencer.model.prediction_heads[0].no_ans_boost = no_ans_boost
# TODO adjust terminology: a haystack passage is a FARM document (which gets divided into FARM passages depending on document length and max_seq_len)
self.inferencer.model.prediction_heads[0].n_best = n_candidates_per_passage
def train(self, data_dir, train_filename, dev_filename=None, test_file_name=None,
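A minimal construction sketch to make these parameters concrete. The constructor signature is inferred from the parameters documented in the hunk above and is not verified against the full file; the model name is only an example (it is the SQuAD 2.0 model mentioned in the tutorial below):

# Sketch only: signature inferred from the docstring above.
from haystack.reader.farm import FARMReader

reader = FARMReader(
    model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2",
    batch_size=16,               # samples per inference batch
    use_gpu=False,               # set True if a GPU is available
    no_ans_boost=0,              # >0 favors "no answer", <0 suppresses it
    n_candidates_per_passage=2,  # candidates per model-sized chunk, incl. no_answer
)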
@@ -188,7 +187,7 @@ class FARMReader:
}
input_dicts.append(cur)
# get answers from QA model (Default: top 5 per input paragraph)
# get answers from QA model
predictions = self.inferencer.inference_from_dicts(
dicts=input_dicts, rest_api_schema=True, max_processes=max_processes
)
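The exact shape of the input dicts is not visible in this hunk (only the closing brace of `cur` appears above); the sketch below assumes FARM's rest-API schema of one text passage plus a list of questions per dict, reusing the reader from the sketch above:

# Assumed dict layout for rest_api_schema=True.
input_dicts = [
    {
        "text": "Arya Stark is a daughter of Eddard and Catelyn Stark.",
        "questions": ["Who is the father of Arya Stark?"],
    }
]
predictions = reader.inferencer.inference_from_dicts(
    dicts=input_dicts, rest_api_schema=True, max_processes=1
)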
@@ -200,9 +199,10 @@
no_ans_gaps = []
best_score_answer = 0
for pred in predictions:
positive_found = False
for a in pred["predictions"][0]["answers"]:
# skip "no answers" here
if a["answer"]:
if not positive_found and a["answer"]:
cur = {"answer": a["answer"],
"score": a["score"],
"probability": float(expit(np.asarray([a["score"]]) / 8)), #just a pseudo prob for now
@@ -214,30 +214,27 @@
no_ans_gaps.append(pred["predictions"][0]["no_ans_gap"])
if a["score"] > best_score_answer:
best_score_answer = a["score"]
positive_found = True
# adjust no_ans_gaps
# "no answer" scores and positive answers scores are difficult to compare, because
# + a positive answer score is related to one specific document
# - a "no answer" score is related to all input documents
# Thus we compute the "no answer" score relative to the best possible answer and adjust it by
# the most significant difference between scores.
# Most significant difference: a model switching from predicting an answer to "no answer" (or vice versa).
# No_ans_gaps coming from FARM mean how much no_ans_boost would need to change to switch predictions
no_ans_gaps = np.array(no_ans_gaps)
no_ans_gaps_adjusted = no_ans_gaps + self.no_ans_boost
max_no_ans_gap = np.max(no_ans_gaps)
if np.sum(no_ans_gaps < 0) == len(no_ans_gaps):  # all passages "no answer" as top score
no_ans_score = best_score_answer - max_no_ans_gap # max_no_ans_gap is negative, so it increases best pos score
else: # case: at least one passage predicts an answer (positive no_ans_gap)
no_ans_score = best_score_answer - max_no_ans_gap
# We want to heuristically rank how likely or unlikely the "no answer" option is.
# case: all documents return no answer, then all no_ans_gaps_adjusted are non-negative
if np.sum(no_ans_gaps_adjusted < 0) == 0:
# to rank we add the smallest no_ans_gap (a document where an answer would be nearly as likely as "no answer")
# to the highest answer score we found
no_ans_score = best_score_answer + min(no_ans_gaps_adjusted)
# case: documents where answers are preferred over no answer, the no_ans_gap is negative
else:
# the lowest (most negative) no_ans_gap is how large a positive no_ans_boost would need to be for the
# model to return "no answer" on all documents
# we subtract this value from the best answer score to rank our "no answer" option
# magically this is the same equation as used for the case above : )
no_ans_score = best_score_answer + min(no_ans_gaps_adjusted)
cur = {"answer": "[computer says no answer is likely]",
cur = {"answer": None,
"score": no_ans_score,
"probability": float(expit(np.asarray(no_ans_score) / 8)), # just a pseudo prob for now
"context": "",
"context": None,
"offset_start": 0,
"offset_end": 0,
"document_id": None}
@@ -249,7 +246,7 @@
)
answers = answers[:top_k]
result = {"question": question,
"adjust_no_ans_boost": -min(no_ans_gaps_adjusted),
"no_ans_gap": max_no_ans_gap,
"answers": answers}
return result

View File

@@ -51,7 +51,10 @@ finder = Finder(reader, retriever)
# The higher top_k_retriever, the better (but also the slower) your answers.
prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
#prediction = finder.get_answers(question="Who is the daughter of Arya Stark?", top_k_reader=5) # impossible question test
# to test impossible questions we need a large QA model, e.g. deepset/bert-large-uncased-whole-word-masking-squad2
#prediction = finder.get_answers(question="Who is the first daughter of Arya Stark?", top_k_retriever=10, top_k_reader=5)
#prediction = finder.get_answers(question="Who created the Dothraki vocabulary?", top_k_reader=5)
#prediction = finder.get_answers(question="Who is the sister of Sansa?", top_k_reader=5)