Polish Evaluation Tutorial (#2212)
* Polish evaluation tutorial
* Clear notebook output
* Cleanup tutorials
* Fix discrepancy in isolated retriever eval results
* Incorporate reviewer feedback
* Clean notebook output
This commit is contained in:
parent 2c423ba063
commit bb107e5027
@@ -1,7 +1,6 @@
 from haystack.document_stores import ElasticsearchDocumentStore
 from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, EmbeddingRetriever, FARMReader, PreProcessor
 from haystack.utils import fetch_archive_from_http, launch_es
-from haystack.modeling.utils import initialize_device_settings
 from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline
 from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span
 
@@ -20,7 +19,6 @@ def tutorial5_evaluation():
     # Code
     ##############################################
     launch_es()
-    devices, n_gpu = initialize_device_settings(use_cuda=True)
 
     # Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers
     doc_dir = "../data/nq"
@@ -88,7 +86,8 @@ def tutorial5_evaluation():
     reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", top_k=4, return_no_answer=True)
 
     # Define a pipeline consisting of the initialized retriever and reader
-    # Here we evaluate retriever and reader in open domain fashion on the full corpus of documents i.e. a document is considered
+    # Here we evaluate retriever and reader in an integrated (a.k.a. open domain) fashion on the full corpus of documents
+    # i.e. a document is considered
     # correctly retrieved if it contains the gold answer string within it. The reader is evaluated based purely on the
     # predicted answer string, regardless of which document this came from and the position of the extracted span.
     # The generation of predictions is separated from the calculation of metrics.
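For context, the pipeline these comments describe is assembled from the components imported at the top of the file. A minimal sketch, assuming an Elasticsearch instance is already running; the host and index name below are placeholders, not part of this diff:

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import ElasticsearchRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline

# Placeholder connection settings; the tutorial defines its own doc_index/label_index elsewhere.
document_store = ElasticsearchDocumentStore(host="localhost", index="tutorial5_docs")

# Sparse retriever over the indexed evaluation documents.
retriever = ElasticsearchRetriever(document_store=document_store)

# Reader configured as in the hunk above.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", top_k=4, return_no_answer=True)

# Integrated (open domain) evaluation runs this full pipeline: the retriever fetches
# candidate documents and the reader extracts answers from whatever was retrieved.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)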
@@ -101,23 +100,39 @@ def tutorial5_evaluation():
     # pipeline = DocumentSearchPipeline(retriever=retriever)
 
     # We can load evaluation labels from the document store
+    # We are also opting to filter out no_answer samples
     eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=False)
+    eval_labels = [label for label in eval_labels if not label.no_answer]
 
-    # Alternative: Define queries and labels directly
+    ## Alternative: Define queries and labels directly
     # eval_labels = [
-    #     MultiLabel(labels=[Label(query="who is written in the book of life",
-    #                              answer=Answer(answer="every person who is destined for Heaven or the World to Come",
-    #                                            offsets_in_context=[Span(374, 434)]),
-    #                              document=Document(id='1b090aec7dbd1af6739c4c80f8995877-0',
-    #                                                content_type="text",
-    #                                                content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is about the book mentioned in Christian and Jewish religious teachings. For other uses, see The Book of Life. In Christianity and Judaism, the Book of Life (Hebrew: ספר החיים, transliterated Sefer HaChaim; Greek: βιβλίον τῆς ζωῆς Biblíon tēs Zōēs) is the book in which God records the names of every person who is destined for Heaven or the World to Come. According to the Talmud it is open on Rosh Hashanah, as is its analog for the wicked, the Book of the Dead. For this reason extra mention is made for the Book of Life during Amidah recitations during the Days of Awe, the ten days between Rosh Hashanah, the Jewish new year, and Yom Kippur, the day of atonement (the two High Holidays, particularly in the prayer Unetaneh Tokef). Contents (hide) 1 In the Hebrew Bible 2 Book of Jubilees 3 References in the New Testament 4 The eschatological or annual roll-call 5 Fundraising 6 See also 7 Notes 8 References In the Hebrew Bible(edit) In the Hebrew Bible the Book of Life - the book or muster-roll of God - records forever all people considered righteous before God'),
-    #                              is_correct_answer=True,
-    #                              is_correct_document=True,
-    #                              origin="gold-label")])
-    # ]
+    #     MultiLabel(
+    #         labels=[
+    #             Label(
+    #                 query="who is written in the book of life",
+    #                 answer=Answer(
+    #                     answer="every person who is destined for Heaven or the World to Come",
+    #                     offsets_in_context=[Span(374, 434)]
+    #                 ),
+    #                 document=Document(
+    #                     id='1b090aec7dbd1af6739c4c80f8995877-0',
+    #                     content_type="text",
+    #                     content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is
+    #                              about the book mentioned in Christian and Jewish religious teachings...'
+    #                 ),
+    #                 is_correct_answer=True,
+    #                 is_correct_document=True,
+    #                 origin="gold-label"
+    #             )
+    #         ]
+    #     )
+    # ]
 
     # Similar to pipeline.run() we can execute pipeline.eval()
-    eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
+    eval_result = pipeline.eval(
+        labels=eval_labels,
+        params={"Retriever": {"top_k": 5}}
+    )
 
     # The EvaluationResult contains a pandas dataframe for each pipeline node.
     # That's why there are two dataframes in the EvaluationResult of an ExtractiveQAPipeline.
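The retriever_result and reader_result variables used in the next hunk are defined in an unchanged part of the file. A sketch of how they are typically obtained from the EvaluationResult; the dict-style access is an assumption here, since it is not shown in this diff:

# eval_result comes from the pipeline.eval() call shown above.
retriever_result = eval_result["Retriever"]  # DataFrame: one row per retrieved document per query
reader_result = eval_result["Reader"]        # DataFrame: one row per predicted answer per query

# Inspect the first few rows of each node's predictions.
print(retriever_result.head())
print(reader_result.head())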
@@ -129,16 +144,20 @@ def tutorial5_evaluation():
     reader_result.head()
 
     # We can filter for all documents retrieved for a given query
-    retriever_book_of_life = retriever_result[retriever_result["query"] == "who is written in the book of life"]
+    query = "who is written in the book of life"
+    retriever_book_of_life = retriever_result[retriever_result["query"] == query]
 
     # We can also filter for all answers predicted for a given query
     reader_book_of_life = reader_result[reader_result["query"] == "who is written in the book of life"]
 
-    # Save the evaluation result so that we can reload it later and calculate evaluation metrics without running the pipeline again.
+    # Save the evaluation result so that we can reload it later
+    # and calculate evaluation metrics without running the pipeline again.
     eval_result.save("../")
 
     ## Calculating Evaluation Metrics
-    # Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions, such as F1-score of each individual prediction of the Reader node or recall of the retriever.
+    # Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions,
+    # such as F1-score of each individual prediction of the Reader node or recall of the retriever.
+    # To learn more about the metrics, see [Evaluation Metrics](https://haystack.deepset.ai/guides/evaluation#metrics-retrieval)
 
     saved_eval_result = EvaluationResult.load("../")
    metrics = saved_eval_result.calculate_metrics()
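calculate_metrics() returns a nested dict keyed by node name. A sketch of reading a few common entries; apart from exact_match and recall_multi_hit, which appear in this diff, the metric keys below are assumptions based on the Haystack v1 docs:

metrics = saved_eval_result.calculate_metrics()

# Retriever metrics (document level)
print("Retriever - Recall (multi hit):", metrics["Retriever"]["recall_multi_hit"])
print("Retriever - Mean Average Precision:", metrics["Retriever"]["map"])

# Reader metrics (answer level)
print("Reader - F1:", metrics["Reader"]["f1"])
print("Reader - Exact Match:", metrics["Reader"]["exact_match"])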
@@ -152,14 +171,19 @@ def tutorial5_evaluation():
     print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}')
 
     ## Generating an Evaluation Report
-    # A summary of the evaluation results can be printed to get a quick overview. It includes some aggregated metrics and also shows a few wrongly predicted examples.
+    # A summary of the evaluation results can be printed to get a quick overview.
+    # It includes some aggregated metrics and also shows a few wrongly predicted examples.
 
     pipeline.print_eval_report(saved_eval_result)
 
     ## Advanced Evaluation Metrics
-    # As an advanced evaluation metric, semantic answer similarity (SAS) can be calculated. This metric takes into account whether the meaning of a predicted answer is similar to the annotated gold answer rather than just doing string comparison.
-    # To this end SAS relies on pre-trained models. For English, we recommend "cross-encoder/stsb-roberta-large", whereas for German we recommend "deepset/gbert-large-sts". A good multilingual model is "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".
-    # More info on this metric can be found in our [paper](https://arxiv.org/abs/2108.06130) or in our [blog post](https://www.deepset.ai/blog/semantic-answer-similarity-to-evaluate-qa).
+    # Semantic Answer Similarity (SAS) is an advanced evaluation metric that can be calculated in Haystack.
+    # This metric takes into account whether the meaning of a predicted answer is similar to the annotated gold answer
+    # rather than just doing string comparison. To this end SAS relies on pre-trained models.
+    # For English, we recommend "cross-encoder/stsb-roberta-large", whereas for German we recommend "deepset/gbert-large-sts".
+    # A good multilingual model is "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".
+    # More info on this metric can be found in our [paper](https://arxiv.org/abs/2108.06130)
+    # or in our [blog post](https://www.deepset.ai/blog/semantic-answer-similarity-to-evaluate-qa).
 
     advanced_eval_result = pipeline.eval(
         labels=eval_labels,
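The pipeline.eval() call above is cut off at the hunk boundary before its remaining arguments. A sketch of the SAS-enabled call it leads into; the sas_model_name_or_path parameter name is taken from the Haystack v1 API and is an assumption here, since it is not shown in this diff:

advanced_eval_result = pipeline.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 5}},
    # SAS needs a pre-trained similarity model; this is the English model recommended above.
    sas_model_name_or_path="cross-encoder/stsb-roberta-large",
)

metrics = advanced_eval_result.calculate_metrics()
print(metrics["Reader"]["sas"])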
@@ -170,34 +194,53 @@ def tutorial5_evaluation():
     metrics = advanced_eval_result.calculate_metrics()
     print(metrics["Reader"]["sas"])
 
-    ## Isolated Evaluation Mode to Understand Upper Bounds of the Reader's Performance
-    # The isolated node evaluation uses labels as input to the reader node instead of the output of the preceding retriever node.
-    # Thereby, we can additionally calculate the upper bounds of the evaluation metrics of the reader.
+    ## Isolated Evaluation Mode
+    # The isolated node evaluation uses labels as input to the Reader node instead of the output of the preceding retriever node.
+    # Thereby, we can additionally calculate the upper bounds of the evaluation metrics of the Reader.
+    # Note that even with isolated evaluation enabled, integrated evaluation will still be running.
     eval_result_with_upper_bounds = pipeline.eval(
-        labels=eval_labels, params={"Retriever": {"top_k": 1}}, add_isolated_node_eval=True
+        labels=eval_labels,
+        params={"Retriever": {"top_k": 5}},
+        add_isolated_node_eval=True
     )
     pipeline.print_eval_report(eval_result_with_upper_bounds)
 
     ## Evaluation of Individual Components
-    # Sometimes you might want to evaluate individual components, for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself.
+    # Sometimes you might want to evaluate individual components,
+    # for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself.
+
     # Evaluate Retriever on its own
     # Here we evaluate only the retriever, based on whether the gold_label document is retrieved.
-    retriever_eval_results = retriever.eval(top_k=10, label_index=label_index, doc_index=doc_index)
+    # Note that no_answer samples are omitted when evaluation is performed with this method
+    retriever_eval_results = retriever.eval(
+        top_k=5,
+        label_index=label_index,
+        doc_index=doc_index
+    )
+
     ## Retriever Recall is the proportion of questions for which the correct document containing the answer is
     ## among the correct documents
     print("Retriever Recall:", retriever_eval_results["recall"])
     ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
     print("Retriever Mean Avg Precision:", retriever_eval_results["map"])
 
+    # Just as a sanity check, we can compare the recall from `retriever.eval()`
+    # with the multi hit recall from `pipeline.eval(add_isolated_node_eval=True)`.
+    # These two recall metrics are only comparable since we chose to filter out no_answer samples when generating eval_labels.
+    metrics = eval_result_with_upper_bounds.calculate_metrics()
+    print(metrics["Retriever"]["recall_multi_hit"])
+
     # Evaluate Reader on its own
     # Here we evaluate only the reader in a closed domain fashion i.e. the reader is given one query
     # and its corresponding relevant document and metrics are calculated on whether the right position in this text is selected by
     # the model as the answer span (i.e. SQuAD style)
     reader_eval_results = reader.eval(
-        document_store=document_store, device=devices[0], label_index=label_index, doc_index=doc_index
+        document_store=document_store,
+        label_index=label_index,
+        doc_index=doc_index
     )
     # Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
-    # reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)
+    # reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json")
 
     ## Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer
     print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
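To make the "upper bounds" idea concrete: with add_isolated_node_eval=True the result holds both integrated and isolated Reader predictions, which can be compared when computing metrics. A sketch; the eval_mode argument follows the Haystack v1 EvaluationResult API and is an assumption here, since it is not shown in this diff:

# Integrated: Reader answers are computed on whatever the Retriever actually returned.
metrics_integrated = eval_result_with_upper_bounds.calculate_metrics()

# Isolated: Reader answers are computed on the gold documents from the labels,
# which gives an upper bound on what the Reader could achieve with a perfect Retriever.
metrics_isolated = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated")

print("Reader F1 (integrated):", metrics_integrated["Reader"]["f1"])
print("Reader F1 (isolated upper bound):", metrics_isolated["Reader"]["f1"])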