# haystack/e2e/pipelines/test_evaluation_pipeline.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
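# End-to-end test: index a set of sample documents, run two RAG pipelines (top_k=2 and
# top_k=4) over the same questions, score both with retrieval- and LLM-based evaluators,
# and compare the two runs via EvaluationRunResult.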
import os
from typing import List

import pytest

from haystack import Document, Pipeline
from haystack.components.builders import AnswerBuilder, ChatPromptBuilder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.evaluators import (
    ContextRelevanceEvaluator,
    DocumentMAPEvaluator,
    DocumentMRREvaluator,
    DocumentRecallEvaluator,
    FaithfulnessEvaluator,
    SASEvaluator,
)
from haystack.components.evaluators.document_recall import RecallMode
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.dataclasses import ChatMessage
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack.evaluation import EvaluationRunResult

EMBEDDINGS_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
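# The same sentence-transformers model is reused for document embedding (indexing), query
# embedding (retrieval), and semantic answer similarity scoring (SASEvaluator).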


def indexing_pipeline(documents: List[Document]):
    """Embed the documents and write them to a fresh InMemoryDocumentStore."""
    document_store = InMemoryDocumentStore()
    # SKIP means a document with an already-existing ID is silently skipped instead of raising
    doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
    doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)

    ingestion_pipe = Pipeline()
    ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
    ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
    ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
    ingestion_pipe.run({"doc_embedder": {"documents": documents}})
    return document_store


def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):
    """Build a RAG pipeline: embed the query, retrieve top_k documents, prompt the LLM, wrap replies as answers."""
    template = [
        ChatMessage.from_system(
            text="You have to answer the following question based on the given context information only."
        ),
        ChatMessage.from_user(
            text="""Context:
            {% for document in documents %}
                {{ document.content }}
            {% endfor %}
            Question: {{question}}"""
        ),
    ]

    rag = Pipeline()
    rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False))
    rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k))
    rag.add_component("prompt_builder", ChatPromptBuilder(template=template))
    rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini"))
    rag.add_component("answer_builder", AnswerBuilder())

    rag.connect("embedder", "retriever.query_embedding")
    rag.connect("retriever", "prompt_builder.documents")
    rag.connect("prompt_builder", "generator")
    rag.connect("generator.replies", "answer_builder.replies")
    # the retrieved documents also flow into the answer builder so each answer carries them
    rag.connect("retriever", "answer_builder.documents")
    return rag
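# For a single question, rag.run(...) returns (roughly) a dict like
#   {"answer_builder": {"answers": [GeneratedAnswer(data=..., documents=[...], ...)]}}
# and run_rag_pipeline() below unpacks exactly those fields.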


def evaluation_pipeline():
    """
    Create an evaluation pipeline with the following evaluators:
    - DocumentMRREvaluator
    - FaithfulnessEvaluator
    - SASEvaluator
    - DocumentMAPEvaluator
    - DocumentRecallEvaluator
    - ContextRelevanceEvaluator
    """
    eval_pipeline = Pipeline()
    eval_pipeline.add_component("doc_mrr", DocumentMRREvaluator())
    eval_pipeline.add_component("groundedness", FaithfulnessEvaluator())
    eval_pipeline.add_component("sas", SASEvaluator(model=EMBEDDINGS_MODEL))
    eval_pipeline.add_component("doc_map", DocumentMAPEvaluator())
    eval_pipeline.add_component("doc_recall_single_hit", DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT))
    eval_pipeline.add_component("doc_recall_multi_hit", DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT))
    eval_pipeline.add_component("relevance", ContextRelevanceEvaluator())
    return eval_pipeline
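# Note that the evaluators above are not connected to each other: each one receives its
# inputs directly through Pipeline.run(), using the dictionary built by built_eval_input().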


def built_eval_input(questions, truth_docs, truth_answers, retrieved_docs, contexts, pred_answers):  # pylint: disable=too-many-positional-arguments
    """Helper function to build the input for the evaluation pipeline"""
    return {
        "doc_mrr": {"ground_truth_documents": truth_docs, "retrieved_documents": retrieved_docs},
        "groundedness": {"questions": questions, "contexts": contexts, "predicted_answers": pred_answers},
        "sas": {"predicted_answers": pred_answers, "ground_truth_answers": truth_answers},
        "doc_map": {"ground_truth_documents": truth_docs, "retrieved_documents": retrieved_docs},
        "doc_recall_single_hit": {"ground_truth_documents": truth_docs, "retrieved_documents": retrieved_docs},
        "doc_recall_multi_hit": {"ground_truth_documents": truth_docs, "retrieved_documents": retrieved_docs},
        "relevance": {"questions": questions, "contexts": contexts},
    }


def run_rag_pipeline(documents, evaluation_questions, rag_pipeline_a):
    """
    Run the RAG pipeline over the evaluation questions and return, per question, the contexts,
    predicted answers, retrieved documents, and ground-truth documents.
    """
    truth_docs = []
    retrieved_docs = []
    contexts = []
    predicted_answers = []
    for q in evaluation_questions:
        response = rag_pipeline_a.run(
            {
                "embedder": {"text": q["question"]},
                "prompt_builder": {"question": q["question"]},
                "answer_builder": {"query": q["question"]},
            }
        )
        # the ground-truth documents are all non-empty lines indexed from the question's source article(s)
        truth_docs.append([doc for doc in documents if doc.meta["name"] in q["ground_truth_doc"] and doc.content])
        answer = response["answer_builder"]["answers"][0]
        retrieved_docs.append(answer.documents)
        contexts.append([doc.content for doc in answer.documents])
        predicted_answers.append(answer.data)
    return contexts, predicted_answers, retrieved_docs, truth_docs


def built_input_for_results_eval(rag_results):
    """Helper function to build the input for the results evaluation"""
    return {
        "Mean Reciprocal Rank": {
            "individual_scores": rag_results["doc_mrr"]["individual_scores"],
            "score": rag_results["doc_mrr"]["score"],
        },
        "Semantic Answer Similarity": {
            "individual_scores": rag_results["sas"]["individual_scores"],
            "score": rag_results["sas"]["score"],
        },
        "Faithfulness": {
            "individual_scores": rag_results["groundedness"]["individual_scores"],
            "score": rag_results["groundedness"]["score"],
        },
        "Document MAP": {
            "individual_scores": rag_results["doc_map"]["individual_scores"],
            "score": rag_results["doc_map"]["score"],
        },
        "Document Recall Single Hit": {
            "individual_scores": rag_results["doc_recall_single_hit"]["individual_scores"],
            "score": rag_results["doc_recall_single_hit"]["score"],
        },
        "Document Recall Multi Hit": {
            "individual_scores": rag_results["doc_recall_multi_hit"]["individual_scores"],
            "score": rag_results["doc_recall_multi_hit"]["score"],
        },
        "Contextual Relevance": {
            "individual_scores": rag_results["relevance"]["individual_scores"],
            "score": rag_results["relevance"]["score"],
        },
    }
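

# FaithfulnessEvaluator, ContextRelevanceEvaluator, and OpenAIChatGenerator all rely on the
# OpenAI API, hence the skip below when no OPENAI_API_KEY is exported.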
@pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY", None),
    reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
def test_evaluation_pipeline(samples_path):
    """Test an evaluation pipeline built from retrieval and LLM-based evaluators"""
    eval_questions = [
        {
            "question": 'What falls within the term "cultural anthropology"?',
            "answer": "the ideology and analytical stance of cultural relativism",
            "ground_truth_doc": ["Culture.txt"],
        },
        {
            "question": "Who was the spiritual guide during the Protestant Reformation?",
            "answer": "Martin Bucer",
            "ground_truth_doc": ["Strasbourg.txt"],
        },
        {
            "question": "What is materialism?",
            "answer": "a form of philosophical monism",
            "ground_truth_doc": ["Materialism.txt"],
        },
    ]
    questions = [q["question"] for q in eval_questions]
    truth_answers = [q["answer"] for q in eval_questions]

    # indexing documents: one Document per non-empty line, tagged with its source file name
    docs = []
    full_path = os.path.join(str(samples_path), "test_documents")
    for article in os.listdir(full_path):
        with open(os.path.join(full_path, article), "r") as f:
            for text in f.read().split("\n"):
                if text:
                    docs.append(Document(content=text, meta={"name": article}))
    doc_store = indexing_pipeline(docs)

    # running the RAG pipeline A + evaluation pipeline
    rag_pipeline_a = rag_pipeline(doc_store, top_k=2)
    contexts_a, pred_answers_a, retrieved_docs_a, truth_docs = run_rag_pipeline(docs, eval_questions, rag_pipeline_a)
    eval_pipeline = evaluation_pipeline()
    eval_input = built_eval_input(questions, truth_docs, truth_answers, retrieved_docs_a, contexts_a, pred_answers_a)
    results_rag_a = eval_pipeline.run(eval_input)

    # building the EvaluationRunResult for pipeline A
    inputs_a = {
        "question": questions,
        "contexts": contexts_a,
        "answer": truth_answers,
        "predicted_answer": pred_answers_a,
    }
    results_a = built_input_for_results_eval(results_rag_a)
    evaluation_result_a = EvaluationRunResult(run_name="rag_pipeline_a", results=results_a, inputs=inputs_a)
    aggregated_score_report_json = evaluation_result_a.aggregated_report()

    # assert the aggregated score report has all the metrics
    assert len(aggregated_score_report_json["metrics"]) == 7
    assert list(aggregated_score_report_json.keys()) == ["metrics", "score"]
    assert list(aggregated_score_report_json["metrics"]) == [
        "Mean Reciprocal Rank",
        "Semantic Answer Similarity",
        "Faithfulness",
        "Document MAP",
        "Document Recall Single Hit",
        "Document Recall Multi Hit",
        "Contextual Relevance",
    ]

    # assert the detailed report has all the metrics, inputs and questions
    detailed_report_json = evaluation_result_a.detailed_report()
    assert list(detailed_report_json.keys()) == [
        "question",
        "contexts",
        "answer",
        "predicted_answer",
        "Mean Reciprocal Rank",
        "Semantic Answer Similarity",
        "Faithfulness",
        "Document MAP",
        "Document Recall Single Hit",
        "Document Recall Multi Hit",
        "Contextual Relevance",
    ]

    # running the RAG pipeline B
    rag_pipeline_b = rag_pipeline(doc_store, top_k=4)
    contexts_b, pred_answers_b, retrieved_docs_b, truth_docs = run_rag_pipeline(docs, eval_questions, rag_pipeline_b)
    eval_input = built_eval_input(questions, truth_docs, truth_answers, retrieved_docs_b, contexts_b, pred_answers_b)
    results_rag_b = eval_pipeline.run(eval_input)

    inputs_b = {
        "question": questions,
        "contexts": contexts_b,
        "answer": truth_answers,
        "predicted_answer": pred_answers_b,
    }
    results_b = built_input_for_results_eval(results_rag_b)
    evaluation_result_b = EvaluationRunResult(run_name="rag_pipeline_b", results=results_b, inputs=inputs_b)
    comparative_json = evaluation_result_a.comparative_detailed_report(evaluation_result_b)

    # assert the comparative score report has all the metrics, inputs and questions
    assert list(comparative_json.keys()) == [
        "question",
        "contexts",
        "answer",
        "predicted_answer",
        "rag_pipeline_a_Mean Reciprocal Rank",
        "rag_pipeline_a_Semantic Answer Similarity",
        "rag_pipeline_a_Faithfulness",
        "rag_pipeline_a_Document MAP",
        "rag_pipeline_a_Document Recall Single Hit",
        "rag_pipeline_a_Document Recall Multi Hit",
        "rag_pipeline_a_Contextual Relevance",
        "rag_pipeline_b_Mean Reciprocal Rank",
        "rag_pipeline_b_Semantic Answer Similarity",
        "rag_pipeline_b_Faithfulness",
        "rag_pipeline_b_Document MAP",
        "rag_pipeline_b_Document Recall Single Hit",
        "rag_pipeline_b_Document Recall Multi Hit",
        "rag_pipeline_b_Contextual Relevance",
    ]
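

# A minimal sketch of how this test is typically invoked (it assumes the e2e conftest supplies
# the `samples_path` fixture pointing at the directory that contains test_documents/):
#
#   OPENAI_API_KEY=... pytest e2e/pipelines/test_evaluation_pipeline.py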