# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
# mypy: ignore-errors
# pylint: skip-file
###
### This is an example script showing how to use the eval function to evaluate a RAG Pipeline.
### For more information, see the related proposal (5794).
###
import os
from haystack import Pipeline
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators.openai.gpt35 import GPT35Generator
from haystack.components.retrievers.memory import MemoryBM25Retriever
from haystack.dataclasses.document import Document
from haystack.document_stores.memory import MemoryDocumentStore
docstore = MemoryDocumentStore()
# Write some fake documents
docstore.write_documents(
    [
        Document(content="This is not the answer you are looking for.", meta={"name": "Obi-Wan Kenobi"}),
        Document(content="This is the way.", meta={"name": "Mandalorian"}),
        Document(content="The answer to life, the universe and everything is 42.", meta={"name": "Deep Thought"}),
        Document(content="When you play the game of thrones, you win or you die.", meta={"name": "Cersei Lannister"}),
        Document(content="Winter is coming.", meta={"name": "Ned Stark"}),
    ]
)
# Create our retriever; we set top_k to 3 to get only the 3 best documents (the default is 10)
retriever = MemoryBM25Retriever(document_store=docstore, top_k=3)
# Create our prompt template
template = """Given the context please answer the question.
Context:
{# We're receiving a list of lists, so we handle it like this #}
{% for list in documents %}
{% for doc in list %}
{{- doc -}};
{% endfor %}
{% endfor %}
Question: {{ question }};
Answer:
"""
prompt_builder = PromptBuilder(template)
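# Purely for illustration (not part of the proposal): rendering the template directly with Jinja2
# shows how the nested loop flattens the retriever's list-of-lists output into the prompt.
# PromptBuilder performs this rendering for us inside the pipeline; the document below is made up.
from jinja2 import Template

example_prompt = Template(template).render(
    documents=[[Document(content="This is the way.", meta={"name": "Mandalorian"})]],
    question="What is the way?",
)
print(example_prompt)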
# We're using OpenAI gpt-3.5
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
generator = GPT35Generator(api_key=OPENAI_API_KEY)
# Build the pipeline
pipe = Pipeline()
pipe.add_component("docs_retriever", retriever)
pipe.add_component("builder", prompt_builder)
pipe.add_component("gpt35", generator)
pipe.connect("docs_retriever.documents", "builder.documents")
pipe.connect("builder.prompt", "gpt35.prompt")
# Run the pipeline
query = "What is the answer to life, the universe and everything?"
result = pipe.run({"docs_retriever": {"queries": [query]}, "builder": {"question": query}})
print(result["gpt35"]["replies"])
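# With the documents written above, the reply should look something like:
# ['The answer to life, the universe and everything is 42.']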
# These are the inputs that will be passed to the Pipeline when running eval, much like we did a couple of lines above
inputs = [
    {"docs_retriever": {"queries": ["What is the answer?"]}, "builder": {"question": "What is the answer?"}},
    {
        "docs_retriever": {"queries": ["Take a deep breath and think. What is the answer?"]},
        "builder": {"question": "Take a deep breath and think. What is the answer?"},
    },
    {
        "docs_retriever": {"queries": ["What is the answer to life, the universe and everything?"]},
        "builder": {"question": "What is the answer to life, the universe and everything?"},
    },
]
# These are the expected outputs that will be compared to the actual outputs of the Pipeline.
# We have a dictionary for each input, so that len(inputs) == len(expected_output).
# This makes it possible to have a different expected output for each input.
# NOTE: I omitted the gpt35 metadata output because it's too long.
expected_output = [
    {
        # This is the output that we expect from the docs_retriever component
        "docs_retriever": {
            "documents": [
                [
                    Document(
                        content="The answer to life, the universe and everything is 42.", meta={"name": "Deep Thought"}
                    ),
                    Document(content="This is not the answer you are looking for.", meta={"name": "Obi-Wan Kenobi"}),
                    Document(content="This is the way.", meta={"name": "Mandalorian"}),
                ]
            ]
        },
        # This is the output that we expect from the builder component
        "builder": {"prompt": "I should write the actual template here but I'm lazy so I won't."},
        # This is the output that we expect from the gpt35 component
        "gpt35": {"replies": ["The answer to life, the universe and everything is 42."], "metadata": {}},
    },
    {
        "docs_retriever": {
            "documents": [
                [
                    Document(
                        content="The answer to life, the universe and everything is 42.", meta={"name": "Deep Thought"}
                    ),
                    Document(content="This is not the answer you are looking for.", meta={"name": "Obi-Wan Kenobi"}),
                    Document(content="This is the way.", meta={"name": "Mandalorian"}),
                ]
            ]
        },
        "builder": {"prompt": "I should write the actual template here but I'm lazy so I won't."},
        "gpt35": {"replies": ["The answer to life, the universe and everything is 42."], "metadata": {}},
    },
    {
        "docs_retriever": {
            "documents": [
                [
                    Document(
                        content="The answer to life, the universe and everything is 42.", meta={"name": "Deep Thought"}
                    ),
                    Document(content="This is not the answer you are looking for.", meta={"name": "Obi-Wan Kenobi"}),
                    Document(content="This is the way.", meta={"name": "Mandalorian"}),
                ]
            ]
        },
        "builder": {"prompt": "I should write the actual template here but I'm lazy so I won't."},
        "gpt35": {"replies": ["The answer to life, the universe and everything is 42."], "metadata": {}},
    },
]
eval_result = eval(pipe, inputs=inputs, expected_output=expected_output)
metrics = eval_result.calculate_metrics(Metric.SAS)  # noqa
metrics.save("path/to/file.csv")
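# Sketch only, not the proposed API: Semantic Answer Similarity (SAS) is roughly the similarity
# between embeddings of the predicted and expected answers. The sentence-transformers model name
# below is an assumption picked for illustration.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
predicted = "The answer to life, the universe and everything is 42."
expected = expected_output[2]["gpt35"]["replies"][0]
embeddings = model.encode([predicted, expected], convert_to_tensor=True)
print(util.cos_sim(embeddings[0], embeddings[1]).item())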