mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-15 21:10:46 +00:00

* Add SAS metric * Add release notes * Round similarity scores for precision consistency * Add tolerance to tests * Update haystack/evaluation/eval.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Add types for preprocess_text; Add additional types for f1 and em methods --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
337 lines
14 KiB
Python
337 lines
14 KiB
Python
import json
|
|
import pytest
|
|
|
|
from haystack import Pipeline
|
|
from haystack.components.builders.answer_builder import AnswerBuilder
|
|
from haystack.components.builders.prompt_builder import PromptBuilder
|
|
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
|
|
from haystack.components.generators import HuggingFaceLocalGenerator
|
|
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
|
|
from haystack.components.writers import DocumentWriter
|
|
from haystack.dataclasses import Document, GeneratedAnswer
|
|
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
|
from haystack.evaluation.eval import eval
|
|
from haystack.evaluation.metrics import Metric
|
|
|
|
|
|
def test_bm25_rag_pipeline(tmp_path):
    """Evaluate a BM25-backed RAG pipeline and check its EM, F1 and SAS metrics.

    The pipeline chains an in-memory BM25 retriever, a prompt builder, a local
    Hugging Face generator and an answer builder. It is run through ``eval`` on
    three questions; each default metric result is also saved as JSON under
    ``tmp_path`` and loaded again to check that it compares equal to the
    in-memory result.
    """
    prompt_template = """
    Given these documents, answer the question.\nDocuments:
    {% for doc in documents %}
        {{ doc.content }}
    {% endfor %}

    \nQuestion: {{question}}
    \nAnswer:
    """

    # Build every component first, then register them in pipeline order.
    bm25_retriever = InMemoryBM25Retriever(document_store=InMemoryDocumentStore())
    prompt_builder = PromptBuilder(template=prompt_template)
    generator = HuggingFaceLocalGenerator(
        model="google/flan-t5-small",
        task="text2text-generation",
        generation_kwargs={"max_new_tokens": 100, "temperature": 0.5, "do_sample": True},
    )
    answer_builder = AnswerBuilder()

    rag_pipeline = Pipeline()
    rag_pipeline.add_component(name="retriever", instance=bm25_retriever)
    rag_pipeline.add_component(name="prompt_builder", instance=prompt_builder)
    rag_pipeline.add_component(name="llm", instance=generator)
    rag_pipeline.add_component(name="answer_builder", instance=answer_builder)

    # The retriever output feeds both the prompt and the final answers.
    for sender, receiver in (
        ("retriever", "prompt_builder.documents"),
        ("prompt_builder", "llm"),
        ("llm.replies", "answer_builder.replies"),
        ("retriever", "answer_builder.documents"),
    ):
        rag_pipeline.connect(sender, receiver)

    # Populate the document store
    jean = "My name is Jean and I live in Paris."
    mark = "My name is Mark and I live in Berlin."
    giorgio = "My name is Giorgio and I live in Rome."
    retriever_component = rag_pipeline.get_component("retriever")
    retriever_component.document_store.write_documents([Document(content=text) for text in (jean, mark, giorgio)])

    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
    inputs = [
        {
            "retriever": {"query": query_text},
            "prompt_builder": {"question": query_text},
            "answer_builder": {"query": query_text},
        }
        for query_text in questions
    ]

    # Every expected answer lists one best-scored document followed by two equally scored ones.
    best_score, other_score = 0.33144005810482535, -0.17938556566116537

    def expected_output(answer, query, ranked_contents):
        """Return one expected entry, keyed by the ``answer_builder`` component name."""
        ranked_docs = [
            Document(content=content, score=score)
            for content, score in zip(ranked_contents, (best_score, other_score, other_score))
        ]
        generated = GeneratedAnswer(data=answer, query=query, documents=ranked_docs, meta={})
        return {"answer_builder": {"answers": [generated]}}

    expected_outputs = [
        expected_output("Jean", "Who lives in Paris?", (jean, giorgio, mark)),
        expected_output("Mark", "Who lives in Berlin?", (mark, giorgio, jean)),
        expected_output("Giorgio", "Who lives in Rome?", (giorgio, mark, jean)),
    ]

    eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs)

    assert eval_result.inputs == inputs
    assert eval_result.expected_outputs == expected_outputs
    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
    assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

    # Text-normalisation switches shared by all "custom parameters" metric runs.
    normalization = {"ignore_case": True, "ignore_punctuation": True, "ignore_numbers": True}

    # Test Exact Match
    em_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
    em_custom_parameters = eval_result.calculate_metrics(Metric.EM, output_key="answers", **normalization)
    # Save EM metric results to json
    em_file = tmp_path / "exact_match_score.json"
    em_default.save(em_file)

    assert em_default["exact_match"] == 1.0
    assert em_custom_parameters["exact_match"] == 1.0
    with open(em_file, "r") as saved:
        em_reloaded = json.load(saved)
    assert em_default == em_reloaded

    # Test F1
    f1_default = eval_result.calculate_metrics(Metric.F1, output_key="answers")
    f1_custom_parameters = eval_result.calculate_metrics(Metric.F1, output_key="answers", **normalization)
    # Save F1 metric results to json
    f1_file = tmp_path / "f1_score.json"
    f1_default.save(f1_file)

    assert f1_default["f1"] == 1.0
    assert f1_custom_parameters["f1"] == 1.0
    with open(f1_file, "r") as saved:
        f1_reloaded = json.load(saved)
    assert f1_default == f1_reloaded

    # Test SAS
    sas_default = eval_result.calculate_metrics(
        Metric.SAS, output_key="answers", model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    sas_custom_parameters = eval_result.calculate_metrics(
        Metric.SAS, output_key="answers", model="cross-encoder/ms-marco-MiniLM-L-6-v2", **normalization
    )
    # Save SAS metric results to json
    sas_file = tmp_path / "sas_score.json"
    sas_default.save(sas_file)

    # The first SAS run must score (approximately) 1.0 everywhere; the second run,
    # which uses the cross-encoder model, is compared against recorded values with
    # an absolute tolerance.
    assert sas_default["sas"] == pytest.approx(1.0)
    assert sas_default["scores"] == pytest.approx([1.0, 1.0, 1.0])
    assert sas_custom_parameters["sas"] == pytest.approx(0.9769593, abs=1e-5)
    assert sas_custom_parameters["scores"] == pytest.approx([0.975823, 0.957218, 0.997837], abs=1e-5)

    with open(sas_file, "r") as saved:
        sas_reloaded = json.load(saved)
    assert sas_default == sas_reloaded
|
|
|
|
|
|
def test_embedding_retrieval_rag_pipeline(tmp_path):
    """Evaluate an embedding-retrieval RAG pipeline and check its EM, F1 and SAS metrics.

    A text embedder, an in-memory embedding retriever, a prompt builder, a local
    Hugging Face generator and an answer builder are wired into one pipeline.
    The documents are embedded and written by a separate indexing pipeline that
    shares the retriever's document store. The RAG pipeline is then run through
    ``eval`` on three questions, and each default metric result is saved as JSON
    under ``tmp_path`` and loaded again for comparison.
    """
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

    # Create the RAG pipeline
    prompt_template = """
    Given these documents, answer the question.\nDocuments:
    {% for doc in documents %}
        {{ doc.content }}
    {% endfor %}

    \nQuestion: {{question}}
    \nAnswer:
    """
    components = [
        ("text_embedder", SentenceTransformersTextEmbedder(model=embedding_model)),
        ("retriever", InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore())),
        ("prompt_builder", PromptBuilder(template=prompt_template)),
        (
            "llm",
            HuggingFaceLocalGenerator(
                model="google/flan-t5-small",
                task="text2text-generation",
                generation_kwargs={"max_new_tokens": 100, "temperature": 0.5, "do_sample": True},
            ),
        ),
        ("answer_builder", AnswerBuilder()),
    ]
    connections = [
        ("text_embedder", "retriever"),
        ("retriever", "prompt_builder.documents"),
        ("prompt_builder", "llm"),
        ("llm.replies", "answer_builder.replies"),
        ("retriever", "answer_builder.documents"),
    ]

    rag_pipeline = Pipeline()
    for component_name, component in components:
        rag_pipeline.add_component(instance=component, name=component_name)
    for sender, receiver in connections:
        rag_pipeline.connect(sender, receiver)

    # Populate the document store
    jean = "My name is Jean and I live in Paris."
    mark = "My name is Mark and I live in Berlin."
    giorgio = "My name is Giorgio and I live in Rome."
    documents = [Document(content=text) for text in (jean, mark, giorgio)]

    # Index through the store owned by the RAG pipeline's retriever.
    document_store = rag_pipeline.get_component("retriever").document_store
    document_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model)
    document_writer = DocumentWriter(document_store=document_store)

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_component(instance=document_embedder, name="document_embedder")
    indexing_pipeline.add_component(instance=document_writer, name="document_writer")
    indexing_pipeline.connect("document_embedder", "document_writer")
    indexing_pipeline.run({"document_embedder": {"documents": documents}})

    # Query and assert
    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
    inputs = [
        {
            "prompt_builder": {"question": query_text},
            "text_embedder": {"text": query_text},
            "answer_builder": {"query": query_text},
        }
        for query_text in questions
    ]

    # NOTE(review): these score literals are identical to the ones used in
    # test_bm25_rag_pipeline; confirm whether they are meant to reflect
    # embedding-retrieval scores.
    score_pattern = (0.33144005810482535, -0.17938556566116537, -0.17938556566116537)

    def build_expected(answer, query, ranked_contents):
        """Return one expected entry, keyed by the ``answer_builder`` component name."""
        scored_docs = [Document(content=text, score=value) for text, value in zip(ranked_contents, score_pattern)]
        return {
            "answer_builder": {
                "answers": [GeneratedAnswer(data=answer, query=query, documents=scored_docs, meta={})]
            }
        }

    expected_outputs = [
        build_expected("Jean", "Who lives in Paris?", (jean, giorgio, mark)),
        build_expected("Mark", "Who lives in Berlin?", (mark, giorgio, jean)),
        build_expected("Giorgio", "Who lives in Rome?", (giorgio, mark, jean)),
    ]

    eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs)

    assert eval_result.inputs == inputs
    assert eval_result.expected_outputs == expected_outputs
    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
    assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

    # Keyword arguments shared by every metric call; the "custom" variant adds
    # the three text-normalisation switches.
    default_kwargs = {"output_key": "answers"}
    custom_kwargs = {"output_key": "answers", "ignore_case": True, "ignore_punctuation": True, "ignore_numbers": True}

    # Test Exact Match
    em_default = eval_result.calculate_metrics(Metric.EM, **default_kwargs)
    em_custom_parameters = eval_result.calculate_metrics(Metric.EM, **custom_kwargs)
    # Save EM metric results to json
    em_json = tmp_path / "exact_match_score.json"
    em_default.save(em_json)

    assert em_default["exact_match"] == 1.0
    assert em_custom_parameters["exact_match"] == 1.0
    with open(em_json, "r") as handle:
        em_from_disk = json.load(handle)
    assert em_default == em_from_disk

    # Test F1
    f1_default = eval_result.calculate_metrics(Metric.F1, **default_kwargs)
    f1_custom_parameters = eval_result.calculate_metrics(Metric.F1, **custom_kwargs)
    # Save F1 metric results to json
    f1_json = tmp_path / "f1_score.json"
    f1_default.save(f1_json)

    assert f1_default["f1"] == 1.0
    assert f1_custom_parameters["f1"] == 1.0
    with open(f1_json, "r") as handle:
        f1_from_disk = json.load(handle)
    assert f1_default == f1_from_disk

    # Test SAS
    sas_default = eval_result.calculate_metrics(
        Metric.SAS, model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2", **default_kwargs
    )
    sas_custom_parameters = eval_result.calculate_metrics(
        Metric.SAS, model="cross-encoder/ms-marco-MiniLM-L-6-v2", **custom_kwargs
    )
    # Save SAS metric results to json
    sas_json = tmp_path / "sas_score.json"
    sas_default.save(sas_json)

    # Float scores are compared approximately; the cross-encoder run uses an
    # explicit absolute tolerance against recorded values.
    assert sas_default["sas"] == pytest.approx(1.0)
    assert sas_default["scores"] == pytest.approx([1.0, 1.0, 1.0])
    assert sas_custom_parameters["sas"] == pytest.approx(0.9769593, abs=1e-5)
    assert sas_custom_parameters["scores"] == pytest.approx([0.975823, 0.957218, 0.997837], abs=1e-5)

    with open(sas_json, "r") as handle:
        sas_from_disk = json.load(handle)
    assert sas_default == sas_from_disk
|