feat: Add Eval and EvaluationResult (#6505)

* Add initial implementation for Eval and EvaluationResult

* Add release notes

* Update files with suggestions from review

* Remove serialization

* Add eval e2e tests

* Update eval e2e tests
Ashwin Mathur 2023-12-18 15:59:09 +05:30 committed by GitHub
parent 3e0e81b1e0
commit 46b395eec3
7 changed files with 522 additions and 0 deletions

@@ -0,0 +1,85 @@
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.routers import DocumentJoiner, FileTypeRouter
from haystack.components.writers import DocumentWriter
from haystack.dataclasses import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval


def test_dense_doc_search_pipeline(samples_path):
# Create the indexing pipeline
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(
instance=FileTypeRouter(mime_types=["text/plain", "application/pdf"]), name="file_type_router"
)
indexing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
indexing_pipeline.add_component(instance=PyPDFToDocument(), name="pdf_file_converter")
indexing_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
indexing_pipeline.add_component(
instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter"
)
indexing_pipeline.add_component(
instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
name="embedder",
)
indexing_pipeline.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="writer")
indexing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
indexing_pipeline.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
indexing_pipeline.connect("text_file_converter.documents", "joiner.documents")
indexing_pipeline.connect("pdf_file_converter.documents", "joiner.documents")
indexing_pipeline.connect("joiner.documents", "cleaner.documents")
indexing_pipeline.connect("cleaner.documents", "splitter.documents")
indexing_pipeline.connect("splitter.documents", "embedder.documents")
indexing_pipeline.connect("embedder.documents", "writer.documents")
indexing_pipeline.run({"file_type_router": {"sources": list(samples_path.iterdir())}})
filled_document_store = indexing_pipeline.get_component("writer").document_store
# Create the querying pipeline
query_pipeline = Pipeline()
query_pipeline.add_component(
instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
name="text_embedder",
)
query_pipeline.add_component(
instance=InMemoryEmbeddingRetriever(document_store=filled_document_store, top_k=20), name="embedding_retriever"
)
query_pipeline.connect("text_embedder", "embedding_retriever")
inputs = [{"text_embedder": {"text": "Who lives in Rome?"}}]
expected_outputs = [
{
"embedding_retriever": {
"documents": [
Document(
id="d219162e5d0b8e5eab901e32ce0d9c12d24e5ea26a92780442fcfa560eb0b7d6",
content="My name is Giorgio and I live in Rome.",
meta={
"file_path": "/home/ashwin/data_science/0ashwin/opensource/haystack/e2e/samples/doc_1.txt",
"source_id": "0366ae1654f4573564e29184cd4a2232286a93f4f25d6790ce703ae7d4d7d63c",
},
score=0.627746287158654,
),
Document(
id="2dcf2bc0307ba21fbb7e97a307d987a05297e577a44f170081acdbab9fc4b95f",
content="A sample PDF file History and standardizationFormat (PDF) Adobe Systems made the PDF specification ava...",
meta={"source_id": "ec1ac6c430ecd0cc74ae56f3e2d84f93fef3f5393de6901fe8aa01e494ebcdbe"},
score=-0.060180130727963355,
),
]
}
}
]
eval_result = eval(query_pipeline, inputs=inputs, expected_outputs=expected_outputs)
assert eval_result.inputs == inputs
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == query_pipeline.to_dict()

@@ -0,0 +1,125 @@
from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.dataclasses import Document, ExtractedAnswer
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval


def test_extractive_qa_pipeline():
# Create the pipeline
qa_pipeline = Pipeline()
qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
qa_pipeline.connect("retriever", "reader")
# Populate the document store
documents = [
Document(content="My name is Jean and I live in Paris."),
Document(content="My name is Mark and I live in Berlin."),
Document(content="My name is Giorgio and I live in Rome."),
]
qa_pipeline.get_component("retriever").document_store.write_documents(documents)
# Query and assert
questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
inputs = [{"retriever": {"query": question}, "reader": {"query": question, "top_k": 1}} for question in questions]
expected_outputs = [
{
"reader": {
"answers": [
ExtractedAnswer(
query="Who lives in Paris?",
score=0.7713339924812317,
data="Jean and I",
document=Document(
id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e",
content="My name is Jean and I live in Paris.",
score=0.33144005810482535,
),
context=None,
document_offset=ExtractedAnswer.Span(start=11, end=21),
context_offset=None,
meta={},
),
ExtractedAnswer(
query="Who lives in Paris?",
score=0.2286660075187683,
data=None,
document=None,
context=None,
document_offset=None,
context_offset=None,
meta={},
),
]
}
},
{
"reader": {
"answers": [
ExtractedAnswer(
query="Who lives in Berlin?",
score=0.7047999501228333,
data="Mark and I",
document=Document(
id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20",
content="My name is Mark and I live in Berlin.",
score=0.33144005810482535,
),
context=None,
document_offset=ExtractedAnswer.Span(start=11, end=21),
context_offset=None,
meta={},
),
ExtractedAnswer(
query="Who lives in Berlin?",
score=0.29520004987716675,
data=None,
document=None,
context=None,
document_offset=None,
context_offset=None,
meta={},
),
]
}
},
{
"reader": {
"answers": [
ExtractedAnswer(
query="Who lives in Rome?",
score=0.7661304473876953,
data="Giorgio and I",
document=Document(
id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27",
content="My name is Giorgio and I live in Rome.",
score=0.33144005810482535,
),
context=None,
document_offset=ExtractedAnswer.Span(start=11, end=24),
context_offset=None,
meta={},
),
ExtractedAnswer(
query="Who lives in Rome?",
score=0.2338695526123047,
data=None,
document=None,
context=None,
document_offset=None,
context_offset=None,
meta={},
),
]
}
},
]
eval_result = eval(qa_pipeline, inputs=inputs, expected_outputs=expected_outputs)
assert eval_result.inputs == inputs
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()

@@ -0,0 +1,100 @@
from haystack import Document, Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.routers.document_joiner import DocumentJoiner
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval


def test_hybrid_doc_search_pipeline():
# Create the pipeline
document_store = InMemoryDocumentStore()
hybrid_pipeline = Pipeline()
hybrid_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=document_store), name="bm25_retriever")
hybrid_pipeline.add_component(
instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
name="text_embedder",
)
hybrid_pipeline.add_component(
instance=InMemoryEmbeddingRetriever(document_store=document_store), name="embedding_retriever"
)
hybrid_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
hybrid_pipeline.add_component(instance=TransformersSimilarityRanker(top_k=2), name="ranker")
hybrid_pipeline.connect("bm25_retriever", "joiner")
hybrid_pipeline.connect("text_embedder", "embedding_retriever")
hybrid_pipeline.connect("embedding_retriever", "joiner")
hybrid_pipeline.connect("joiner", "ranker")
# Populate the document store
documents = [
Document(content="My name is Jean and I live in Paris."),
Document(content="My name is Mark and I live in Berlin."),
Document(content="My name is Mario and I live in the capital of Italy."),
Document(content="My name is Giorgio and I live in Rome."),
]
hybrid_pipeline.get_component("bm25_retriever").document_store.write_documents(documents)
questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
inputs = [
{"bm25_retriever": {"query": question}, "text_embedder": {"text": question}, "ranker": {"query": question}}
for question in questions
]
expected_outputs = [
{
"ranker": {
"documents": [
Document(
id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e",
content="My name is Jean and I live in Paris.",
score=2.2277960777282715,
),
Document(
id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20",
content="My name is Mark and I live in Berlin.",
score=-7.304897308349609,
),
]
}
},
{
"ranker": {
"documents": [
Document(
id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20",
content="My name is Mark and I live in Berlin.",
score=3.694173812866211,
),
Document(
id="f7533b5c6c968680d0ef8e38f366d4e68b7ac0d7238f1b1b366d15cb9c33efd8",
content="My name is Mario and I live in the capital of Italy.",
score=-9.008655548095703,
),
]
}
},
{
"ranker": {
"documents": [
Document(
id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27",
content="My name is Giorgio and I live in Rome.",
score=3.487802028656006,
),
Document(
id="f7533b5c6c968680d0ef8e38f366d4e68b7ac0d7238f1b1b366d15cb9c33efd8",
content="My name is Mario and I live in the capital of Italy.",
score=-2.873128890991211,
),
]
}
},
]
eval_result = eval(hybrid_pipeline, inputs=inputs, expected_outputs=expected_outputs)
assert eval_result.inputs == inputs
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == hybrid_pipeline.to_dict()

@@ -0,0 +1,145 @@
from haystack import Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.generators import HuggingFaceLocalGenerator
from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.dataclasses import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval


def test_bm25_rag_pipeline():
prompt_template = """
Given these documents, answer the question.\nDocuments:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
\nQuestion: {{question}}
\nAnswer:
"""
rag_pipeline = Pipeline()
rag_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
rag_pipeline.add_component(
instance=HuggingFaceLocalGenerator(
model_name_or_path="google/flan-t5-small",
task="text2text-generation",
generation_kwargs={"max_new_tokens": 100, "temperature": 0.5, "do_sample": True},
),
name="llm",
)
rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")
rag_pipeline.connect("llm.replies", "answer_builder.replies")
rag_pipeline.connect("retriever", "answer_builder.documents")
# Populate the document store
documents = [
Document(content="My name is Jean and I live in Paris."),
Document(content="My name is Mark and I live in Berlin."),
Document(content="My name is Giorgio and I live in Rome."),
]
rag_pipeline.get_component("retriever").document_store.write_documents(documents)
questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
inputs = [
{
"retriever": {"query": question},
"prompt_builder": {"question": question},
"answer_builder": {"query": question},
}
for question in questions
]
expected_outputs = [
{"llm": {"replies": ["Jean"]}},
{"llm": {"replies": ["Mark"]}},
{"llm": {"replies": ["Giorgio"]}},
]
eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs)
assert eval_result.inputs == inputs
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
    assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()


def test_embedding_retrieval_rag_pipeline():
# Create the RAG pipeline
prompt_template = """
Given these documents, answer the question.\nDocuments:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
\nQuestion: {{question}}
\nAnswer:
"""
rag_pipeline = Pipeline()
rag_pipeline.add_component(
instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
name="text_embedder",
)
rag_pipeline.add_component(
instance=InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()), name="retriever"
)
rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
rag_pipeline.add_component(
instance=HuggingFaceLocalGenerator(
model_name_or_path="google/flan-t5-small",
task="text2text-generation",
generation_kwargs={"max_new_tokens": 100, "temperature": 0.5, "do_sample": True},
),
name="llm",
)
rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
rag_pipeline.connect("text_embedder", "retriever")
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")
rag_pipeline.connect("llm.replies", "answer_builder.replies")
rag_pipeline.connect("retriever", "answer_builder.documents")
# Populate the document store
documents = [
Document(content="My name is Jean and I live in Paris."),
Document(content="My name is Mark and I live in Berlin."),
Document(content="My name is Giorgio and I live in Rome."),
]
document_store = rag_pipeline.get_component("retriever").document_store
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(
instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
name="document_embedder",
)
indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer")
indexing_pipeline.connect("document_embedder", "document_writer")
indexing_pipeline.run({"document_embedder": {"documents": documents}})
# Query and assert
questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
inputs = [
{
"prompt_builder": {"question": question},
"text_embedder": {"text": question},
"answer_builder": {"query": question},
}
for question in questions
]
expected_outputs = [
{"llm": {"replies": ["Jean"]}},
{"llm": {"replies": ["Mark"]}},
{"llm": {"replies": ["Giorgio"]}},
]
eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs)
assert eval_result.inputs == inputs
assert eval_result.expected_outputs == expected_outputs
assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()

@@ -0,0 +1,3 @@
from haystack.evaluation.eval import EvaluationResult, eval

__all__ = ["eval", "EvaluationResult"]
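
Because of this re-export, the public API can also be imported from the package itself rather than from the eval module (a small illustrative sketch, not part of this commit):

# Both names are re-exported by haystack/evaluation/__init__.py,
# so they can be imported from the package root:
from haystack.evaluation import EvaluationResult, eval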

@@ -0,0 +1,60 @@
from typing import Any, Dict, List, Union

from haystack import Pipeline
from haystack.core.component import Component


class EvaluationResult:
"""
EvaluationResult keeps track of all the information related to evaluation, namely the runnable (Pipeline or component), inputs, outputs, and expected outputs.
The EvaluationResult keeps track of all the information stored by eval.
:param runnable: The runnable (Pipeline or component) used for evaluation.
:param inputs: List of inputs used for evaluation.
:param outputs: List of outputs generated by the runnable.
:param expected_outputs: List of expected outputs used for evaluation.
"""
def __init__(
self,
runnable: Union[Pipeline, Component],
inputs: List[Dict[str, Any]],
outputs: List[Dict[str, Any]],
expected_outputs: List[Dict[str, Any]],
) -> None:
self.runnable = runnable
self.inputs = inputs
self.outputs = outputs
        self.expected_outputs = expected_outputs


def eval(
runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]]
) -> EvaluationResult:
"""
Evaluates the provided Pipeline or component based on the given inputs and expected outputs.
This function facilitates the evaluation of a given runnable (either a Pipeline or a component) using the provided
inputs and corresponding expected outputs.
:param runnable: The runnable (Pipeline or component) used for evaluation.
:param inputs: List of inputs used for evaluation.
:param expected_outputs: List of expected outputs used for evaluation.
:return: An instance of EvaluationResult containing information about the evaluation, including the runnable, inputs, outputs, and expected outputs.
"""
    outputs = []

    # Check that the expected outputs have the correct shape
    if len(inputs) != len(expected_outputs):
        raise ValueError(
            f"The number of inputs ({len(inputs)}) does not match the number of expected outputs "
            f"({len(expected_outputs)}). Please ensure that each input has a corresponding expected output."
        )
for input_ in inputs:
output = runnable.run(input_)
outputs.append(output)
return EvaluationResult(runnable, inputs, outputs, expected_outputs)
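
Note that eval computes no metrics itself: it runs the runnable once per input and stores the raw outputs alongside the expectations in an EvaluationResult, leaving any comparison to the caller. A minimal usage sketch follows; the one-retriever pipeline and the placeholder expectation are illustrative, not part of this commit:

from haystack import Document, Pipeline
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation.eval import eval

# Index a single document into an in-memory store.
document_store = InMemoryDocumentStore()
document_store.write_documents([Document(content="My name is Jean and I live in Paris.")])

# A one-component pipeline is enough to exercise eval.
pipeline = Pipeline()
pipeline.add_component(instance=InMemoryBM25Retriever(document_store=document_store), name="retriever")

inputs = [{"retriever": {"query": "Who lives in Paris?"}}]
expected_outputs = [{"retriever": {"documents": []}}]  # placeholder expectation, for illustration only

result = eval(pipeline, inputs=inputs, expected_outputs=expected_outputs)

# Comparing the collected outputs against the expectations is up to the caller.
for output, expected in zip(result.outputs, result.expected_outputs):
    print(output == expected)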

@@ -0,0 +1,4 @@
preview:
  - |
    Adds an eval function for the evaluation of components and Pipelines.
    Adds an EvaluationResult class to store the results of an evaluation.