diff --git a/e2e/pipelines/test_eval_dense_doc_search.py b/e2e/pipelines/test_eval_dense_doc_search.py
new file mode 100644
index 000000000..b17f052af
--- /dev/null
+++ b/e2e/pipelines/test_eval_dense_doc_search.py
@@ -0,0 +1,85 @@
+from haystack import Pipeline
+from haystack.components.converters import PyPDFToDocument, TextFileToDocument
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
+from haystack.components.retrievers import InMemoryEmbeddingRetriever
+from haystack.components.routers import DocumentJoiner, FileTypeRouter
+from haystack.components.writers import DocumentWriter
+from haystack.dataclasses import Document
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.evaluation.eval import eval
+
+
+def test_dense_doc_search_pipeline(samples_path):
+    # Create the indexing pipeline
+    indexing_pipeline = Pipeline()
+    indexing_pipeline.add_component(
+        instance=FileTypeRouter(mime_types=["text/plain", "application/pdf"]), name="file_type_router"
+    )
+    indexing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
+    indexing_pipeline.add_component(instance=PyPDFToDocument(), name="pdf_file_converter")
+    indexing_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
+    indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
+    indexing_pipeline.add_component(
+        instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter"
+    )
+    indexing_pipeline.add_component(
+        instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="embedder",
+    )
+    indexing_pipeline.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="writer")
+
+    indexing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
+    indexing_pipeline.connect("file_type_router.application/pdf", "pdf_file_converter.sources")
+    indexing_pipeline.connect("text_file_converter.documents", "joiner.documents")
+    indexing_pipeline.connect("pdf_file_converter.documents", "joiner.documents")
+    indexing_pipeline.connect("joiner.documents", "cleaner.documents")
+    indexing_pipeline.connect("cleaner.documents", "splitter.documents")
+    indexing_pipeline.connect("splitter.documents", "embedder.documents")
+    indexing_pipeline.connect("embedder.documents", "writer.documents")
+
+    indexing_pipeline.run({"file_type_router": {"sources": list(samples_path.iterdir())}})
+    filled_document_store = indexing_pipeline.get_component("writer").document_store
+
+    # Create the querying pipeline
+    query_pipeline = Pipeline()
+    query_pipeline.add_component(
+        instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="text_embedder",
+    )
+    query_pipeline.add_component(
+        instance=InMemoryEmbeddingRetriever(document_store=filled_document_store, top_k=20), name="embedding_retriever"
+    )
+    query_pipeline.connect("text_embedder", "embedding_retriever")
+
+    inputs = [{"text_embedder": {"text": "Who lives in Rome?"}}]
+    expected_outputs = [
+        {
+            "embedding_retriever": {
+                "documents": [
+                    Document(
+                        id="d219162e5d0b8e5eab901e32ce0d9c12d24e5ea26a92780442fcfa560eb0b7d6",
+                        content="My name is Giorgio and I live in Rome.",
+                        meta={
+                            "file_path": "/home/ashwin/data_science/0ashwin/opensource/haystack/e2e/samples/doc_1.txt",
+                            "source_id": "0366ae1654f4573564e29184cd4a2232286a93f4f25d6790ce703ae7d4d7d63c",
+                        },
+                        score=0.627746287158654,
+                    ),
+                    Document(
+                        id="2dcf2bc0307ba21fbb7e97a307d987a05297e577a44f170081acdbab9fc4b95f",
+                        content="A sample PDF file History and standardizationFormat (PDF) Adobe Systems made the PDF specification ava...",
+                        meta={"source_id": "ec1ac6c430ecd0cc74ae56f3e2d84f93fef3f5393de6901fe8aa01e494ebcdbe"},
+                        score=-0.060180130727963355,
+                    ),
+                ]
+            }
+        }
+    ]
+
+    eval_result = eval(query_pipeline, inputs=inputs, expected_outputs=expected_outputs)
+
+    assert eval_result.inputs == inputs
+    assert eval_result.expected_outputs == expected_outputs
+    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
+    assert eval_result.runnable.to_dict() == query_pipeline.to_dict()
diff --git a/e2e/pipelines/test_eval_extractive_qa_pipeline.py b/e2e/pipelines/test_eval_extractive_qa_pipeline.py
new file mode 100644
index 000000000..201a84e13
--- /dev/null
+++ b/e2e/pipelines/test_eval_extractive_qa_pipeline.py
@@ -0,0 +1,125 @@
+from haystack import Pipeline
+from haystack.components.readers import ExtractiveReader
+from haystack.components.retrievers import InMemoryBM25Retriever
+from haystack.dataclasses import Document, ExtractedAnswer
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.evaluation.eval import eval
+
+
+def test_extractive_qa_pipeline():
+    # Create the pipeline
+    qa_pipeline = Pipeline()
+    qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
+    qa_pipeline.add_component(instance=ExtractiveReader(model_name_or_path="deepset/tinyroberta-squad2"), name="reader")
+    qa_pipeline.connect("retriever", "reader")
+
+    # Populate the document store
+    documents = [
+        Document(content="My name is Jean and I live in Paris."),
+        Document(content="My name is Mark and I live in Berlin."),
+        Document(content="My name is Giorgio and I live in Rome."),
+    ]
+    qa_pipeline.get_component("retriever").document_store.write_documents(documents)
+
+    # Query and assert
+    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
+    inputs = [{"retriever": {"query": question}, "reader": {"query": question, "top_k": 1}} for question in questions]
+    expected_outputs = [
+        {
+            "reader": {
+                "answers": [
+                    ExtractedAnswer(
+                        query="Who lives in Paris?",
+                        score=0.7713339924812317,
+                        data="Jean and I",
+                        document=Document(
+                            id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e",
+                            content="My name is Jean and I live in Paris.",
+                            score=0.33144005810482535,
+                        ),
+                        context=None,
+                        document_offset=ExtractedAnswer.Span(start=11, end=21),
+                        context_offset=None,
+                        meta={},
+                    ),
+                    ExtractedAnswer(
+                        query="Who lives in Paris?",
+                        score=0.2286660075187683,
+                        data=None,
+                        document=None,
+                        context=None,
+                        document_offset=None,
+                        context_offset=None,
+                        meta={},
+                    ),
+                ]
+            }
+        },
+        {
+            "reader": {
+                "answers": [
+                    ExtractedAnswer(
+                        query="Who lives in Berlin?",
+                        score=0.7047999501228333,
+                        data="Mark and I",
+                        document=Document(
+                            id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20",
+                            content="My name is Mark and I live in Berlin.",
+                            score=0.33144005810482535,
+                        ),
+                        context=None,
+                        document_offset=ExtractedAnswer.Span(start=11, end=21),
+                        context_offset=None,
+                        meta={},
+                    ),
+                    ExtractedAnswer(
+                        query="Who lives in Berlin?",
+                        score=0.29520004987716675,
+                        data=None,
+                        document=None,
+                        context=None,
+                        document_offset=None,
+                        context_offset=None,
+                        meta={},
+                    ),
+                ]
+            }
+        },
+        {
+            "reader": {
+                "answers": [
+                    ExtractedAnswer(
+                        query="Who lives in Rome?",
+                        score=0.7661304473876953,
+                        data="Giorgio and I",
+                        document=Document(
+                            id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27",
+                            content="My name is Giorgio and I live in Rome.",
+                            score=0.33144005810482535,
+                        ),
+                        context=None,
+                        document_offset=ExtractedAnswer.Span(start=11, end=24),
+                        context_offset=None,
+                        meta={},
+                    ),
+                    ExtractedAnswer(
+                        query="Who lives in Rome?",
+                        score=0.2338695526123047,
+                        data=None,
+                        document=None,
+                        context=None,
+                        document_offset=None,
+                        context_offset=None,
+                        meta={},
+                    ),
+                ]
+            }
+        },
+    ]
+
+    eval_result = eval(qa_pipeline, inputs=inputs, expected_outputs=expected_outputs)
+
+    assert eval_result.inputs == inputs
+    assert eval_result.expected_outputs == expected_outputs
+    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
+    assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()
diff --git a/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py b/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py
new file mode 100644
index 000000000..f7b4455e6
--- /dev/null
+++ b/e2e/pipelines/test_eval_hybrid_doc_search_pipeline.py
@@ -0,0 +1,100 @@
+from haystack import Document, Pipeline
+from haystack.components.embedders import SentenceTransformersTextEmbedder
+from haystack.components.rankers import TransformersSimilarityRanker
+from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
+from haystack.components.routers.document_joiner import DocumentJoiner
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.evaluation.eval import eval
+
+
+def test_hybrid_doc_search_pipeline():
+    # Create the pipeline
+    document_store = InMemoryDocumentStore()
+    hybrid_pipeline = Pipeline()
+    hybrid_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=document_store), name="bm25_retriever")
+    hybrid_pipeline.add_component(
+        instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="text_embedder",
+    )
+    hybrid_pipeline.add_component(
+        instance=InMemoryEmbeddingRetriever(document_store=document_store), name="embedding_retriever"
+    )
+    hybrid_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
+    hybrid_pipeline.add_component(instance=TransformersSimilarityRanker(top_k=2), name="ranker")
+
+    hybrid_pipeline.connect("bm25_retriever", "joiner")
+    hybrid_pipeline.connect("text_embedder", "embedding_retriever")
+    hybrid_pipeline.connect("embedding_retriever", "joiner")
+    hybrid_pipeline.connect("joiner", "ranker")
+
+    # Populate the document store
+    documents = [
+        Document(content="My name is Jean and I live in Paris."),
+        Document(content="My name is Mark and I live in Berlin."),
+        Document(content="My name is Mario and I live in the capital of Italy."),
+        Document(content="My name is Giorgio and I live in Rome."),
+    ]
+    hybrid_pipeline.get_component("bm25_retriever").document_store.write_documents(documents)
+
+    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
+    inputs = [
+        {"bm25_retriever": {"query": question}, "text_embedder": {"text": question}, "ranker": {"query": question}}
+        for question in questions
+    ]
+    expected_outputs = [
+        {
+            "ranker": {
+                "documents": [
+                    Document(
+                        id="6c90b78ad94e4e634e2a067b5fe2d26d4ce95405ec222cbaefaeb09ab4dce81e",
+                        content="My name is Jean and I live in Paris.",
+                        score=2.2277960777282715,
+                    ),
+                    Document(
+                        id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20",
+                        content="My name is Mark and I live in Berlin.",
+                        score=-7.304897308349609,
+                    ),
+                ]
+            }
+        },
+        {
+            "ranker": {
+                "documents": [
+                    Document(
+                        id="10a183e965c2e107e20507c717f16559c58a8ba4bc7c577ea8dc32a8d6ca7a20",
+                        content="My name is Mark and I live in Berlin.",
+                        score=3.694173812866211,
+                    ),
+                    Document(
+                        id="f7533b5c6c968680d0ef8e38f366d4e68b7ac0d7238f1b1b366d15cb9c33efd8",
+                        content="My name is Mario and I live in the capital of Italy.",
+                        score=-9.008655548095703,
+                    ),
+                ]
+            }
+        },
+        {
+            "ranker": {
+                "documents": [
+                    Document(
+                        id="fb0f1efe94b3c78aa1c4e5a17a5ef8270f70e89d36a3665c8362675e8a769a27",
+                        content="My name is Giorgio and I live in Rome.",
+                        score=3.487802028656006,
+                    ),
+                    Document(
+                        id="f7533b5c6c968680d0ef8e38f366d4e68b7ac0d7238f1b1b366d15cb9c33efd8",
+                        content="My name is Mario and I live in the capital of Italy.",
+                        score=-2.873128890991211,
+                    ),
+                ]
+            }
+        },
+    ]
+
+    eval_result = eval(hybrid_pipeline, inputs=inputs, expected_outputs=expected_outputs)
+
+    assert eval_result.inputs == inputs
+    assert eval_result.expected_outputs == expected_outputs
+    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
+    assert eval_result.runnable.to_dict() == hybrid_pipeline.to_dict()
diff --git a/e2e/pipelines/test_eval_rag_pipelines.py b/e2e/pipelines/test_eval_rag_pipelines.py
new file mode 100644
index 000000000..7ff365dfa
--- /dev/null
+++ b/e2e/pipelines/test_eval_rag_pipelines.py
@@ -0,0 +1,145 @@
+from haystack import Pipeline
+from haystack.components.builders.answer_builder import AnswerBuilder
+from haystack.components.builders.prompt_builder import PromptBuilder
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.generators import HuggingFaceLocalGenerator
+from haystack.components.retrievers import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
+from haystack.components.writers import DocumentWriter
+from haystack.dataclasses import Document
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.evaluation.eval import eval
+
+
+def test_bm25_rag_pipeline():
+    prompt_template = """
+    Given these documents, answer the question.\nDocuments:
+    {% for doc in documents %}
+        {{ doc.content }}
+    {% endfor %}
+
+    \nQuestion: {{question}}
+    \nAnswer:
+    """
+    rag_pipeline = Pipeline()
+    rag_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
+    rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
+    rag_pipeline.add_component(
+        instance=HuggingFaceLocalGenerator(
+            model_name_or_path="google/flan-t5-small",
+            task="text2text-generation",
+            generation_kwargs={"max_new_tokens": 100, "temperature": 0.5, "do_sample": True},
+        ),
+        name="llm",
+    )
+    rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
+    rag_pipeline.connect("retriever", "prompt_builder.documents")
+    rag_pipeline.connect("prompt_builder", "llm")
+    rag_pipeline.connect("llm.replies", "answer_builder.replies")
+    rag_pipeline.connect("retriever", "answer_builder.documents")
+
+    # Populate the document store
+    documents = [
+        Document(content="My name is Jean and I live in Paris."),
+        Document(content="My name is Mark and I live in Berlin."),
+        Document(content="My name is Giorgio and I live in Rome."),
+    ]
+    rag_pipeline.get_component("retriever").document_store.write_documents(documents)
+
+    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
+    inputs = [
+        {
+            "retriever": {"query": question},
+            "prompt_builder": {"question": question},
+            "answer_builder": {"query": question},
+        }
+        for question in questions
+    ]
+
+    expected_outputs = [
+        {"llm": {"replies": ["Jean"]}},
+        {"llm": {"replies": ["Mark"]}},
+        {"llm": {"replies": ["Giorgio"]}},
+    ]
+
+    eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs)
+
+    assert eval_result.inputs == inputs
+    assert eval_result.expected_outputs == expected_outputs
+    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
+    assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()
+
+
+def test_embedding_retrieval_rag_pipeline():
+    # Create the RAG pipeline
+    prompt_template = """
+    Given these documents, answer the question.\nDocuments:
+    {% for doc in documents %}
+        {{ doc.content }}
+    {% endfor %}
+
+    \nQuestion: {{question}}
+    \nAnswer:
+    """
+    rag_pipeline = Pipeline()
+    rag_pipeline.add_component(
+        instance=SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="text_embedder",
+    )
+    rag_pipeline.add_component(
+        instance=InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()), name="retriever"
+    )
+    rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
+    rag_pipeline.add_component(
+        instance=HuggingFaceLocalGenerator(
+            model_name_or_path="google/flan-t5-small",
+            task="text2text-generation",
+            generation_kwargs={"max_new_tokens": 100, "temperature": 0.5, "do_sample": True},
+        ),
+        name="llm",
+    )
+    rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")
+    rag_pipeline.connect("text_embedder", "retriever")
+    rag_pipeline.connect("retriever", "prompt_builder.documents")
+    rag_pipeline.connect("prompt_builder", "llm")
+    rag_pipeline.connect("llm.replies", "answer_builder.replies")
+    rag_pipeline.connect("retriever", "answer_builder.documents")
+
+    # Populate the document store
+    documents = [
+        Document(content="My name is Jean and I live in Paris."),
+        Document(content="My name is Mark and I live in Berlin."),
+        Document(content="My name is Giorgio and I live in Rome."),
+    ]
+    document_store = rag_pipeline.get_component("retriever").document_store
+    indexing_pipeline = Pipeline()
+    indexing_pipeline.add_component(
+        instance=SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"),
+        name="document_embedder",
+    )
+    indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer")
+    indexing_pipeline.connect("document_embedder", "document_writer")
+    indexing_pipeline.run({"document_embedder": {"documents": documents}})
+
+    # Query and assert
+    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
+    inputs = [
+        {
+            "prompt_builder": {"question": question},
+            "text_embedder": {"text": question},
+            "answer_builder": {"query": question},
+        }
+        for question in questions
+    ]
+
+    expected_outputs = [
+        {"llm": {"replies": ["Jean"]}},
+        {"llm": {"replies": ["Mark"]}},
+        {"llm": {"replies": ["Giorgio"]}},
+    ]
+
+    eval_result = eval(rag_pipeline, inputs=inputs, expected_outputs=expected_outputs)
+
+    assert eval_result.inputs == inputs
+    assert eval_result.expected_outputs == expected_outputs
+    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
+    assert eval_result.runnable.to_dict() == rag_pipeline.to_dict()
diff --git a/haystack/evaluation/__init__.py b/haystack/evaluation/__init__.py
new file mode 100644
index 000000000..090aadc2c
--- /dev/null
+++ b/haystack/evaluation/__init__.py
@@ -0,0 +1,3 @@
+from haystack.evaluation.eval import EvaluationResult, eval
+
+__all__ = ["eval", "EvaluationResult"]
diff --git a/haystack/evaluation/eval.py b/haystack/evaluation/eval.py
new file mode 100644
index 000000000..0f9f1e879
--- /dev/null
+++ b/haystack/evaluation/eval.py
@@ -0,0 +1,60 @@
+from typing import Any, Dict, List, Union
+
+from haystack import Pipeline
+from haystack.core.component import Component
+
+
+class EvaluationResult:
+    """
+    EvaluationResult keeps track of all the information related to an evaluation run: the runnable (Pipeline or
+    component), the inputs, the outputs it produced, and the expected outputs.
+
+    :param runnable: The runnable (Pipeline or component) used for evaluation.
+    :param inputs: List of inputs used for evaluation.
+    :param outputs: List of outputs generated by the runnable.
+    :param expected_outputs: List of expected outputs used for evaluation.
+    """
+
+    def __init__(
+        self,
+        runnable: Union[Pipeline, Component],
+        inputs: List[Dict[str, Any]],
+        outputs: List[Dict[str, Any]],
+        expected_outputs: List[Dict[str, Any]],
+    ) -> None:
+        self.runnable = runnable
+        self.inputs = inputs
+        self.outputs = outputs
+        self.expected_outputs = expected_outputs
+
+
+def eval(
+    runnable: Union[Pipeline, Component], inputs: List[Dict[str, Any]], expected_outputs: List[Dict[str, Any]]
+) -> EvaluationResult:
+    """
+    Evaluates the provided Pipeline or component based on the given inputs and expected outputs.
+
+    The runnable (either a Pipeline or a component) is run once per input, and the outputs are stored together with
+    the corresponding expected outputs in an EvaluationResult.
+
+    :param runnable: The runnable (Pipeline or component) used for evaluation.
+    :param inputs: List of inputs used for evaluation.
+    :param expected_outputs: List of expected outputs used for evaluation.
+
+    :return: An instance of EvaluationResult containing information about the evaluation, including the runnable, inputs, outputs, and expected outputs.
+    """
+
+    outputs = []
+
+    # Check that the expected outputs have the correct shape
+    if len(inputs) != len(expected_outputs):
+        raise ValueError(
+            f"The number of inputs ({len(inputs)}) does not match the number of expected outputs ({len(expected_outputs)})."
+            " Please ensure that each input has a corresponding expected output."
+        )
+
+    for input_ in inputs:
+        output = runnable.run(input_)
+        outputs.append(output)
+
+    return EvaluationResult(runnable, inputs, outputs, expected_outputs)
diff --git a/releasenotes/notes/add-eval-and-evaluation-result-5e9ac742e323bda8.yaml b/releasenotes/notes/add-eval-and-evaluation-result-5e9ac742e323bda8.yaml
new file mode 100644
index 000000000..b270143ed
--- /dev/null
+++ b/releasenotes/notes/add-eval-and-evaluation-result-5e9ac742e323bda8.yaml
@@ -0,0 +1,4 @@
+preview:
+  - |
+    Add an eval function to evaluate components and Pipelines.
+    Add EvaluationResult to store the results of an evaluation.
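For reviewers, a minimal usage sketch of the new eval API follows. It is not part of the patch; the one-component pipeline, document, query, and expected output below are illustrative assumptions rather than values taken from the tests above.

from haystack import Pipeline
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.dataclasses import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.evaluation import EvaluationResult, eval

# Build a one-component pipeline over a tiny in-memory corpus (hypothetical example data).
document_store = InMemoryDocumentStore()
document_store.write_documents([Document(content="My name is Jean and I live in Paris.")])

pipeline = Pipeline()
pipeline.add_component(instance=InMemoryBM25Retriever(document_store=document_store), name="retriever")

# eval() runs the pipeline once per input; expected outputs mirror the shape of Pipeline.run() results.
inputs = [{"retriever": {"query": "Who lives in Paris?"}}]
expected_outputs = [{"retriever": {"documents": [Document(content="My name is Jean and I live in Paris.")]}}]

result: EvaluationResult = eval(pipeline, inputs=inputs, expected_outputs=expected_outputs)

# The result stores the runnable, inputs, outputs, and expected outputs; no metrics are computed yet.
assert len(result.outputs) == len(result.expected_outputs) == len(inputs)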