diff --git a/test/test_eval.py b/test/test_eval.py
index a0051b2d4..34969605e 100644
--- a/test/test_eval.py
+++ b/test/test_eval.py
@@ -1,17 +1,14 @@
 import pytest
 from haystack.document_stores.base import BaseDocumentStore
-from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
 from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.evaluator import EvalAnswers, EvalDocuments
 from haystack.nodes.query_classifier.transformers import TransformersQueryClassifier
-from haystack.nodes.retriever.dense import DensePassageRetriever, EmbeddingRetriever
+from haystack.nodes.retriever.dense import DensePassageRetriever
 from haystack.nodes.retriever.sparse import ElasticsearchRetriever
-from haystack.document_stores.memory import InMemoryDocumentStore
 from haystack.pipelines.base import Pipeline
-from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline, FAQPipeline, GenerativeQAPipeline, SearchSummarizationPipeline
-from haystack.pipelines.standard_pipelines import RetrieverQuestionGenerationPipeline, TranslationWrapperPipeline
+from haystack.pipelines import ExtractiveQAPipeline
+from haystack.pipelines.standard_pipelines import DocumentSearchPipeline, FAQPipeline, RetrieverQuestionGenerationPipeline, TranslationWrapperPipeline
 from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span
-from conftest import DOCS_WITH_EMBEDDINGS
 
 
 @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
@@ -555,67 +552,6 @@ def test_document_search_calculate_metrics(retriever_with_docs):
     assert metrics["Retriever"]["precision"] == 1.0/6
 
 
-@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
-def test_generativeqa_calculate_metrics(document_store_with_docs: InMemoryDocumentStore, rag_generator):
-    retriever = EmbeddingRetriever(
-        document_store=document_store_with_docs,
-        embedding_model="deepset/sentence_bert",
-        use_gpu=False
-    )
-    document_store_with_docs.update_embeddings(retriever=retriever)
-    pipeline = GenerativeQAPipeline(generator=rag_generator, retriever=retriever)
-    eval_result: EvaluationResult = pipeline.eval(
-        labels=EVAL_LABELS,
-        params={"Retriever": {"top_k": 5}}
-    )
-
-    metrics = eval_result.calculate_metrics()
-
-    assert "Retriever" in eval_result
-    assert "Generator" in eval_result
-    assert len(eval_result) == 2
-
-    assert metrics["Retriever"]["mrr"] == 0.5
-    assert metrics["Retriever"]["map"] == 0.5
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
-    assert metrics["Retriever"]["recall_single_hit"] == 0.5
-    assert metrics["Retriever"]["precision"] == 1.0/6
-    assert metrics["Generator"]["exact_match"] == 0.0
-    assert metrics["Generator"]["f1"] == 1.0/3
-
-
-@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
-def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDocumentStore, summarizer):
-    retriever = EmbeddingRetriever(
-        document_store=document_store_with_docs,
-        embedding_model="deepset/sentence_bert",
-        use_gpu=False
-    )
-    document_store_with_docs.update_embeddings(retriever=retriever)
-    pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True)
-    eval_result: EvaluationResult = pipeline.eval(
-        labels=EVAL_LABELS,
-        params={"Retriever": {"top_k": 5}}
-    )
-
-    metrics = eval_result.calculate_metrics()
-
-    assert "Retriever" in eval_result
-    assert "Summarizer" in eval_result
-    assert len(eval_result) == 2
-
-    assert metrics["Retriever"]["mrr"] == 0.5
-    assert metrics["Retriever"]["map"] == 0.5
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
-    assert metrics["Retriever"]["recall_single_hit"] == 0.5
-    assert metrics["Retriever"]["precision"] == 1.0/6
-    assert metrics["Summarizer"]["mrr"] == 0.5
-    assert metrics["Summarizer"]["map"] == 0.5
-    assert metrics["Summarizer"]["recall_multi_hit"] == 0.5
-    assert metrics["Summarizer"]["recall_single_hit"] == 0.5
-    assert metrics["Summarizer"]["precision"] == 1.0/6
-
-
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
 def test_faq_calculate_metrics(retriever_with_docs):
diff --git a/test/test_eval_embedding_retriever.py b/test/test_eval_embedding_retriever.py
new file mode 100644
index 000000000..94bf75a4e
--- /dev/null
+++ b/test/test_eval_embedding_retriever.py
@@ -0,0 +1,72 @@
+import pytest
+from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
+from haystack.nodes.retriever.dense import EmbeddingRetriever
+from haystack.document_stores.memory import InMemoryDocumentStore
+from haystack.nodes.summarizer.transformers import TransformersSummarizer
+from haystack.pipelines import GenerativeQAPipeline, SearchSummarizationPipeline
+from haystack.schema import EvaluationResult
+from test_eval import EVAL_LABELS
+
+
+# had to be separated from other eval tests to work around OOM in Windows CI
+
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("retriever_with_docs", ["embedding"], indirect=True)
+def test_generativeqa_calculate_metrics(document_store_with_docs: InMemoryDocumentStore, rag_generator, retriever_with_docs):
+    document_store_with_docs.update_embeddings(retriever=retriever_with_docs)
+    pipeline = GenerativeQAPipeline(generator=rag_generator, retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}}
+    )
+
+    metrics = eval_result.calculate_metrics()
+
+    assert "Retriever" in eval_result
+    assert "Generator" in eval_result
+    assert len(eval_result) == 2
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Generator"]["exact_match"] == 0.0
+    assert metrics["Generator"]["f1"] == 1.0/3
+
+
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDocumentStore):
+    summarizer = TransformersSummarizer(
+        model_name_or_path="sshleifer/distill-pegasus-xsum-16-4",
+        use_gpu=False
+    )
+    document_store_with_docs.embedding_dim = 384
+    retriever = EmbeddingRetriever(
+        document_store=document_store_with_docs,
+        embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+        use_gpu=False
+    )
+    document_store_with_docs.update_embeddings(retriever=retriever)
+    pipeline = SearchSummarizationPipeline(retriever=retriever, summarizer=summarizer, return_in_answer_format=True)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}}
+    )
+
+    metrics = eval_result.calculate_metrics()
+
+    assert "Retriever" in eval_result
+    assert "Summarizer" in eval_result
+    assert len(eval_result) == 2
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Summarizer"]["mrr"] == 0.0
+    assert metrics["Summarizer"]["map"] == 0.0
+    assert metrics["Summarizer"]["recall_multi_hit"] == 0.0
+    assert metrics["Summarizer"]["recall_single_hit"] == 0.0
+    assert metrics["Summarizer"]["precision"] == 0.0