haystack/e2e/pipelines/test_eval_extractive_qa_pipeline.py

import json
from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.dataclasses import Document, ExtractedAnswer
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.evaluation.eval import eval
from haystack.evaluation.metrics import Metric


def test_extractive_qa_pipeline(tmp_path):
    # Create the pipeline
    qa_pipeline = Pipeline()
    qa_pipeline.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
    qa_pipeline.add_component(instance=ExtractiveReader(model="deepset/tinyroberta-squad2"), name="reader")
    qa_pipeline.connect("retriever", "reader")
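    # The retriever's documents flow into the reader, which extracts answer spans from them.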
    # Populate the document store
    documents = [
        Document(content="My name is Jean and I live in Paris."),
        Document(content="My name is Mark and I live in Berlin."),
        Document(content="My name is Giorgio and I live in Rome."),
    ]
    qa_pipeline.get_component("retriever").document_store.write_documents(documents)
    # Query and assert
    questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"]
    inputs = [{"retriever": {"query": question}, "reader": {"query": question, "top_k": 1}} for question in questions]
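    # Each expected output pins the exact answer span and score produced by the pinned tinyroberta model,
    # plus the trailing "no answer" candidate (data=None) that the reader emits for every query.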
    expected_outputs = [
        {
            "reader": {
                "answers": [
                    ExtractedAnswer(
                        query="Who lives in Paris?",
                        score=0.7713339924812317,
                        data="Jean and I",
                        document=Document(content="My name is Jean and I live in Paris.", score=0.33144005810482535),
                        context=None,
                        document_offset=ExtractedAnswer.Span(start=11, end=21),
                        context_offset=None,
                        meta={},
                    ),
                    ExtractedAnswer(
                        query="Who lives in Paris?",
                        score=0.2286660075187683,
                        data=None,
                        document=None,
                        context=None,
                        document_offset=None,
                        context_offset=None,
                        meta={},
                    ),
                ]
            }
        },
        {
            "reader": {
                "answers": [
                    ExtractedAnswer(
                        query="Who lives in Berlin?",
                        score=0.7047999501228333,
                        data="Mark and I",
                        document=Document(content="My name is Mark and I live in Berlin.", score=0.33144005810482535),
                        context=None,
                        document_offset=ExtractedAnswer.Span(start=11, end=21),
                        context_offset=None,
                        meta={},
                    ),
                    ExtractedAnswer(
                        query="Who lives in Berlin?",
                        score=0.29520004987716675,
                        data=None,
                        document=None,
                        context=None,
                        document_offset=None,
                        context_offset=None,
                        meta={},
                    ),
                ]
            }
        },
        {
            "reader": {
                "answers": [
                    ExtractedAnswer(
                        query="Who lives in Rome?",
                        score=0.7661304473876953,
                        data="Giorgio and I",
                        document=Document(content="My name is Giorgio and I live in Rome.", score=0.33144005810482535),
                        context=None,
                        document_offset=ExtractedAnswer.Span(start=11, end=24),
                        context_offset=None,
                        meta={},
                    ),
                    ExtractedAnswer(
                        query="Who lives in Rome?",
                        score=0.2338695526123047,
                        data=None,
                        document=None,
                        context=None,
                        document_offset=None,
                        context_offset=None,
                        meta={},
                    ),
                ]
            }
        },
    ]
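    # Run the pipeline on every input and collect the predicted outputs alongside the expected ones.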
    eval_result = eval(qa_pipeline, inputs=inputs, expected_outputs=expected_outputs)

    assert eval_result.inputs == inputs
    assert eval_result.expected_outputs == expected_outputs
    assert len(eval_result.outputs) == len(expected_outputs) == len(inputs)
    assert eval_result.runnable.to_dict() == qa_pipeline.to_dict()
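    # Exact-match (EM) score over the extracted answers: once with default settings and once
    # normalizing away case, punctuation, and numbers before comparing.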
    metrics_default = eval_result.calculate_metrics(Metric.EM, output_key="answers")
    metrics_custom_parameters = eval_result.calculate_metrics(
        Metric.EM, output_key="answers", ignore_case=True, ignore_punctuation=True, ignore_numbers=True
    )
    # Save metric results to json
    metrics_default.save(tmp_path / "exact_match_score.json")

    assert metrics_default["exact_match"] == 1.0
    assert metrics_custom_parameters["exact_match"] == 1.0
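    # Round trip: the serialized metrics should load back to the same values.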
    with open(tmp_path / "exact_match_score.json", "r") as f:
        assert metrics_default == json.load(f)