haystack/test/components/evaluators/test_answer_exact_match.py

import pytest

from haystack.components.evaluators import AnswerExactMatchEvaluator


def test_run_with_all_matching():
    """Check the output when every prediction equals its ground-truth answer.

    Both questions are expected to receive an individual score of 1 and the
    overall score is expected to be 1.0.
    """
    capital_questions = ["What is the capital of Germany?", "What is the capital of France?"]
    ground_truths = [["Berlin"], ["Paris"]]
    predictions = [["Berlin"], ["Paris"]]

    output = AnswerExactMatchEvaluator().run(
        questions=capital_questions, ground_truth_answers=ground_truths, predicted_answers=predictions
    )

    expected = {"individual_scores": [1, 1], "score": 1.0}
    assert output == expected


def test_run_with_no_matching():
    """Check the output when no prediction equals its ground-truth answer.

    Both questions are expected to receive an individual score of 0 and the
    overall score is expected to be 0.0.
    """
    capital_questions = ["What is the capital of Germany?", "What is the capital of France?"]
    ground_truths = [["Berlin"], ["Paris"]]
    # Each prediction is a valid capital, just not the one being asked for.
    predictions = [["Paris"], ["London"]]

    output = AnswerExactMatchEvaluator().run(
        questions=capital_questions, ground_truth_answers=ground_truths, predicted_answers=predictions
    )

    expected = {"individual_scores": [0, 0], "score": 0.0}
    assert output == expected


def test_run_with_partial_matching():
    """Check the output when only the first of two predictions matches.

    The first question is expected to score 1, the second 0, and the overall
    score is expected to be 0.5.
    """
    capital_questions = ["What is the capital of Germany?", "What is the capital of France?"]
    ground_truths = [["Berlin"], ["Paris"]]
    predictions = [["Berlin"], ["London"]]

    output = AnswerExactMatchEvaluator().run(
        questions=capital_questions, ground_truth_answers=ground_truths, predicted_answers=predictions
    )

    expected = {"individual_scores": [1, 0], "score": 0.5}
    assert output == expected


def test_run_with_complex_data():
    """Check the output for questions with several ground-truth and predicted answers.

    Six questions are evaluated; the expected per-question scores are
    ``[1, 1, 0, 1, 0, 1]``, i.e. four of the six questions are expected to
    score 1. The overall score is a non-terminating binary fraction, so it is
    compared with ``pytest.approx`` instead of an exact float literal, while
    the per-question scores and the set of output keys are still compared
    exactly.
    """
    evaluator = AnswerExactMatchEvaluator()
    result = evaluator.run(
        questions=[
            "In what country is Normandy located?",
            "When was the Latin version of the word Norman first recorded?",
            "What developed in Normandy during the 1100s?",
            "In what century did important classical music developments occur in Normandy?",
            "From which countries did the Norse originate?",
            "What century did the Normans first gain their separate identity?",
        ],
        ground_truth_answers=[
            ["France"],
            ["9th century", "9th"],
            ["classical music", "classical"],
            ["11th century", "the 11th"],
            ["Denmark", "Iceland", "Norway"],
            ["10th century", "10th"],
        ],
        predicted_answers=[
            ["France"],
            ["9th century", "10th century", "9th"],
            # "classic music" is close to, but not equal to, "classical music".
            ["classic music", "rock music", "dubstep"],
            ["11th", "the 11th", "11th century"],
            # A single combined string, not equal to any individual country name.
            ["Denmark, Iceland and Norway"],
            ["10th century", "the first half of the 10th century", "10th", "10th"],
        ],
    )

    expected_individual_scores = [1, 1, 0, 1, 0, 1]
    # 4 of the 6 expected individual scores are 1; approx tolerates float rounding.
    expected_score = pytest.approx(4 / 6)
    assert result == {"individual_scores": expected_individual_scores, "score": expected_score}


def test_run_with_different_lengths():
    """Check that inputs of unequal length make ``run`` raise ``ValueError``.

    Each case shortens exactly one of the three inputs to a single entry while
    the other two keep two entries.
    """
    mismatched_inputs = [
        # Only one question.
        {
            "questions": ["What is the capital of Germany?"],
            "ground_truth_answers": [["Berlin"], ["Paris"]],
            "predicted_answers": [["Berlin"], ["London"]],
        },
        # Only one ground-truth entry.
        {
            "questions": ["What is the capital of Germany?", "What is the capital of France?"],
            "ground_truth_answers": [["Berlin"]],
            "predicted_answers": [["Berlin"], ["London"]],
        },
        # Only one predicted entry.
        {
            "questions": ["What is the capital of Germany?", "What is the capital of France?"],
            "ground_truth_answers": [["Berlin"], ["Paris"]],
            "predicted_answers": [["Berlin"]],
        },
    ]

    evaluator = AnswerExactMatchEvaluator()
    for run_kwargs in mismatched_inputs:
        with pytest.raises(ValueError):
            evaluator.run(**run_kwargs)
Add `AnswerExactMatchEvaluator` (#7381) * Add AnswerExactMatchEvaluator * Add release notes * Fix linting * Update docstrings * Update docstrings * Remove to_dict and from_dict * Fix linting 2024-03-19 16:58:01 +01:00			`import pytest`

			`from haystack.components.evaluators import AnswerExactMatchEvaluator`


			`def test_run_with_all_matching():`
			`evaluator = AnswerExactMatchEvaluator()`
			`result = evaluator.run(`
			`questions=["What is the capital of Germany?", "What is the capital of France?"],`
			`ground_truth_answers=[["Berlin"], ["Paris"]],`
			`predicted_answers=[["Berlin"], ["Paris"]],`
			`)`

feat: Change outputs of AnswerExactMatchEvaluator (#7390) * Change outputs of AnswerExactMatchEvaluator * Changes scores to return the number of matches per question * Revert "Changes scores to return the number of matches per question" This reverts commit e4358720793d4584b0b961402d4557c50c4c2381. * Change output names 2024-03-26 10:57:59 +01:00			`assert result == {"individual_scores": [1, 1], "score": 1.0}`
Add `AnswerExactMatchEvaluator` (#7381) * Add AnswerExactMatchEvaluator * Add release notes * Fix linting * Update docstrings * Update docstrings * Remove to_dict and from_dict * Fix linting 2024-03-19 16:58:01 +01:00

			`def test_run_with_no_matching():`
			`evaluator = AnswerExactMatchEvaluator()`
			`result = evaluator.run(`
			`questions=["What is the capital of Germany?", "What is the capital of France?"],`
			`ground_truth_answers=[["Berlin"], ["Paris"]],`
			`predicted_answers=[["Paris"], ["London"]],`
			`)`

feat: Change outputs of AnswerExactMatchEvaluator (#7390) * Change outputs of AnswerExactMatchEvaluator * Changes scores to return the number of matches per question * Revert "Changes scores to return the number of matches per question" This reverts commit e4358720793d4584b0b961402d4557c50c4c2381. * Change output names 2024-03-26 10:57:59 +01:00			`assert result == {"individual_scores": [0, 0], "score": 0.0}`
Add `AnswerExactMatchEvaluator` (#7381) * Add AnswerExactMatchEvaluator * Add release notes * Fix linting * Update docstrings * Update docstrings * Remove to_dict and from_dict * Fix linting 2024-03-19 16:58:01 +01:00

			`def test_run_with_partial_matching():`
			`evaluator = AnswerExactMatchEvaluator()`
			`result = evaluator.run(`
			`questions=["What is the capital of Germany?", "What is the capital of France?"],`
			`ground_truth_answers=[["Berlin"], ["Paris"]],`
			`predicted_answers=[["Berlin"], ["London"]],`
			`)`

feat: Change outputs of AnswerExactMatchEvaluator (#7390) * Change outputs of AnswerExactMatchEvaluator * Changes scores to return the number of matches per question * Revert "Changes scores to return the number of matches per question" This reverts commit e4358720793d4584b0b961402d4557c50c4c2381. * Change output names 2024-03-26 10:57:59 +01:00			`assert result == {"individual_scores": [1, 0], "score": 0.5}`


			`def test_run_with_complex_data():`
			`evaluator = AnswerExactMatchEvaluator()`
			`result = evaluator.run(`
			`questions=[`
			`"In what country is Normandy located?",`
			`"When was the Latin version of the word Norman first recorded?",`
			`"What developed in Normandy during the 1100s?",`
			`"In what century did important classical music developments occur in Normandy?",`
			`"From which countries did the Norse originate?",`
			`"What century did the Normans first gain their separate identity?",`
			`],`
			`ground_truth_answers=[`
			`["France"],`
			`["9th century", "9th"],`
			`["classical music", "classical"],`
			`["11th century", "the 11th"],`
			`["Denmark", "Iceland", "Norway"],`
			`["10th century", "10th"],`
			`],`
			`predicted_answers=[`
			`["France"],`
			`["9th century", "10th century", "9th"],`
			`["classic music", "rock music", "dubstep"],`
			`["11th", "the 11th", "11th century"],`
			`["Denmark, Iceland and Norway"],`
			`["10th century", "the first half of the 10th century", "10th", "10th"],`
			`],`
			`)`
			`assert result == {"individual_scores": [1, 1, 0, 1, 0, 1], "score": 0.6666666666666666}`
Add `AnswerExactMatchEvaluator` (#7381) * Add AnswerExactMatchEvaluator * Add release notes * Fix linting * Update docstrings * Update docstrings * Remove to_dict and from_dict * Fix linting 2024-03-19 16:58:01 +01:00

			`def test_run_with_different_lengths():`
			`evaluator = AnswerExactMatchEvaluator()`

			`with pytest.raises(ValueError):`
			`evaluator.run(`
			`questions=["What is the capital of Germany?"],`
			`ground_truth_answers=[["Berlin"], ["Paris"]],`
			`predicted_answers=[["Berlin"], ["London"]],`
			`)`

			`with pytest.raises(ValueError):`
			`evaluator.run(`
			`questions=["What is the capital of Germany?", "What is the capital of France?"],`
			`ground_truth_answers=[["Berlin"]],`
			`predicted_answers=[["Berlin"], ["London"]],`
			`)`

			`with pytest.raises(ValueError):`
			`evaluator.run(`
			`questions=["What is the capital of Germany?", "What is the capital of France?"],`
			`ground_truth_answers=[["Berlin"], ["Paris"]],`
			`predicted_answers=[["Berlin"]],`
			`)`