haystack/test/components/evaluators/test_document_recall.py

import pytest

from haystack.components.evaluators.document_recall import DocumentRecallEvaluator, RecallMode
from haystack.dataclasses import Document


def test_init_with_unknown_mode_string():
    """Check that an unrecognised mode string is rejected with ``ValueError`` at construction time."""
    unsupported_mode = "unknown_mode"
    with pytest.raises(ValueError):
        DocumentRecallEvaluator(mode=unsupported_mode)


class TestDocumentRecallEvaluatorSingleHit:
    """Tests for ``DocumentRecallEvaluator`` configured with ``RecallMode.SINGLE_HIT``.

    The expectations in this class treat a query as a hit (``1.0``) when at least one of
    its ground-truth documents appears among the retrieved documents and as a miss
    (``0.0``) otherwise. ``score`` is expected to be the mean of the per-query scores.
    """

    @pytest.fixture
    def evaluator(self):
        """Provide an evaluator in single-hit mode."""
        return DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT)

    def test_run_with_all_matching(self, evaluator):
        """Every query retrieves its ground-truth document, so all scores are 1.0."""
        result = evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        )
        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

    def test_run_with_no_matching(self, evaluator):
        """No query retrieves its own ground-truth document, so all scores are 0.0."""
        result = evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
        )
        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

    def test_run_with_partial_matching(self, evaluator):
        """Only the first query is a hit, so the overall score is 0.5."""
        result = evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )
        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

    def test_run_with_complex_data(self, evaluator):
        """Mixed queries with several ground-truth and retrieved documents each.

        Five of the six queries retrieve at least one ground-truth document. The fifth
        retrieves only the separate country names, none of which equals the single
        ground-truth text "Denmark, Iceland and Norway", so it is expected to score 0.0.
        """
        result = evaluator.run(
            ground_truth_documents=[
                [Document(content="France")],
                [Document(content="9th century"), Document(content="9th")],
                [Document(content="classical music"), Document(content="classical")],
                [Document(content="11th century"), Document(content="the 11th")],
                [Document(content="Denmark, Iceland and Norway")],
                [Document(content="10th century"), Document(content="10th")],
            ],
            retrieved_documents=[
                [Document(content="France")],
                [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
                [Document(content="classical"), Document(content="rock music"), Document(content="dubstep")],
                [Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],
                [Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],
                [
                    Document(content="10th century"),
                    Document(content="the first half of the 10th century"),
                    Document(content="10th"),
                    Document(content="10th"),
                ],
            ],
        )
        assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])
        # Float literals keep the expectation consistent with the type check above and with
        # the sibling tests. 5/6 is not exactly representable in binary floating point, so
        # the mean is compared with a tolerance instead of pinning one specific rounding.
        assert result == {"individual_scores": [1.0, 1.0, 1.0, 1.0, 0.0, 1.0], "score": pytest.approx(5 / 6)}

    def test_run_with_different_lengths(self, evaluator):
        """Mismatched numbers of ground-truth and retrieved lists must raise ``ValueError``."""
        # Fewer ground-truth lists than retrieved lists.
        with pytest.raises(ValueError):
            evaluator.run(
                ground_truth_documents=[[Document(content="Berlin")]],
                retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
            )

        # Fewer retrieved lists than ground-truth lists.
        with pytest.raises(ValueError):
            evaluator.run(
                ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
                retrieved_documents=[[Document(content="Berlin")]],
            )


class TestDocumentRecallEvaluatorMultiHit:
    """Tests for ``DocumentRecallEvaluator`` configured with ``RecallMode.MULTI_HIT``.

    The expectations in this class score each query as the fraction of its ground-truth
    documents that appear among the retrieved documents. ``score`` is expected to be
    the mean of the per-query scores.
    """

    @staticmethod
    def _docs(*contents):
        """Wrap each content string in a ``Document`` and return them as a list."""
        return [Document(content=text) for text in contents]

    @pytest.fixture
    def evaluator(self):
        """Provide an evaluator in multi-hit mode."""
        multi_hit_evaluator = DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT)
        return multi_hit_evaluator

    def test_run_with_all_matching(self, evaluator):
        """Every ground-truth document is retrieved, so all scores are 1.0."""
        wanted = [self._docs("Berlin"), self._docs("Paris")]
        fetched = [self._docs("Berlin"), self._docs("Paris")]

        result = evaluator.run(ground_truth_documents=wanted, retrieved_documents=fetched)

        for value in result["individual_scores"]:
            assert isinstance(value, float)
        assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}

    def test_run_with_no_matching(self, evaluator):
        """No query retrieves its own ground-truth document, so all scores are 0.0."""
        wanted = [self._docs("Berlin"), self._docs("Paris")]
        fetched = [self._docs("Paris"), self._docs("London")]

        result = evaluator.run(ground_truth_documents=wanted, retrieved_documents=fetched)

        for value in result["individual_scores"]:
            assert isinstance(value, float)
        assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}

    def test_run_with_partial_matching(self, evaluator):
        """Only the first query retrieves its ground-truth document, giving a mean of 0.5."""
        wanted = [self._docs("Berlin"), self._docs("Paris")]
        fetched = [self._docs("Berlin"), self._docs("London")]

        result = evaluator.run(ground_truth_documents=wanted, retrieved_documents=fetched)

        for value in result["individual_scores"]:
            assert isinstance(value, float)
        assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}

    def test_run_with_complex_data(self, evaluator):
        """Mixed queries, two of which retrieve only part of their ground truth.

        The third query retrieves one of two ground-truth documents (0.5) and the fifth
        retrieves three of four (0.75); the remaining queries retrieve all of theirs.
        """
        wanted = [
            self._docs("France"),
            self._docs("9th century", "9th"),
            self._docs("classical music", "classical"),
            self._docs("11th century", "the 11th"),
            self._docs("Denmark", "Iceland", "Norway", "Denmark, Iceland and Norway"),
            self._docs("10th century", "10th"),
        ]
        fetched = [
            self._docs("France"),
            self._docs("9th century", "10th century", "9th"),
            self._docs("classical", "rock music", "dubstep"),
            self._docs("11th", "the 11th", "11th century"),
            self._docs("Denmark", "Norway", "Iceland"),
            self._docs("10th century", "the first half of the 10th century", "10th", "10th"),
        ]

        result = evaluator.run(ground_truth_documents=wanted, retrieved_documents=fetched)

        for value in result["individual_scores"]:
            assert isinstance(value, float)
        assert result == {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875}

    def test_run_with_different_lengths(self, evaluator):
        """Mismatched numbers of ground-truth and retrieved lists must raise ``ValueError``."""
        one_query = [self._docs("Berlin")]
        mismatched_inputs = [
            # (ground truth, retrieved): first too few ground-truth lists, then too few retrieved lists.
            (one_query, [self._docs("Berlin"), self._docs("London")]),
            ([self._docs("Berlin"), self._docs("Paris")], one_query),
        ]
        for wanted, fetched in mismatched_inputs:
            with pytest.raises(ValueError):
                evaluator.run(ground_truth_documents=wanted, retrieved_documents=fetched)
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`import pytest`

			`from haystack.components.evaluators.document_recall import DocumentRecallEvaluator, RecallMode`
			`from haystack.dataclasses import Document`


			`def test_init_with_unknown_mode_string():`
			`with pytest.raises(ValueError):`
			`DocumentRecallEvaluator(mode="unknown_mode")`


			`class TestDocumentRecallEvaluatorSingleHit:`
			`@pytest.fixture`
			`def evaluator(self):`
			`return DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT)`

			`def test_run_with_all_matching(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}`

			`def test_run_with_no_matching(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}`

			`def test_run_with_partial_matching(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}`

			`def test_run_with_complex_data(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[`
			`[Document(content="France")],`
			`[Document(content="9th century"), Document(content="9th")],`
			`[Document(content="classical music"), Document(content="classical")],`
			`[Document(content="11th century"), Document(content="the 11th")],`
			`[Document(content="Denmark, Iceland and Norway")],`
			`[Document(content="10th century"), Document(content="10th")],`
			`],`
			`retrieved_documents=[`
			`[Document(content="France")],`
			`[Document(content="9th century"), Document(content="10th century"), Document(content="9th")],`
			`[Document(content="classical"), Document(content="rock music"), Document(content="dubstep")],`
			`[Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],`
			`[Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],`
			`[`
			`Document(content="10th century"),`
			`Document(content="the first half of the 10th century"),`
			`Document(content="10th"),`
			`Document(content="10th"),`
			`],`
			`],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
			`assert result == {"individual_scores": [1, 1, 1, 1, 0, 1], "score": 0.8333333333333334}`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00
			`def test_run_with_different_lengths(self, evaluator):`
			`with pytest.raises(ValueError):`
			`evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")]],`
			`retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],`
			`)`

			`with pytest.raises(ValueError):`
			`evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Berlin")]],`
			`)`


			`class TestDocumentRecallEvaluatorMultiHit:`
			`@pytest.fixture`
			`def evaluator(self):`
			`return DocumentRecallEvaluator(mode=RecallMode.MULTI_HIT)`

			`def test_run_with_all_matching(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}`

			`def test_run_with_no_matching(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}`

			`def test_run_with_partial_matching(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}`

			`def test_run_with_complex_data(self, evaluator):`
			`result = evaluator.run(`
			`ground_truth_documents=[`
			`[Document(content="France")],`
			`[Document(content="9th century"), Document(content="9th")],`
			`[Document(content="classical music"), Document(content="classical")],`
			`[Document(content="11th century"), Document(content="the 11th")],`
			`[`
			`Document(content="Denmark"),`
			`Document(content="Iceland"),`
			`Document(content="Norway"),`
			`Document(content="Denmark, Iceland and Norway"),`
			`],`
			`[Document(content="10th century"), Document(content="10th")],`
			`],`
			`retrieved_documents=[`
			`[Document(content="France")],`
			`[Document(content="9th century"), Document(content="10th century"), Document(content="9th")],`
			`[Document(content="classical"), Document(content="rock music"), Document(content="dubstep")],`
			`[Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],`
			`[Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],`
			`[`
			`Document(content="10th century"),`
			`Document(content="the first half of the 10th century"),`
			`Document(content="10th"),`
			`Document(content="10th"),`
			`],`
			`],`
			`)`
docs: Fix eval metric examples in docstrings (#7505) * fix eval metric docstrings, change type of individual scores * change import order * change exactmatch docstring to single ground truth answer * change exactmatch comment to single ground truth answer * reverted changing docs to single ground truth * add warm up in SASEvaluator example * fix FaithfulnessEvaluator docstring example * extend FaithfulnessEvaluator docstring example * Update FaithfulnessEvaluator init docstring * Remove outdated default from LLMEvaluator docstring * Add examples param to LLMEvaluator docstring example * Add import and print to LLMEvaluator docstring example 2024-04-10 11:00:20 +02:00			`assert all(isinstance(individual_score, float) for individual_score in result["individual_scores"])`
feat: Add `DocumentRecallEvaluator` (#7399) * Add DocumentRecallEvaluator * Fix mypy error * Simplify recall logic and change output for single hit mode * Remove unused import * Add comment for RecallMode fields * Reword RecallMode comments Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> --------- Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com> 2024-03-26 16:15:03 +01:00			`assert result == {"individual_scores": [1.0, 1.0, 0.5, 1.0, 0.75, 1.0], "score": 0.875}`

			`def test_run_with_different_lengths(self, evaluator):`
			`with pytest.raises(ValueError):`
			`evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")]],`
			`retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],`
			`)`

			`with pytest.raises(ValueError):`
			`evaluator.run(`
			`ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],`
			`retrieved_documents=[[Document(content="Berlin")]],`
			`)`