Mirror of https://github.com/deepset-ai/haystack.git, synced 2026-01-08 04:56:45 +00:00
add docstrings for EvaluationRunResult (#7885)

parent 28902c4c65
commit 7c31d5f418
@@ -3,7 +3,8 @@ loaders:
   search_path: [../../../haystack/evaluation]
   modules:
     [
-      "eval_run_result"
+      "base",
+      "eval_run_result",
     ]
   ignore_when_discovered: ["__init__"]
 processors:
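The hunk above registers the new base module with the pydoc-markdown loader so its docstrings land in the generated API reference. As a quick sanity check before rebuilding the docs (this snippet is illustrative, not part of the commit, and assumes a haystack install that contains this change):

import importlib

# Every module listed in the pydoc config should import cleanly; a typo in
# the config would otherwise only surface during the docs build.
for name in ("haystack.evaluation.base", "haystack.evaluation.eval_run_result"):
    importlib.import_module(name)
    print(f"{name}: OK")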
haystack/evaluation/__init__.py

@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from .eval_run_result import BaseEvaluationRunResult, EvaluationRunResult
+from .base import BaseEvaluationRunResult
+from .eval_run_result import EvaluationRunResult
 
 __all__ = ["BaseEvaluationRunResult", "EvaluationRunResult"]
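Splitting the import across two modules leaves __all__ untouched, so the package's public surface is unchanged; only the module that defines the base class moved. A minimal check, assuming a haystack install that includes this commit:

from haystack.evaluation import BaseEvaluationRunResult, EvaluationRunResult

# The concrete result type still implements the relocated interface.
assert issubclass(EvaluationRunResult, BaseEvaluationRunResult)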
haystack/evaluation/base.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from pandas import DataFrame
+
+
+class BaseEvaluationRunResult(ABC):
+    """
+    Represents the results of an evaluation run.
+    """
+
+    @abstractmethod
+    def to_pandas(self) -> "DataFrame":
+        """
+        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
+
+        :returns:
+            Pandas DataFrame with the scores.
+        """
+
+    @abstractmethod
+    def score_report(self) -> "DataFrame":
+        """
+        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            Pandas DataFrame with the aggregated scores.
+        """
+
+    @abstractmethod
+    def comparative_individual_scores_report(
+        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
+    ) -> "DataFrame":
+        """
+        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
+
+        The inputs to both evaluation runs are assumed to be the same.
+
+        :param other:
+            Results of another evaluation run to compare with.
+        :param keep_columns:
+            List of common column names to keep from the inputs of the evaluation runs to compare.
+        :returns:
+            Pandas DataFrame with the score comparison.
+        """
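Since BaseEvaluationRunResult is an ABC, a subclass can only be instantiated once all three abstract methods are implemented. A hypothetical sketch of an alternative backend (DummyRunResult and its column handling are invented for illustration):

import pandas as pd

from typing import List, Optional
from pandas import DataFrame
from haystack.evaluation import BaseEvaluationRunResult


class DummyRunResult(BaseEvaluationRunResult):
    """Invented subclass holding one per-sample score column per metric."""

    def __init__(self, scores: DataFrame) -> None:
        self._scores = scores

    def to_pandas(self) -> DataFrame:
        return self._scores

    def score_report(self) -> DataFrame:
        # Aggregate each metric column to its mean, mirroring the
        # ["metrics", "score"] shape used by EvaluationRunResult.
        report = self._scores.mean(numeric_only=True).reset_index()
        report.columns = ["metrics", "score"]
        return report

    def comparative_individual_scores_report(
        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
    ) -> DataFrame:
        # keep_columns is ignored in this toy implementation.
        return self.to_pandas().join(other.to_pandas(), rsuffix="_other")


print(DummyRunResult(pd.DataFrame({"exact_match": [1.0, 0.0]})).score_report())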
haystack/evaluation/eval_run_result.py

@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
 from warnings import warn
@@ -10,46 +9,7 @@ from warnings import warn
 from pandas import DataFrame
 from pandas import concat as pd_concat
 
-
-class BaseEvaluationRunResult(ABC):
-    """
-    Represents the results of an evaluation run.
-    """
-
-    @abstractmethod
-    def to_pandas(self) -> "DataFrame":
-        """
-        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
-
-        :returns:
-            Pandas DataFrame with the scores.
-        """
-
-    @abstractmethod
-    def score_report(self) -> "DataFrame":
-        """
-        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
-
-        :returns:
-            Pandas DataFrame with the aggregated scores.
-        """
-
-    @abstractmethod
-    def comparative_individual_scores_report(
-        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
-    ) -> "DataFrame":
-        """
-        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
-
-        The inputs to both evaluation runs is assumed to be the same.
-
-        :param other:
-            Results of another evaluation run to compare with.
-        :param keep_columns:
-            List of common column names to keep from the inputs of the evaluation runs to compare.
-        :returns:
-            Pandas DataFrame with the score comparison.
-        """
+from .base import BaseEvaluationRunResult
 
 
 class EvaluationRunResult(BaseEvaluationRunResult):
@@ -99,13 +59,25 @@ class EvaluationRunResult(BaseEvaluationRunResult):
                 f"Got {len(outputs['individual_scores'])} but expected {expected_len}."
             )
 
-    def score_report(self) -> DataFrame:  # noqa: D102
+    def score_report(self) -> DataFrame:
+        """
+        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            Pandas DataFrame with the aggregated scores.
+        """
         results = {k: v["score"] for k, v in self.results.items()}
         df = DataFrame.from_dict(results, orient="index", columns=["score"]).reset_index()
         df.columns = ["metrics", "score"]
         return df
 
-    def to_pandas(self) -> DataFrame:  # noqa: D102
+    def to_pandas(self) -> DataFrame:
+        """
+        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
+
+        :returns:
+            Pandas DataFrame with the scores.
+        """
         inputs_columns = list(self.inputs.keys())
         inputs_values = list(self.inputs.values())
         inputs_values = list(map(list, zip(*inputs_values)))  # transpose the values
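For orientation, a usage sketch of the two newly documented methods. The constructor signature and the toy data are assumptions inferred from how self.inputs and self.results are used in this file (per-metric dicts carrying an aggregate "score" plus per-sample "individual_scores" of the same length as each input column):

from haystack.evaluation import EvaluationRunResult

inputs = {"question": ["Q1", "Q2"], "answer": ["A1", "A2"]}
results = {
    "exact_match": {"score": 0.5, "individual_scores": [1.0, 0.0]},
    "f1": {"score": 0.75, "individual_scores": [1.0, 0.5]},
}

run = EvaluationRunResult(run_name="baseline", inputs=inputs, results=results)
print(run.score_report())  # one row per metric, columns ["metrics", "score"]
print(run.to_pandas())     # per-sample scores joined onto the input columns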
@@ -118,9 +90,21 @@ class EvaluationRunResult(BaseEvaluationRunResult):
 
         return df_inputs.join(df_scores)
 
-    def comparative_individual_scores_report(  # noqa: D102
+    def comparative_individual_scores_report(
         self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
     ) -> DataFrame:
+        """
+        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
+
+        The inputs to both evaluation runs are assumed to be the same.
+
+        :param other:
+            Results of another evaluation run to compare with.
+        :param keep_columns:
+            List of common column names to keep from the inputs of the evaluation runs to compare.
+        :returns:
+            Pandas DataFrame with the score comparison.
+        """
         if not isinstance(other, EvaluationRunResult):
             raise ValueError("Comparative scores can only be computed between EvaluationRunResults.")
 
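Finally, a sketch of the comparison path, continuing the toy run and inputs from the previous example. A second run over the same inputs is required; keep_columns names the shared input columns to carry into the merged frame, and the values below are again invented:

results_other = {
    "exact_match": {"score": 1.0, "individual_scores": [1.0, 1.0]},
    "f1": {"score": 0.9, "individual_scores": [0.9, 0.9]},
}
other_run = EvaluationRunResult(run_name="candidate", inputs=inputs, results=results_other)

# Passing anything other than an EvaluationRunResult raises the
# ValueError guarded in the code above.
comparison = run.comparative_individual_scores_report(other_run, keep_columns=["question"])
print(comparison)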