haystack/haystack/schema.py

from __future__ import annotations

import ast
import csv
import hashlib
import inspect
import json
import logging
import time
from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union
from uuid import uuid4

import numpy as np
import pandas as pd
from numpy import ndarray
from pandas import DataFrame
from pydantic import BaseConfig, Field

# We are using Pydantic dataclasses instead of vanilla Python's
# See #1598 for the reasons behind this choice & performance considerations
from pydantic.dataclasses import dataclass
from pydantic.json import pydantic_encoder

from haystack.mmh3 import hash128

# Module-level logger, named after this module
logger = logging.getLogger(__name__)


# Set on pydantic's shared BaseConfig so that fields annotated with arbitrary (non-pydantic) types are accepted.
# The dataclasses in this module annotate fields with numpy's ndarray and pandas' DataFrame.
BaseConfig.arbitrary_types_allowed = True


#: Types of content_types supported
ContentTypes = Literal["text", "table", "image", "audio"]
# Type alias for filters: string keys mapped to a nested dict, a list or a single scalar value
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]

# strftime format used for the `created_at` timestamp of a Label when none is supplied
LABEL_DATETIME_FORMAT: str = "%Y-%m-%d %H:%M:%S"


@dataclass
class Document:
    """
    Standardized representation of a document / passage within Haystack.

    The constructor arguments are described in the docstring of `__init__`.
    """

    id: str
    content: Union[str, DataFrame]
    content_type: ContentTypes = Field(default="text")
    meta: Dict[str, Any] = Field(default={})
    id_hash_keys: List[str] = Field(default=["content"])
    score: Optional[float] = None
    embedding: Optional[ndarray] = None

    # We use a custom init here as we want some custom logic. The annotations above are however still needed in order
    # to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
    # They also help in annotating which object attributes will always be present (e.g. "id") even though they
    # don't need to be passed by the user in init and are rather initialized automatically in the init
    def __init__(
        self,
        content: Union[str, DataFrame],
        content_type: ContentTypes = "text",
        id: Optional[str] = None,
        score: Optional[float] = None,
        meta: Optional[Dict[str, Any]] = None,
        embedding: Optional[ndarray] = None,
        id_hash_keys: Optional[List[str]] = None,
    ):
        """
        One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
        Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
        many other places that manipulate or interact with document-level data.
        Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
        into smaller passages. We'll have one Document per passage in this case.
        Each document has a unique ID. This can be supplied by the user or generated automatically.
        It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
        There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`.

        :param content: Content of the document. For most cases, this will be text, but it can be a table or image.
        :param content_type: One of "text", "table", "image" or "audio". Haystack components can use this to adjust their
                             handling of Documents and check compatibility.
        :param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
                   creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
        :param score: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker).
                      If model's `scale_score` was set to True (default) score is in the unit interval (range of [0,1]), where 1 means extremely relevant.
        :param meta: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
        :param embedding: Vector encoding of the text. Any value other than None is converted with `np.asarray`.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes.
                             To ensure you don't have duplicate documents in your DocumentStore if texts are
                             not unique, modify the metadata and pass, for example, "meta" to this field (example: ["content", "meta"]).
                             In this case, the id is generated by using the content and the defined metadata.
                             If you specify a custom ID for the `id` parameter, the `id_hash_keys` parameter is
                             ignored and the custom ID is used.

                             Note that you can use even nested fields of the `meta` as id_hash_keys. For example, if you
                             have a key in `meta` called `url` and you want to use it as part of the id, you can pass
                             this parameter as `["meta.url"]`. Haystack supports a maximum depth of 1. For example, if you
                             use `meta.url.path`, it looks for the `url.path` key in the `meta` dict, for example `meta['url.path']`.
        :raises ValueError: If `content` is None or if `id_hash_keys` contains an entry that is neither an allowed
                            attribute name nor starts with "meta.".
        """

        if content is None:
            raise ValueError("Can't create 'Document': Mandatory 'content' field is None")

        self.content = content
        self.content_type = content_type
        self.score = score
        self.meta = meta or {}

        allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding"]

        if id_hash_keys is not None and not all(
            key in allowed_hash_key_attributes or key.startswith("meta.") for key in id_hash_keys
        ):
            raise ValueError(
                f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a "
                f"list of Document's attribute names (like {', '.join(allowed_hash_key_attributes)}) or "
                f"a key of meta with a maximum depth of 1 (like meta.url). "
                "See [Custom id hashing on documentstore level](https://github.com/deepset-ai/haystack/pull/1910) and "
                "[Allow more flexible Document id hashing](https://github.com/deepset-ai/haystack/issues/4317) for details"
            )
        # We store id_hash_keys to be able to clone documents, for example when splitting them during pre-processing
        self.id_hash_keys = id_hash_keys or ["content"]

        if embedding is not None:
            embedding = np.asarray(embedding)
        self.embedding = embedding

        # Create a unique ID (either new one, or one from user input).
        # This has to happen last, as the hash may be built from any of the attributes assigned above.
        self.id: str = str(id) if id is not None else self._get_id(id_hash_keys=id_hash_keys)

    def _get_id(self, id_hash_keys: Optional[List[str]] = None):
        """
        Generate the id of a document by hashing a string built from the document's attributes.
        By default (`id_hash_keys` is None) only the content of the document is hashed.

        :param id_hash_keys: Optional list of attribute names (or "meta.<key>" entries) whose string
                             representations are combined to generate the hash.
        :return: The hash formatted as a hexadecimal string.
        :raises ValueError: If none of the given keys contributed anything to the hash input.
        """

        if id_hash_keys is None:
            return "{:02x}".format(hash128(str(self.content)))

        hash_parts = []
        for attr in id_hash_keys:
            if attr.startswith("meta."):
                # Only one level of nesting is supported: everything after the first dot is the key looked up in meta
                meta_key = attr.split(".", maxsplit=1)[1]
                # Keys that are missing in meta are skipped
                if meta_key in self.meta:
                    hash_parts.append(":" + str(self.meta[meta_key]))
            else:
                hash_parts.append(":" + str(getattr(self, attr)))
        final_hash_key = "".join(hash_parts)

        if final_hash_key == "":
            raise ValueError(
                "Can't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta'] or be set to None."
            )

        return "{:02x}".format(hash128(final_hash_key))

    def to_dict(self, field_map: Optional[Dict[str, Any]] = None) -> Dict:
        """
        Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
        resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
        they are serialized / stored in other places (e.g. elasticsearch)
        Example:

        ```python
            doc = Document(content="some text", content_type="text")
            doc.to_dict(field_map={"custom_content_field": "content"})

            # The returned dict holds "custom_content_field": "some text" (instead of "content": "some text")
            # next to the remaining attributes such as "content_type": "text"
        ```

        :param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
        :return: dict with content of the Document
        """
        if not field_map:
            field_map = {}

        # Invert the mapping so that we can look up the custom key by attribute name
        inv_field_map = {v: k for k, v in field_map.items()}
        _doc: Dict[str, Any] = {}
        for k, v in self.__dict__.items():
            # Exclude internal fields (Pydantic, ...) fields from the conversion process
            if k.startswith("__"):
                continue
            # Convert pd.DataFrame to list of rows for serialization
            if k == "content" and self.content_type == "table" and isinstance(self.content, DataFrame):
                v = dataframe_to_list(self.content)
            k = k if k not in inv_field_map else inv_field_map[k]
            _doc[k] = v
        return _doc

    @classmethod
    def from_dict(cls, dict: Dict[str, Any], field_map: Optional[Dict[str, Any]] = None) -> Document:
        """
        Create Document from dict. An optional `field_map` parameter can be supplied to adjust for custom names of the keys in the
        input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
        they are serialized / stored in other places (e.g. elasticsearch).

        Keys that are neither constructor arguments nor listed in `field_map` are moved into `meta`.
        The input dict (including its nested `meta` dict) is left unmodified.

        Example:

        ```python
            my_dict = {"custom_content_field": "some text", "content_type": "text"}
            Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
        ```

        :param dict: Dict to create the Document from.
        :param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
        :return: A Document object
        """
        if not field_map:
            field_map = {}

        # NOTE: the parameter name `dict` shadows the builtin within this method
        _doc = dict.copy()
        init_args = ["content", "content_type", "id", "score", "id_hash_keys", "question", "meta", "embedding"]
        # `dict.copy()` is shallow, so `meta` needs its own copy: additional fields are written into it below
        # and must not leak into the caller's dict. A missing or None `meta` becomes an empty dict.
        meta = _doc.get("meta")
        _doc["meta"] = {**meta} if meta else {}
        # copy additional fields into "meta"
        for k, v in _doc.items():
            # Exclude internal fields (Pydantic, ...) fields from the conversion process
            if k.startswith("__"):
                continue
            if k not in init_args and k not in field_map:
                _doc["meta"][k] = v
        # remove additional fields from top level
        _new_doc = {}
        for k, v in _doc.items():
            if k in init_args:
                _new_doc[k] = v
            elif k in field_map:
                k = field_map[k]
                _new_doc[k] = v

        # Convert list of rows to DataFrame
        if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
            _new_doc["content"] = dataframe_from_list(_new_doc["content"])

        return cls(**_new_doc)

    def to_json(self, field_map: Optional[Dict[str, Any]] = None) -> str:
        """
        Serialize the Document to a JSON string.

        :param field_map: Optional key renaming, passed on to `to_dict()`.
        :return: JSON string of the dict returned by `to_dict()`.
        """
        if not field_map:
            field_map = {}
        dictionary = self.to_dict(field_map=field_map)
        return json.dumps(dictionary, cls=NumpyEncoder)

    @classmethod
    def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document:
        """
        Create a Document from a JSON string or from an already parsed dict.

        :param data: JSON string (parsed with `json.loads`) or dict.
        :param field_map: Optional key renaming, passed on to `from_dict()`.
        :return: A Document object
        """
        if not field_map:
            field_map = {}
        if isinstance(data, str):
            dict_data = json.loads(data)
        else:
            dict_data = data
        return cls.from_dict(dict_data, field_map=field_map)

    def __eq__(self, other):
        """
        Two Documents are equal if all of their attributes are equal.
        DataFrame contents are compared with `DataFrame.equals`, embeddings with `np.array_equal`.
        """
        if not isinstance(other, self.__class__):
            return False

        other_content = getattr(other, "content", None)
        self_is_table = isinstance(self.content, pd.DataFrame)
        other_is_table = isinstance(other_content, pd.DataFrame)
        if self_is_table or other_is_table:
            # `==` on a DataFrame is element-wise and has no unambiguous truth value, so DataFrames are only
            # ever compared via `equals`. A DataFrame never equals a non-DataFrame content.
            is_content_equal = self_is_table and other_is_table and other_content.equals(self.content)
        else:
            is_content_equal = other_content == self.content

        return (
            is_content_equal
            and getattr(other, "content_type", None) == self.content_type
            and getattr(other, "id", None) == self.id
            and getattr(other, "id_hash_keys", None) == self.id_hash_keys
            and getattr(other, "score", None) == self.score
            and getattr(other, "meta", None) == self.meta
            and np.array_equal(getattr(other, "embedding", None), self.embedding)
        )

    def __repr__(self):
        doc_dict = self.to_dict()
        embedding = doc_dict.get("embedding", None)
        # Only show the shape of the embedding instead of all of its values
        if embedding is not None:
            doc_dict["embedding"] = f"<embedding of shape {getattr(embedding, 'shape', '[no shape]')}>"
        return f"<Document: {str(doc_dict)}>"

    def __str__(self):
        # In some cases, self.content is None (therefore not subscriptable)
        if self.content is None:
            return f"<Document: id={self.id}, content=None>"
        # Only the first 100 items of the content are shown
        return f"<Document: id={self.id}, content='{self.content[:100]}{'...' if len(self.content) > 100 else ''}'>"

    def __lt__(self, other):
        """Enable sorting of Documents by score"""
        return self.score < other.score


@dataclass
class Span:
    """
    Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
    For extractive QA: Character where answer starts/ends

    :param start: Position where the span starts
    :param end:  Position where the span ends
    """

    start: int
    end: int

    def __contains__(self, value):
        """
        Checks for inclusion of the given value into the interval defined by Span.
        ```
            assert 10 in Span(5, 15)  # True
            assert 20 in Span(1, 15)  # False
        ```
        Includes the left edge, but not the right edge.
        ```
            assert 5 in Span(5, 15)   # True
            assert 15 in Span(5, 15)  # False
        ```
        Works for numbers and all values that can be safely converted into floats.
        ```
            assert 10.0 in Span(5, 15)   # True
            assert "10" in Span(5, 15)   # True
        ```
        It also works for Span objects, returning True only if the given
        Span is fully contained into the original Span.
        As for numerical values, the left edge is included, the right edge is not.
        ```
            assert Span(10, 11) in Span(5, 15)   # True
            assert Span(5, 10) in Span(5, 15)    # True
            assert Span(10, 15) in Span(5, 15)   # False
            assert Span(5, 15) in Span(5, 15)    # False
            assert Span(5, 14) in Span(5, 15)    # True
            assert Span(0, 1) in Span(5, 15)     # False
            assert Span(0, 10) in Span(5, 15)    # False
            assert Span(10, 20) in Span(5, 15)   # False
        ```

        :raises ValueError: If the value is neither a Span nor convertible to float.
        """
        if not isinstance(value, Span):
            # Anything that is not a Span is treated as a number
            try:
                value = float(value)
                return self.start <= value < self.end
            except Exception as e:
                raise ValueError(
                    f"Cannot use 'in' with a value of type {type(value)}. Use numeric values or Span objects."
                ) from e

        # Span in Span: the start may coincide, the end has to lie strictly inside
        starts_inside = self.start <= value.start
        return starts_inside and self.end > value.end


@dataclass
class TableCell:
    """
    Defining a table cell via the row and column index.

    :param row: Row index of the cell
    :param col: Column index of the cell
    """

    row: int
    col: int


@dataclass
class Answer:
    """
    The fundamental object in Haystack to represent any type of Answers (e.g. extractive QA, generative QA or TableQA).
    For example, it's used within some Nodes like the Reader, but also in the REST API.

    :param answer: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string.
    :param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
                 (i.e. we can locate an exact answer string in one of the documents) or from a generative model
                 (i.e. no pointer to a specific document, no offsets ...).
    :param score: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
                  In the range of [0,1], where 1 means extremely relevant.
    :param context: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...).
                    A list is converted into a DataFrame.
    :param offsets_in_document: List of `Span` (or `TableCell`) objects with start and end positions of the answer **in the
                                document** (as stored in the document store).
                                For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start`
                                For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start`
                                (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
    :param offsets_in_context: List of `Span` (or `TableCell`) objects with start and end positions of the answer **in the
                                context** (i.e. the surrounding text/table of a certain window size).
                                For extractive QA: Character where answer starts => `Answer.offsets_in_context[0].start`
                                For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start`
                                (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
    :param document_ids: IDs of the documents the answer came from (if any).
                                For extractive QA, this will be a list of length 1.
                                For generative QA, this will be a list of length > 0.
    :param meta: Dict that can be used to associate any kind of custom meta data with the answer.
                 In extractive QA, this will carry the meta data of the document where the answer was found.
                 None is replaced by an empty dict.
    """

    answer: str
    type: Literal["generative", "extractive", "other"] = "extractive"
    score: Optional[float] = None
    context: Optional[Union[str, DataFrame]] = None
    offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None
    offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None
    document_ids: Optional[List[str]] = None
    meta: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        # In case offsets are passed as dicts rather than Span or TableCell objects we convert them here
        # For example, this is used when instantiating an object via from_json()
        if self.offsets_in_document is not None:
            self.offsets_in_document = self._from_dict_offsets(self.offsets_in_document)

        if self.offsets_in_context is not None:
            self.offsets_in_context = self._from_dict_offsets(self.offsets_in_context)

        if self.meta is None:
            self.meta = {}

        # In case the context is a list of lists for a table document that is instantiated by from_json() or from_dict()
        if isinstance(self.context, list):
            self.context = dataframe_from_list(self.context)

    def __lt__(self, other):
        """Enable sorting of Answers by score"""
        return self.score < other.score

    def __str__(self):
        # self.context might be None (therefore not subscriptable)
        if self.context is None:
            return f"<Answer: answer='{self.answer}', score={self.score}, context=None>"
        # Only the first 50 items of the context are shown
        return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"

    def __repr__(self):
        return f"<Answer {self.to_dict()}>"

    def to_dict(self) -> Dict:
        """Convert the Answer (including nested dataclasses) to a dict."""
        return asdict(self, dict_factory=_dict_factory)

    @classmethod
    def from_dict(cls, dict: Dict) -> Answer:
        """
        Create an Answer from a dict of constructor arguments.

        :param dict: Constructor arguments. The legacy key `document_id` is still accepted.
        :return: An Answer object
        """
        # backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]`
        if "document_id" in dict:
            # Copy first so that the legacy key is not removed from the caller's dict
            dict = dict.copy()
            document_id = dict.pop("document_id")
            dict["document_ids"] = [document_id] if document_id is not None else None
        return cls(**dict)

    def to_json(self):
        """Serialize the Answer to a JSON string."""
        return json.dumps(self.to_dict(), cls=NumpyEncoder)

    @classmethod
    def from_json(cls, data: Union[str, Dict[str, Any]]):
        """
        Create an Answer from a JSON string or from an already parsed dict.

        :param data: JSON string (parsed with `json.loads`) or dict.
        :return: An Answer object
        """
        if isinstance(data, str):
            dict_data = json.loads(data)
        else:
            dict_data = data
        return cls.from_dict(dict_data)

    @staticmethod
    def _from_dict_offsets(offsets):
        """
        Convert offsets given as dicts into `TableCell` (dicts with a "row" key) or `Span` objects.
        Entries that are not dicts are kept as they are.
        """
        converted_offsets = []
        for e in offsets:
            if isinstance(e, dict):
                if "row" in e:  # is a TableCell
                    converted_offsets.append(TableCell(**e))
                else:
                    converted_offsets.append(Span(**e))
            else:
                converted_offsets.append(e)
        return converted_offsets

    def __eq__(self, other):
        """
        Two Answers are equal if all of their attributes are equal.
        DataFrame contexts are compared with `DataFrame.equals`.
        """
        if not isinstance(other, self.__class__):
            return False

        other_context = getattr(other, "context", None)
        self_is_table = isinstance(self.context, pd.DataFrame)
        other_is_table = isinstance(other_context, pd.DataFrame)
        if self_is_table or other_is_table:
            # `==` on a DataFrame is element-wise and has no unambiguous truth value, so DataFrames are only
            # ever compared via `equals`. A DataFrame never equals a non-DataFrame context.
            is_context_equal = self_is_table and other_is_table and other_context.equals(self.context)
        else:
            is_context_equal = other_context == self.context

        return (
            is_context_equal
            and getattr(other, "answer", None) == self.answer
            and getattr(other, "type", None) == self.type
            and getattr(other, "score", None) == self.score
            and getattr(other, "offsets_in_document", None) == self.offsets_in_document
            and getattr(other, "offsets_in_context", None) == self.offsets_in_context
            and getattr(other, "document_ids", None) == self.document_ids
            and getattr(other, "meta", None) == self.meta
        )


@dataclass
class Label:
    """
    Standardized representation of a label / feedback within Haystack.

    The constructor arguments are described in the docstring of `__init__`.
    """

    id: str
    query: str
    document: Document
    is_correct_answer: bool
    is_correct_document: bool
    origin: Literal["user-feedback", "gold-label"]
    answer: Optional[Answer] = None
    pipeline_id: Optional[str] = None
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
    meta: Optional[dict] = None
    # Note that filters cannot be of type Optional[FilterType] as assignments like `filters = {"name": "file_name"}`
    # won't work due to Dict's covariance. See https://github.com/python/mypy/issues/9418.
    filters: Optional[Dict[str, Any]] = None

    # We use a custom init here as we want some custom logic. The annotations above are however still needed in order
    # to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
    def __init__(
        self,
        query: str,
        document: Document,
        is_correct_answer: bool,
        is_correct_document: bool,
        origin: Literal["user-feedback", "gold-label"],
        answer: Optional[Answer],
        id: Optional[str] = None,
        pipeline_id: Optional[str] = None,
        created_at: Optional[str] = None,
        updated_at: Optional[str] = None,
        meta: Optional[dict] = None,
        filters: Optional[Dict[str, Any]] = None,
    ):
        """
        Object used to represent label/feedback in a standardized way within Haystack.
        This includes labels from dataset like SQuAD, annotations from labeling tools,
        or, user-feedback from the Haystack REST API.

        :param query: the question (or query) for finding answers.
        :param document: the Document this label refers to.
        :param answer: the answer object.
        :param is_correct_answer: whether the sample is positive or negative.
        :param is_correct_document: in case of negative sample(is_correct_answer is False), there could be two cases;
                                    incorrect answer but correct document & incorrect document. This flag denotes if
                                    the returned document was correct.
        :param origin: the source for the labels. It can be used to later for filtering.
        :param id: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically.
        :param pipeline_id: pipeline identifier (any str) that was involved for generating this label (in-case of user feedback).
        :param created_at: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
                           Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
                           If not supplied, the current time is used.
        :param updated_at: Timestamp of update with format yyyy-MM-dd HH:mm:ss.
                           Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
        :param meta: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed).
        :param filters: filters that should be applied to the query to rule out non-relevant documents. For example, if there are different correct answers
                        in a DocumentStore depending on the retrieved document and the answer in this label is correct only on condition of the filters.
        """

        # Create a unique ID (either new one, or one from user input)
        if id:
            self.id = str(id)
        else:
            self.id = str(uuid4())

        if created_at is None:
            created_at = time.strftime(LABEL_DATETIME_FORMAT)
        self.created_at = created_at

        self.updated_at = updated_at
        self.query = query

        self.answer = answer
        self.document = document

        self.is_correct_answer = is_correct_answer
        self.is_correct_document = is_correct_document
        self.origin = origin

        # TODO autofill answer.document_id if Document is provided

        self.pipeline_id = pipeline_id
        # A missing or empty meta always ends up as a new empty dict
        if not meta:
            self.meta = {}
        else:
            self.meta = meta
        self.filters = filters

    @property
    def no_answer(self) -> Optional[bool]:
        """
        Whether the label states that there is no answer: True if the answer text is None or blank,
        False if there is an answer text and None if the label has no answer object at all.
        """
        no_answer = None
        if self.answer is not None:
            no_answer = self.answer.answer is None or self.answer.answer.strip() == ""
        return no_answer

    def to_dict(self):
        """Convert the Label (including nested dataclasses) to a dict."""
        return asdict(self, dict_factory=_dict_factory)

    @classmethod
    def from_dict(cls, dict: Dict):
        """
        Create a Label from a dict of constructor arguments. Nested `answer` and `document` dicts are
        converted into `Answer` and `Document` objects. The input dict is left unmodified.

        :param dict: Constructor arguments.
        :return: A Label object
        """
        # Work on a shallow copy: the converted objects below must not replace the entries of the caller's dict.
        # NOTE: the parameter name `dict` shadows the builtin within this method
        init_kwargs = {**dict}
        answer = init_kwargs.get("answer")
        if answer and isinstance(answer, Dict):
            init_kwargs["answer"] = Answer.from_dict(answer)
        doc = init_kwargs.get("document")
        if isinstance(doc, Dict):
            init_kwargs["document"] = Document.from_dict(doc)
        return cls(**init_kwargs)

    def to_json(self):
        """Serialize the Label to a JSON string."""
        return json.dumps(self.to_dict(), cls=NumpyEncoder)

    @classmethod
    def from_json(cls, data: Union[str, Dict[str, Any]]):
        """
        Create a Label from a JSON string or from an already parsed dict.

        :param data: JSON string (parsed with `json.loads`) or dict.
        :return: A Label object
        """
        if isinstance(data, str):
            dict_data = json.loads(data)
        else:
            dict_data = data
        return cls.from_dict(dict_data)

    # define __eq__ and __hash__ functions to deduplicate Label Objects
    # Note that id, created_at, updated_at, meta and filters are not part of the comparison
    def __eq__(self, other):
        return (
            isinstance(other, self.__class__)
            and getattr(other, "query", None) == self.query
            and getattr(other, "answer", None) == self.answer
            and getattr(other, "is_correct_answer", None) == self.is_correct_answer
            and getattr(other, "is_correct_document", None) == self.is_correct_document
            and getattr(other, "origin", None) == self.origin
            and getattr(other, "document", None) == self.document
            and getattr(other, "no_answer", None) == self.no_answer
            and getattr(other, "pipeline_id", None) == self.pipeline_id
        )

    def __hash__(self):
        # Built from the string representations of the same attributes that __eq__ compares
        return hash(
            self.query
            + str(self.answer)
            + str(self.is_correct_answer)
            + str(self.is_correct_document)
            + str(self.origin)
            + str(self.document)
            + str(self.no_answer)
            + str(self.pipeline_id)
        )

    def __repr__(self):
        return f"<Label: {self.to_dict()}>"

    def __str__(self):
        return f"<Label: {self.to_dict()}>"


def is_positive_label(label):
    """
    Tell whether a label counts as positive: either both the answer and the document are marked as correct,
    or the label has no answer object and the document is marked as correct.
    """
    fully_correct = label.is_correct_answer and label.is_correct_document
    if fully_correct:
        return fully_correct
    # Document-only label: there is no answer that could be judged
    return label.answer is None and label.is_correct_document


class MultiLabel:
    def __init__(self, labels: List[Label], drop_negative_labels: bool = False, drop_no_answers: bool = False):
        """
        There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
        answers for one question or multiple documents contain the information you want for a query.
        This class is "syntactic sugar" that simplifies the work with such a list of related Labels.
        It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are
        automatically created at init time. For example, MultiLabel.no_answer allows you to easily access if any of the
        underlying Labels provided a text answer and therefore demonstrates that there is indeed a possible answer.

        :param labels: A list of labels that belong to a similar query and shall be "grouped" together
        :param drop_negative_labels: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI)
        :param drop_no_answers: Whether to drop labels that specify the answer is impossible
        :raises ValueError: If the remaining labels do not all share the same query and the same filters.
        :raises IndexError: If no label is left after deduplication and dropping.
        """
        # drop duplicate labels (keeping the first occurrence and the order) and remove negative labels if needed.
        labels = list(dict.fromkeys(labels))
        if drop_negative_labels:
            labels = [l for l in labels if is_positive_label(l)]
        if drop_no_answers:
            labels = [l for l in labels if l.no_answer is False]

        self._labels = labels
        self._query = self._aggregate_labels(key="query", must_be_single_value=True)[0]
        self._filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0]
        # The id is derived from query and filters only, so it is the same for every group with that pair
        self.id = hashlib.md5((self.query + json.dumps(self.filters, sort_keys=True)).encode()).hexdigest()

        # Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let
        # users decided which aggregation logic they want
        self._no_answer = all(l.no_answer for l in self._labels)

        # Answer strings and offsets cleaned for no_answers:
        # If there are only no_answers, offsets are empty and answers will be a single empty string
        # which equals the no_answers representation of reader nodes.
        if self._no_answer:
            self._answers = [""]
            self._offsets_in_documents: List[dict] = []
            self._offsets_in_contexts: List[dict] = []
        else:
            answered = [l.answer for l in self._labels if not l.no_answer and l.answer is not None]
            self._answers = [answer.answer for answer in answered]
            self._offsets_in_documents = []
            self._offsets_in_contexts = []
            for answer in answered:
                if answer.offsets_in_document is not None:
                    for span in answer.offsets_in_document:
                        self._offsets_in_documents.append(self._to_dict_offsets(span))
                if answer.offsets_in_context is not None:
                    for span in answer.offsets_in_context:
                        self._offsets_in_contexts.append(self._to_dict_offsets(span))

        # There are two options here to represent document_ids:
        # taking the id from the document of each label or taking the document_id of each label's answer.
        # We take the former as labels without answers are allowed.
        #
        # For no_answer cases document_store.add_eval_data() currently adds all documents coming from the SQuAD paragraph's context
        # as separate no_answer labels, and thus with document.id but without answer.document_id.
        # If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer.
        # Hence, we exclude them here as well.
        self._document_ids = [l.document.id for l in self._labels if not l.no_answer]
        self._contexts = [str(l.document.content) for l in self._labels if not l.no_answer]

    @staticmethod
    def _to_dict_offsets(offset: Union[Span, TableCell]) -> Dict:
        """Convert an offset into a plain dict: row/col for a `TableCell`, start/end otherwise."""
        if isinstance(offset, TableCell):
            return {"row": offset.row, "col": offset.col}
        else:
            return {"start": offset.start, "end": offset.end}

    @property
    def labels(self):
        """The deduplicated (and optionally filtered) labels of this group."""
        return self._labels

    @property
    def query(self):
        """The single query shared by all labels."""
        return self._query

    @property
    def filters(self):
        """The single filters value shared by all labels."""
        return self._filters

    @property
    def document_ids(self):
        """The document ids of all labels that are not no_answer labels."""
        return self._document_ids

    @property
    def contexts(self):
        """The document contents (as strings) of all labels that are not no_answer labels."""
        return self._contexts

    @property
    def no_answer(self):
        """True only if every label is a no_answer label."""
        return self._no_answer

    @property
    def answers(self):
        """The answer strings of the labels, or `[""]` if all labels are no_answer labels."""
        return self._answers

    @property
    def offsets_in_documents(self):
        """The answer offsets within the documents, as dicts (empty if all labels are no_answer labels)."""
        return self._offsets_in_documents

    @property
    def offsets_in_contexts(self):
        """The answer offsets within the contexts, as dicts (empty if all labels are no_answer labels)."""
        return self._offsets_in_contexts

    def _aggregate_labels(self, key, must_be_single_value=True) -> List[Any]:
        """
        Collect the unique values of the attribute `key` across all labels.

        :param key: Name of the Label attribute to aggregate.
        :param must_be_single_value: Whether to raise if more than one distinct value is found.
        :return: The list of unique values (empty if there are no labels).
        :raises ValueError: If `must_be_single_value` is True and multiple different values are found.
        """
        values = [getattr(l, key) for l in self.labels]
        if any(isinstance(value, dict) for value in values):
            # dict is not hashable so we collect unique values via looping through all of them
            unique_values = []
            for value in values:
                if value not in unique_values:
                    unique_values.append(value)
        else:
            unique_values = list(set(values))
        if must_be_single_value and len(unique_values) > 1:
            raise ValueError(
                f"Tried to combine attribute '{key}' of Labels, but found multiple different values: {unique_values}"
            )
        return unique_values

    def to_dict(self):
        """Return all instance attributes as a dict, keyed by property name, with the labels converted to dicts."""
        # convert internal attribute names to property names
        result = {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()}
        # convert Label object to dict
        result["labels"] = [label.to_dict() for label in result["labels"]]
        return result

    @classmethod
    def from_dict(cls, dict: Dict):
        """
        Create a MultiLabel from a dictionary such as the one returned by `to_dict()`.

        Only keys that are parameters of the constructor are used; all derived values are recomputed.
        """
        # exclude extra arguments
        inputs = {k: v for k, v in dict.items() if k in inspect.signature(cls).parameters}
        inputs["labels"] = [Label.from_dict(label) for label in inputs["labels"]]
        return cls(**inputs)

    def to_json(self):
        """Serialize the dictionary form of this MultiLabel to a JSON string."""
        return json.dumps(self.to_dict(), default=pydantic_encoder)

    @classmethod
    def from_json(cls, data: Union[str, Dict[str, Any]]):
        """Create a MultiLabel from a JSON string or from an already parsed dictionary."""
        if isinstance(data, str):
            dict_data = json.loads(data)
        else:
            dict_data = data
        return cls.from_dict(dict_data)

    def __eq__(self, other):
        return isinstance(other, self.__class__) and self.labels == other.labels

    def __repr__(self):
        return f"<MultiLabel: {self.to_dict()}>"

    def __str__(self):
        return f"<MultiLabel: {self.to_dict()}>"


def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any:
    """
    Builds a pydantic dataclass (including nested dataclasses) from a dict.
    This makes it easy to de-serialize pydantic dataclasses from json.
    :param dict: Dict with all attributes and values of the dataclass.
    :param pydantic_dataclass_type: The dataclass type to construct (e.g. Document)
    """
    # Parse the dict with the dataclass' underlying pydantic model first ...
    parsed_model = pydantic_dataclass_type.__pydantic_model__.parse_obj(dict)
    # ... then hand the value of every model field over to the dataclass constructor
    field_values = {field_name: getattr(parsed_model, field_name) for field_name in parsed_model.__fields__}
    return pydantic_dataclass_type(**field_values)


def _dict_factory(data):
    """Intended as the dict_factory of `asdict`, which calls it to turn a list of (key, value) pairs
    into a dictionary. Values that are pandas DataFrames are converted into a list of lists.

    :param data: list of (key, value) pairs
    """
    return {
        key: dataframe_to_list(value) if isinstance(value, pd.DataFrame) else value for key, value in data
    }


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that additionally serializes numpy arrays as (nested) lists."""

    def default(self, obj):
        if not isinstance(obj, np.ndarray):
            # Everything else is left to the base implementation
            return json.JSONEncoder.default(self, obj)
        return obj.tolist()


def dataframe_to_list(df: pd.DataFrame) -> List[List]:
    """Convert a DataFrame into a list of lists: the column names first, followed by one list per row."""
    header = df.columns.tolist()
    rows = df.values.tolist()
    return [header, *rows]


def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
    """Build a DataFrame from a list of lists whose first entry holds the column names and the rest the rows."""
    header = list_df[0]
    rows = list_df[1:]
    return pd.DataFrame(data=rows, columns=header)


class EvaluationResult:
    def __init__(self, node_results: Optional[Dict[str, DataFrame]] = None) -> None:
        """
        A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
        Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
        convenience methods to work with them.
        For example, you can calculate eval metrics, get detailed reports, or simulate different top_k settings:

        ```python
        eval_results = pipeline.eval(...)

        # derive detailed metrics
        eval_results.calculate_metrics()

        # show summary of incorrect queries
        eval_results.wrong_examples()
        ```

        Each row of the underlying DataFrames contains either an answer or a document that has been retrieved during evaluation.
        Rows are enriched with basic information like rank, query, type, or node.
        Additional answer or document-specific evaluation information, like gold labels
        and metrics showing whether the row matches the gold labels, are included, too.
        The DataFrames have the following schema:
        - multilabel_id: The ID of the multilabel, which is unique for the pair of query and filters.
        - query: The actual query string.
        - filters: The filters used with the query.
        - gold_answers (answers only): The expected answers.
        - answer (answers only): The actual answer.
        - context: The content of the document (the surrounding context of the answer for QA).
        - exact_match (answers only): A metric showing if the answer exactly matches the gold label.
        - f1 (answers only): A metric showing how well the answer overlaps with the gold label on a token basis.
        - sas (answers only, optional): A metric showing how well the answer matches the gold label on a semantic basis.
        - exact_match_context_scope (answers only): exact_match with enforced context match.
        - f1_context_scope (answers only): f1 with enforced context scope match.
        - sas_context_scope (answers only): sas with enforced context scope match.
        - exact_match_document_scope (answers only): exact_match with enforced document scope match.
        - f1_document_scope (answers only): f1 with enforced document scope match.
        - sas_document_scope (answers only): sas with enforced document scope match.
        - exact_match_document_id_and_context_scope: (answers only): exact_match with enforced document and context scope match.
        - f1_document_id_and_context_scope (answers only): f1 with enforced document and context scope match.
        - sas_document_id_and_context_scope (answers only): sas with enforced document and context scope match.
        - gold_contexts: The contents of the gold documents.
        - gold_id_match (documents only): A metric showing whether one of the gold document IDs matches the document.
        - context_match (documents only): A metric showing whether one of the gold contexts matches the document content.
        - answer_match (documents only): A metric showing whether the document contains the answer.
        - gold_id_or_answer_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'answer_match'`.
        - gold_id_and_answer_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'answer_match'`.
        - gold_id_or_context_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'context_match'`.
        - gold_id_and_context_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'context_match'`.
        - gold_id_and_context_and_answer_match (documents only): A Boolean operation specifying that there should be `'gold_id_match' AND 'context_match' AND 'answer_match'`.
        - context_and_answer_match (documents only): A Boolean operation specifying that there should be both `'context_match' AND 'answer_match'`.
        - rank: A rank or 1-based-position in the result list.
        - document_id: The ID of the document that has been retrieved or that contained the answer.
        - gold_document_ids: The IDs of the documents to be retrieved.
        - custom_document_id: The custom ID of the document (specified by `custom_document_id_field`) that has been retrieved or that contained the answer.
        - gold_custom_document_ids: The custom documents IDs (specified by `custom_document_id_field`) to be retrieved.
        - offsets_in_document (answers only): The position or offsets within the document where the answer was found.
        - gold_offsets_in_documents (answers only): The position or offsets of the gold answer within the document.
        - gold_answers_exact_match (answers only): exact_match values per gold_answer.
        - gold_answers_f1 (answers only): f1 values per gold_answer.
        - gold_answers_sas (answers only): sas values per gold answer.
        - gold_documents_id_match: The document ID match per gold label (if `custom_document_id_field` has been specified, custom IDs are used).
        - gold_contexts_similarity: Context similarity per gold label.
        - gold_answers_match (documents only): Specifies whether the document contains an answer per gold label.
        - type: Possible values: 'answer' or 'document'.
        - node: The node name
        - eval_mode: Specifies whether the evaluation was executed in integrated or isolated mode.
                     Check pipeline.eval()'s add_isolated_node_eval parameter for more information.

        :param node_results: The evaluation Dataframes per pipeline node.
        """
        self.node_results: Dict[str, DataFrame] = {} if node_results is None else node_results

    def __getitem__(self, key: str):
        return self.node_results.__getitem__(key)

    def __delitem__(self, key: str):
        self.node_results.__delitem__(key)

    def __setitem__(self, key: str, value: DataFrame):
        self.node_results.__setitem__(key, value)

    def __contains__(self, key: str):
        return self.node_results.keys().__contains__(key)

    def __len__(self):
        return self.node_results.__len__()

    def append(self, key: str, value: DataFrame):
        if value is not None and len(value) > 0:
            if key in self.node_results:
                self.node_results[key] = pd.concat([self.node_results[key], value]).reset_index(drop=True)
            else:
                self.node_results[key] = value

    def calculate_metrics(
        self,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        document_scope: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
        eval_mode: Literal["integrated", "isolated"] = "integrated",
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> Dict[str, Dict[str, float]]:
        """
        Calculates proper metrics for each node.

        For Nodes that return Documents, the default metrics are:
        - mrr (`Mean Reciprocal Rank <https://en.wikipedia.org/wiki/Mean_reciprocal_rank>`_)
        - map (`Mean Average Precision <https://en.wikipedia.org/wiki/Evaluation_measures_%28information_retrieval%29#Mean_average_precision>`_)
        - ndcg (`Normalized Discounted Cumulative Gain <https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_)
        - precision (Precision: How many of the returned documents were relevant?)
        - recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?)
        - recall_single_hit (Recall for Question Answering: How many of the queries returned at least one relevant document?)

        For Nodes that return answers, the default metrics are:
        - exact_match (How many of the queries returned the exact answer?)
        - f1 (How well do the returned results overlap with any gold answer on a token basis?)
        - sas, if a SAS model has been provided when calling `pipeline.eval()` (How semantically similar is the prediction to the gold answers?)

        During the eval run, you can simulate lower top_k values for Reader and Retriever than the actual values.
        For example, you can calculate `top_1_f1` for Reader nodes by setting `simulated_top_k_reader=1`.

        If you applied `simulated_top_k_retriever` to a Reader node, you should treat the results with caution as they can differ from an actual eval run with a corresponding `top_k_retriever` heavily.

        :param simulated_top_k_reader: Simulates the `top_k` parameter of the Reader.
        :param simulated_top_k_retriever: Simulates the `top_k` parameter of the Retriever.
            Note: There might be a discrepancy between simulated Reader metrics and an actual Pipeline run with Retriever `top_k`.
        :param eval_mode: The input the Node was evaluated on.
            Usually a Node gets evaluated on the prediction provided by its predecessor Nodes in the Pipeline (`value='integrated'`).
            However, as the quality of the Node can heavily depend on the Node's input and thus the predecessor's quality,
            you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your Node.
            For example, when evaluating the Reader, use `value='isolated'` to simulate a perfect Retriever in an ExtractiveQAPipeline.
            Possible values are: `integrated`, `isolated`.
            The default value is `integrated`.
        :param document_scope: A criterion for deciding whether documents are relevant or not.
            You can select between:
            - 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
                    A typical use case is Document Retrieval.
            - 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
                    A typical use case is Document-Independent Passage Retrieval.
            - 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
                    A typical use case is Document-Specific Passage Retrieval.
            - 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
                    A typical use case is Document Retrieval having sparse context labels.
            - 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
                    A typical use case is Question Answering.
            - 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
                    This is intended to be a proper default value in order to support both main use cases:
                    - Document Retrieval
                    - Question Answering
            The default value is 'document_id_or_answer'.
        :param answer_scope: Specifies the scope in which a matching answer is considered correct.
            You can select between:
            - 'any' (default): Any matching answer is considered correct.
            - 'context': The answer is only considered correct if its context matches as well.
                    Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
            - 'document_id': The answer is only considered correct if its document ID matches as well.
                    You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
            - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
            The default value is 'any'.
            In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
        """
        return {
            node: self._calculate_node_metrics(
                df,
                simulated_top_k_reader=simulated_top_k_reader,
                simulated_top_k_retriever=simulated_top_k_retriever,
                document_scope=document_scope,
                answer_scope=answer_scope,
                eval_mode=eval_mode,
            )
            for node, df in self.node_results.items()
        }

    def wrong_examples(
        self,
        node: str,
        n: int = 3,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        document_scope: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
        document_metric: str = "recall_single_hit",
        answer_metric: str = "f1",
        document_metric_threshold: float = 0.5,
        answer_metric_threshold: float = 0.5,
        eval_mode: Literal["integrated", "isolated"] = "integrated",
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> List[Dict]:
        """
        Returns the worst performing queries.
        Worst performing queries are calculated based on the metric
        that is either a document metric or an answer metric according to the node type.

        Lower top_k values for reader and retriever than the actual values during the eval run can be simulated.
        See calculate_metrics() for more information.

        :param simulated_top_k_reader: simulates top_k param of reader
        :param simulated_top_k_retriever: simulates top_k param of retriever.
            remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k
        :param document_metric: the document metric worst queries are calculated with.
            values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision'
        :param answer_metric: the answer metric worst queries are calculated with.
            values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model.
        :param document_metric_threshold: the threshold for the document metric (only samples below selected metric
        threshold will be considered)
        :param answer_metric_threshold: the threshold for the answer metric (only samples below selected metric
        threshold will be considered)
        :param eval_mode: the input on which the node was evaluated on.
            Usually nodes get evaluated on the prediction provided by its predecessor nodes in the pipeline (value='integrated').
            However, as the quality of the node itself can heavily depend on the node's input and thus the predecessor's quality,
            you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your node.
            For example when evaluating the reader use value='isolated' to simulate a perfect retriever in an ExtractiveQAPipeline.
            Values can be 'integrated', 'isolated'.
            Default value is 'integrated'.
        :param document_scope: A criterion for deciding whether documents are relevant or not.
            You can select between:
            - 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
                    A typical use case is Document Retrieval.
            - 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
                    A typical use case is Document-Independent Passage Retrieval.
            - 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
                    A typical use case is Document-Specific Passage Retrieval.
            - 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
                    A typical use case is Document Retrieval having sparse context labels.
            - 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
                    A typical use case is Question Answering.
            - 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
                    This is intended to be a proper default value in order to support both main use cases:
                    - Document Retrieval
                    - Question Answering
            The default value is 'document_id_or_answer'.
        :param answer_scope: Specifies the scope in which a matching answer is considered correct.
            You can select between:
            - 'any' (default): Any matching answer is considered correct.
            - 'context': The answer is only considered correct if its context matches as well.
                    Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
            - 'document_id': The answer is only considered correct if its document ID matches as well.
                    You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
            - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
            The default value is 'any'.
            In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
        """
        node_df = self.node_results[node]
        node_df = self._filter_eval_mode(node_df, eval_mode)

        answers = node_df[node_df["type"] == "answer"]
        if len(answers) > 0:
            metrics_df = self._build_answer_metrics_df(
                answers,
                simulated_top_k_reader=simulated_top_k_reader,
                simulated_top_k_retriever=simulated_top_k_retriever,
                answer_scope=answer_scope,
            )
            worst_df = metrics_df.sort_values(by=[answer_metric]).head(n)
            wrong_examples = []
            for multilabel_id, metrics in worst_df.iterrows():
                query_answers = answers[answers["multilabel_id"] == multilabel_id]
                if answer_metric not in metrics:
                    logger.warning(
                        "You specified an answer_metric=%s not available in calculated metrics=%s."
                        "Skipping collection of worst performing samples.",
                        answer_metric,
                        metrics.keys(),
                    )
                    break
                if metrics[answer_metric] <= answer_metric_threshold:
                    query_dict = {
                        "multilabel_id": query_answers["multilabel_id"].iloc[0],
                        "query": query_answers["query"].iloc[0],
                        "filters": query_answers["filters"].iloc[0],
                        "metrics": metrics.to_dict(),
                        "answers": query_answers.drop(
                            ["node", "query", "type", "gold_answers", "gold_offsets_in_documents", "gold_document_ids"],
                            axis=1,
                        ).to_dict(orient="records"),
                        "gold_answers": query_answers["gold_answers"].iloc[0],
                        "gold_document_ids": query_answers["gold_document_ids"].iloc[0],
                    }
                    wrong_examples.append(query_dict)
            return wrong_examples

        documents = node_df[node_df["type"] == "document"]
        if len(documents) > 0:
            document_relevance_criterion = self._get_document_relevance_criterion(
                document_scope=document_scope, answer_scope=answer_scope
            )
            metrics_df = self._build_document_metrics_df(
                documents,
                simulated_top_k_retriever=simulated_top_k_retriever,
                document_relevance_criterion=document_relevance_criterion,
            )
            worst_df = metrics_df.sort_values(by=[document_metric]).head(n)
            wrong_examples = []
            for multilabel_id, metrics in worst_df.iterrows():
                if document_metric not in metrics:
                    logger.warning(
                        "You specified a document_metric=%s not available in calculated metrics=%s."
                        "Skipping collection of worst performing samples.",
                        document_metric,
                        metrics.keys(),
                    )
                    break
                if metrics[document_metric] <= document_metric_threshold:
                    query_documents = documents[documents["multilabel_id"] == multilabel_id]
                    query_dict = {
                        "multilabel_id": query_documents["multilabel_id"].iloc[0],
                        "query": query_documents["query"].iloc[0],
                        "filters": query_documents["filters"].iloc[0],
                        "metrics": metrics.to_dict(),
                        "documents": query_documents.drop(
                            ["node", "query", "multilabel_id", "filters", "type", "gold_document_ids", "gold_contexts"],
                            axis=1,
                        ).to_dict(orient="records"),
                        "gold_document_ids": query_documents["gold_document_ids"].iloc[0],
                    }
                    wrong_examples.append(query_dict)
            return wrong_examples

        return []

    def _get_document_relevance_criterion(
        self,
        document_scope: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> Literal[
        "document_id",
        "context",
        "document_id_and_context",
        "document_id_or_context",
        "answer",
        "context_and_answer",
        "document_id_and_answer",
        "document_id_and_context_and_answer",
        "document_id_or_answer",
    ]:
        """
        Combines document_scope and answer_scope to create the document_relevance_criterion.
        """
        answer_scope_to_doc_relevance_crit = {
            "context": "context_and_answer",
            "document_id": "document_id_and_answer",
            "document_id_and_context": "document_id_and_context_and_answer",
        }

        document_relevance_criterion: str = document_scope
        if document_scope in ["answer", "document_id_or_answer"]:
            document_relevance_criterion = answer_scope_to_doc_relevance_crit.get(answer_scope, document_scope)
        elif answer_scope in answer_scope_to_doc_relevance_crit.keys():
            logger.warning(
                "You specified a non-answer document_scope together with a non-default answer_scope. "
                "This may result in inconsistencies between answer and document metrics. "
                "To enforce the same definition of correctness for both, document_scope must be one of 'answer', 'document_id_or_answer'."
            )

        return document_relevance_criterion  # type: ignore[return-value]

    def _calculate_node_metrics(
        self,
        df: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        document_scope: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
        eval_mode: str = "integrated",
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> Dict[str, float]:
        df = self._filter_eval_mode(df, eval_mode)

        answer_metrics = self._calculate_answer_metrics(
            df,
            simulated_top_k_reader=simulated_top_k_reader,
            simulated_top_k_retriever=simulated_top_k_retriever,
            answer_scope=answer_scope,
        )

        document_relevance_criterion = self._get_document_relevance_criterion(
            document_scope=document_scope, answer_scope=answer_scope
        )
        document_metrics = self._calculate_document_metrics(
            df,
            simulated_top_k_retriever=simulated_top_k_retriever,
            document_relevance_criterion=document_relevance_criterion,
        )

        return {**answer_metrics, **document_metrics}

    def _filter_eval_mode(self, df: DataFrame, eval_mode: str) -> DataFrame:
        if "eval_mode" in df.columns:
            df = df[df["eval_mode"] == eval_mode]
        else:
            logger.warning("eval dataframe has no eval_mode column. eval_mode param will be ignored.")
        return df

    def _calculate_answer_metrics(
        self,
        df: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> Dict[str, float]:
        answers = df[df["type"] == "answer"]
        if len(answers) == 0:
            return {}

        metrics_df = self._build_answer_metrics_df(
            answers,
            simulated_top_k_reader=simulated_top_k_reader,
            simulated_top_k_retriever=simulated_top_k_retriever,
            answer_scope=answer_scope,
        )
        num_examples_for_eval = len(answers["multilabel_id"].unique())
        result = {metric: metrics_df[metric].mean().tolist() for metric in metrics_df.columns}
        result["num_examples_for_eval"] = float(num_examples_for_eval)  # formatter requires float
        return result

    def _build_answer_metrics_df(
        self,
        answers: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> DataFrame:
        """
        Builds a dataframe containing answer metrics (columns) per multilabel (index).
        Answer metrics are:
        - exact_match (Did the query exactly return any gold answer? -> 1.0 or 0.0)
        - f1 (How well does the best matching returned results overlap with any gold answer on token basis?)
        - sas if a SAS model has been provided during pipeline.eval() (How semantically similar is the prediction to the gold answers?)
        """
        multilabel_ids = answers["multilabel_id"].unique()
        # simulate top k retriever
        if simulated_top_k_retriever != -1:
            documents = self._get_documents_df()

            top_k_documents = documents[documents["rank"] <= simulated_top_k_retriever]
            simulated_answers = []
            for multilabel_id in multilabel_ids:
                top_k_document_ids = top_k_documents[top_k_documents["multilabel_id"] == multilabel_id][
                    "document_id"
                ].unique()
                query_answers = answers[answers["multilabel_id"] == multilabel_id]
                # consider only the answers within simulated_top_k_retriever documents

                simulated_query_answers = query_answers[
                    query_answers["document_ids"].apply(
                        lambda document_ids, top_k_document_ids=top_k_document_ids: all(
                            document_id in top_k_document_ids for document_id in document_ids
                        )
                    )
                ]
                # simulate top k reader
                if simulated_top_k_reader != -1:
                    # consider only the simulated_top_k_reader answers within simulated_query_answers
                    simulated_query_answers = simulated_query_answers.nsmallest(simulated_top_k_reader, "rank")
                simulated_query_answers["rank"] = np.arange(1, len(simulated_query_answers) + 1)
                simulated_answers.append(simulated_query_answers)
            answers = pd.concat(simulated_answers)
        # simulate top k reader
        elif simulated_top_k_reader != -1:
            answers = answers[answers["rank"] <= simulated_top_k_reader]

        # build metrics df
        answer_metrics = ["exact_match", "f1", "sas"]
        df_records = []

        for multilabel_id in multilabel_ids:
            query_df = answers[answers["multilabel_id"] == multilabel_id]
            metric_to_scoped_col = {
                metric: f"{metric}_{answer_scope}_scope" if answer_scope != "any" else metric
                for metric in answer_metrics
                if metric in query_df.columns
            }
            query_metrics = {
                metric: query_df[col].max() if any(query_df) else 0.0 for metric, col in metric_to_scoped_col.items()
            }
            df_records.append(query_metrics)

        metrics_df = DataFrame.from_records(df_records, index=multilabel_ids)
        return metrics_df

    def _get_documents_df(self):
        document_dfs = [
            node_df for node_df in self.node_results.values() if len(node_df[node_df["type"] == "document"]) > 0
        ]
        if len(document_dfs) != 1:
            raise ValueError("cannot detect retriever dataframe")
        documents_df = document_dfs[0]
        documents_df = documents_df[documents_df["type"] == "document"]
        return documents_df

    def _calculate_document_metrics(
        self,
        df: DataFrame,
        simulated_top_k_retriever: int = -1,
        document_relevance_criterion: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "context_and_answer",
            "document_id_and_answer",
            "document_id_and_context_and_answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
    ) -> Dict[str, float]:
        documents = df[df["type"] == "document"]
        if len(documents) == 0:
            return {}

        metrics_df = self._build_document_metrics_df(
            documents,
            simulated_top_k_retriever=simulated_top_k_retriever,
            document_relevance_criterion=document_relevance_criterion,
        )

        return {metric: metrics_df[metric].mean().tolist() for metric in metrics_df.columns}

    def _build_document_metrics_df(
        self,
        documents: DataFrame,
        simulated_top_k_retriever: int = -1,
        document_relevance_criterion: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "context_and_answer",
            "document_id_and_answer",
            "document_id_and_context_and_answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
    ) -> DataFrame:
        """
        Builds a dataframe containing document metrics (columns) per pair of query and gold document ids (index).
        Document metrics are:
        - mrr (Mean Reciprocal Rank: see https://en.wikipedia.org/wiki/Mean_reciprocal_rank)
        - map (Mean Average Precision: see https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision)
        - precision (Precision: How many of the returned documents were relevant?)
        - recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?)
        - recall_single_hit (Recall for Question Answering: Did the query return at least one relevant document? -> 1.0 or 0.0)

        :param documents: document eval dataframe
        :param simulated_top_k_retriever: simulates top_k param of retriever.
        :param document_relevance_criterion: criterion for deciding whether documents are relevant or not.
            You can select between:
            - 'document_id': Document's id or custom id must match.
                    Typical use case: Document Retrieval
            - 'context': Document's content must match.
                    Typical use case: Document-independent Passage Retrieval
            - 'document_id_and_context': boolean operation `'document_id' AND 'context'`.
                    Typical use case: Document-specific Passage Retrieval
            - 'document_id_or_context': boolean operation `'document_id' OR 'context'`.
                    Typical use case: Document Retrieval having sparse context labels
            - 'answer': Document's content must include the answer.
                    Typical use case: Question Answering
            - 'document_id_or_answer' (default): boolean operation `'document_id' OR 'answer'`.
                    This is intended to be a proper default value in order to support both main use cases:
                    - Document Retrieval
                    - Question Answering
            - 'context_and_answer': boolean operation `'context' AND 'answer'`.
                    Typical use case: Question Answering with context-specific answers (see answer_scope='context')
            - 'document_id_and_answer': boolean operation `'document_id' AND 'answer'`.
                    Typical use case: Question Answering with document-specific answers (see answer_scope='document_id')
            - 'document_id_and_context_and_answer': boolean operation `'document_id' AND 'context' and 'answer'`.
                    Typical use case: Question Answering with document-and-context-specific answers (see answer_scope='document_id_and_context')
            Default value is 'document_id_or_answer'.
        """
        if simulated_top_k_retriever != -1:
            documents = documents[documents["rank"] <= simulated_top_k_retriever]

        # find out which label matched
        def find_matched_label_idxs(row) -> List[int]:  # pylint: disable=too-many-return-statements
            id_matches = [idx for idx, val in enumerate(row["gold_documents_id_match"]) if val == 1.0]
            context_matches = [
                idx for idx, val in enumerate(row["gold_contexts_similarity"]) if val > 65.0
            ]  # TODO: hardcoded threshold for now, will be param of calculate_metrics
            answer_matches = [idx for idx, val in enumerate(row["gold_answers_match"]) if val == 1.0]
            if document_relevance_criterion == "document_id":
                return id_matches
            elif document_relevance_criterion == "context":
                return context_matches
            elif document_relevance_criterion == "answer":
                return answer_matches
            elif document_relevance_criterion == "document_id_and_context":
                return list(set(id_matches) & set(context_matches))
            elif document_relevance_criterion == "document_id_or_context":
                return list(set(id_matches) | set(context_matches))
            elif document_relevance_criterion == "document_id_and_answer":
                return list(set(id_matches) & set(answer_matches))
            elif document_relevance_criterion == "document_id_or_answer":
                return list(set(id_matches) | set(answer_matches))
            elif document_relevance_criterion == "context_and_answer":
                return list(set(context_matches) & set(answer_matches))
            elif document_relevance_criterion == "document_id_and_context_and_answer":
                return list(set(id_matches) & set(context_matches) & set(answer_matches))
            else:
                raise ValueError(f"document_relevance_criterion '{document_relevance_criterion}' not supported.")

        documents["matched_label_idxs"] = documents.apply(find_matched_label_idxs, axis=1)

        metrics = []

        for multilabel_id in documents["multilabel_id"].unique():
            query_df = documents[documents["multilabel_id"] == multilabel_id]

            # Note: Metrics are always calculated on document_ids.
            # For some document relevance criteria (e.g. context), the gold_document_ids are not enough or not useful at all.
            # So, we have to adjust the relevant ids according to the document_relevance_criterion.
            relevance_criterion_col = f"{document_relevance_criterion.replace('document_id', 'gold_id')}_match"
            relevant_rows = query_df[query_df[relevance_criterion_col] == 1]

            # all labels without no_answers
            # we need to match all (except for single hit recall)
            gold_document_ids = (
                list(query_df["gold_custom_document_ids"].iloc[0])
                if "gold_custom_document_ids" in query_df
                else list(query_df["gold_document_ids"].iloc[0])
            )
            # remove no_answer label
            gold_document_ids = [id for id in gold_document_ids if id != "00"]

            num_labels = len(gold_document_ids)
            num_matched_labels = len({idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs})
            num_missing_labels = num_labels - num_matched_labels

            relevance_criterion_ids = list(relevant_rows["document_id"].values)
            num_relevants = len(set(relevance_criterion_ids)) + num_missing_labels

            num_retrieved = len(query_df["document_id"])
            num_retrieved_relevants = len(relevant_rows)
            rank_retrieved_relevants = relevant_rows["rank"].values

            if num_labels == 0:
                # For no_answer queries, we set all metrics to 1.0, to indicate that the retriever cannot improve the pipeline.
                # This behavior is different from pytrec_eval, which sets the metrics to 0.0 if there is no relevant document in the evalset.
                rr = 1.0
                avg_precision = 1.0
                recall_multi_hit = 1.0
                recall_single_hit = 1.0
                precision = 1.0
                ndcg = 1.0
            elif num_retrieved_relevants == 0:
                # Set all metrics to 0.0 if no relevant document has been retrieved to avoid undefined metrics.
                rr = 0.0
                avg_precision = 0.0
                recall_multi_hit = 0.0
                recall_single_hit = 0.0
                precision = 0.0
                ndcg = 0.0
            else:
                # The previous checks ensure:
                # - `num_labels` > 0
                # - `num_retrieved_relevants` > 0
                # - `num_relevants` > 0  (`num_relevants` is always >= `num_labels`)
                # - `num_retrieved` > 0  (`num_retrieved` is always >= `num_retrieved_relevants`)
                # - `len(rank_retrieved_relevants)` > 0 (`len(rank_retrieved_relevants)` is always == `num_retrieved_relevants`)
                avp_retrieved_relevants = [
                    len(relevant_rows[relevant_rows["rank"] <= rank]) / rank for rank in rank_retrieved_relevants
                ]
                avg_precision = np.sum(avp_retrieved_relevants) / num_relevants
                recall_multi_hit = num_matched_labels / num_labels
                recall_single_hit = 1.0
                precision = num_retrieved_relevants / num_retrieved
                rr = 1.0 / rank_retrieved_relevants.min()
                dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
                idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)])
                ndcg = dcg / idcg

            metrics.append(
                {
                    "recall_multi_hit": recall_multi_hit,
                    "recall_single_hit": recall_single_hit,
                    "precision": precision,
                    "map": avg_precision,
                    "mrr": rr,
                    "ndcg": ndcg,
                }
            )

        metrics_df = DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
        return metrics_df

    def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
        """
        Saves the evaluation result.
        Every node's result is written to its own csv file named {node_name}.csv inside the out_dir folder.
        The folder (including missing parents) is created if it does not exist yet.

        :param out_dir: Path to the target folder the csvs will be saved.
        :param to_csv_kwargs: kwargs to be passed to DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
                        This method uses different default values than DataFrame.to_csv() for the following parameters:
                        index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
        """
        target_dir = Path(out_dir)
        logger.info("Saving evaluation results to %s", target_dir)
        if not target_dir.exists():
            target_dir.mkdir(parents=True)

        # user supplied kwargs take precedence over our defaults
        effective_kwargs = {
            "index": False,
            "quoting": csv.QUOTE_NONNUMERIC,  # avoids problems with \r chars in texts by enclosing all string values in quotes
            **to_csv_kwargs,
        }
        for node_name, node_df in self.node_results.items():
            node_df.to_csv(target_dir / f"{node_name}.csv", **effective_kwargs)

    @classmethod
    def load(cls, load_dir: Union[str, Path], **read_csv_kwargs):
        """
        Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.

        :param load_dir: The directory containing the csv files.
        :param read_csv_kwargs: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
                                This method uses different default values than pd.read_csv() for the following parameters:
                                header=0, converters=CONVERTERS
                                where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
        """
        load_dir = load_dir if isinstance(load_dir, Path) else Path(load_dir)
        csv_files = [file for file in load_dir.iterdir() if file.is_file() and file.suffix == ".csv"]
        cols_to_convert = [
            "filters",
            "gold_document_ids",
            "gold_custom_document_ids",
            "gold_contexts",
            "gold_answers",
            "gold_documents_id_match",
            "gold_offsets_in_documents",
            "gold_offsets_in_contexts",
            "gold_answers_exact_match",
            "gold_answers_f1",
            "gold_answers_sas",
            "gold_answers_match",
            "gold_contexts_similarity",
            "offsets_in_document",
            "offsets_in_context",
            "document_ids",
            "custom_document_ids",
            "gold_document_contents",
        ]

        def safe_literal_eval(x: str) -> Any:
            if x == "":
                return None
            return ast.literal_eval(x)

        converters = dict.fromkeys(cols_to_convert, safe_literal_eval)
        default_read_csv_kwargs = {"converters": converters, "header": 0}
        read_csv_kwargs = {**default_read_csv_kwargs, **read_csv_kwargs}
        node_results = {file.stem: pd.read_csv(file, **read_csv_kwargs) for file in csv_files}
        # backward compatibility mappings
        for df in node_results.values():
            df.replace(to_replace=np.nan, value=None, inplace=True)
            df.rename(columns={"gold_document_contents": "gold_contexts", "content": "context"}, inplace=True)
            # convert single document_id to list
            if "answer" in df.columns and "document_id" in df.columns and not "document_ids" in df.columns:
                df["document_ids"] = df["document_id"].apply(lambda x: [x] if x not in [None, "None"] else [])
                df.drop(columns=["document_id"], inplace=True)
            if (
                "answer" in df.columns
                and "custom_document_id" in df.columns
                and not "custom_document_ids" in df.columns
            ):
                df["custom_document_ids"] = df["custom_document_id"].apply(
                    lambda x: [x] if x not in [None, "None"] else []
                )
                df.drop(columns=["custom_document_id"], inplace=True)
        result = cls(node_results)
        return result