haystack/haystack/schema.py

from __future__ import annotations
import csv
import hashlib
import inspect
from typing import Any, Optional, Dict, List, Union, Literal
from pathlib import Path
from uuid import uuid4
import logging
import time
import json
import ast
from dataclasses import asdict
import numpy as np
from numpy import ndarray
import pandas as pd
from pandas import DataFrame
from pydantic import BaseConfig, Field
from pydantic.json import pydantic_encoder
# We are using Pydantic dataclasses instead of vanilla Python's
# See #1598 for the reasons behind this choice & performance considerations
from pydantic.dataclasses import dataclass
from haystack.mmh3 import hash128
logger = logging.getLogger(__name__)
BaseConfig.arbitrary_types_allowed = True
#: Supported content types
ContentTypes = Literal["text", "table", "image", "audio"]
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
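# An illustrative (assumed) filter value matching FilterType:
#   {"name": ["file_a.txt", "file_b.txt"], "year": 2021}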
@dataclass
class Document:
id: str
content: Union[str, DataFrame]
content_type: ContentTypes = Field(default="text")
meta: Dict[str, Any] = Field(default={})
id_hash_keys: List[str] = Field(default=["content"])
score: Optional[float] = None
embedding: Optional[ndarray] = None
# We use a custom init here as we want some custom logic. The annotations above are however still needed in order
# to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
# They also help in annotating which object attributes will always be present (e.g. "id") even though they
# don't need to be passed by the user in init and are rather initialized automatically in the init.
def __init__(
self,
content: Union[str, DataFrame],
content_type: ContentTypes = "text",
id: Optional[str] = None,
score: Optional[float] = None,
meta: Optional[Dict[str, Any]] = None,
embedding: Optional[ndarray] = None,
id_hash_keys: Optional[List[str]] = None,
):
"""
One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
many other places that manipulate or interact with document-level data.
Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
into smaller passages. We'll have one Document per passage in this case.
Each document has a unique ID. This can be supplied by the user or generated automatically.
It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels).
There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`.
:param content: Content of the document. For most cases, this will be text, but it can be a table or image.
:param content_type: One of "text", "table", "image" or "audio". Haystack components can use this to adjust their
handling of Documents and check compatibility.
:param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
:param score: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker).
If the model's `scale_score` was set to True (default), the score is in the unit interval (range of [0,1]), where 1 means extremely relevant.
:param meta: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
:param embedding: Vector encoding of the text
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes.
To ensure you don't have duplicate documents in your DocumentStore if texts are
not unique, modify the metadata and pass, for example, "meta" to this field (example: ["content", "meta"]).
In this case, the id is generated by using the content and the defined metadata.
If you specify a custom ID for the `id` parameter, the `id_hash_keys` parameter is
ignored and the custom ID is used.
Note that you can even use nested fields of `meta` as id_hash_keys. For example, if you
have a key in `meta` called `url` and you want to use it as part of the id, you can pass
this parameter as `["meta.url"]`. Haystack supports a maximum depth of 1: if you
use `meta.url.path`, it looks for a literal `url.path` key in the `meta` dict (that is, `meta['url.path']`).
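Example (a minimal sketch; the `url` meta key is only an illustration):
```python
doc = Document(
    content="some text",
    meta={"url": "https://example.com/article"},
    id_hash_keys=["content", "meta.url"],
)
# doc.id is now derived from both the content and meta["url"]
```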
"""
if content is None:
raise ValueError("Can't create 'Document': Mandatory 'content' field is None")
self.content = content
self.content_type = content_type
self.score = score
self.meta = meta or {}
allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding"]
if id_hash_keys is not None and not all(
key in allowed_hash_key_attributes or key.startswith("meta.") for key in id_hash_keys
):
raise ValueError(
f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a "
f"list of Document's attribute names (like {', '.join(allowed_hash_key_attributes)}) or "
f"a key of meta with a maximum depth of 1 (like meta.url). "
"See [Custom id hashing on documentstore level](https://github.com/deepset-ai/haystack/pull/1910) and "
"[Allow more flexible Document id hashing](https://github.com/deepset-ai/haystack/issues/4317) for details"
)
# We store id_hash_keys to be able to clone documents, for example when splitting them during pre-processing
self.id_hash_keys = id_hash_keys or ["content"]
if embedding is not None:
embedding = np.asarray(embedding)
self.embedding = embedding
# Create a unique ID (either new one, or one from user input)
if id is not None:
self.id: str = str(id)
else:
self.id: str = self._get_id(id_hash_keys=id_hash_keys)
def _get_id(self, id_hash_keys: Optional[List[str]] = None):
"""
Generate the id of a document by hashing a string representation of selected attributes. By default, the
content of the document is used to generate the hash. You can change which attributes (including nested
`meta` keys) go into the hash by passing `id_hash_keys`.
:param id_hash_keys: Optional list of fields that should be dynamically used to generate the hash.
"""
if id_hash_keys is None:
return "{:02x}".format(hash128(str(self.content)))
final_hash_key = ""
for attr in id_hash_keys:
if attr.startswith("meta."):
meta_key = attr.split(".", maxsplit=1)[1]
if meta_key in self.meta:
final_hash_key += ":" + str(self.meta[meta_key])
else:
final_hash_key += ":" + str(getattr(self, attr))
if final_hash_key == "":
raise ValueError(
"Can't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta'] or be set to None."
)
return "{:02x}".format(hash128(final_hash_key))
def to_dict(self, field_map: Optional[Dict[str, Any]] = None) -> Dict:
"""
Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format in which
they are serialized / stored in other places (e.g. Elasticsearch).
Example:
```python
doc = Document(content="some text", content_type="text")
doc.to_dict(field_map={"custom_content_field": "content"})
# Returns {"custom_content_field": "some text", "content_type": "text"}
```
:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
:return: dict with content of the Document
"""
if not field_map:
field_map = {}
inv_field_map = {v: k for k, v in field_map.items()}
_doc: Dict[str, str] = {}
for k, v in self.__dict__.items():
# Exclude internal (Pydantic, ...) fields from the conversion process
if k.startswith("__"):
continue
# Convert pd.DataFrame to list of rows for serialization
if k == "content" and self.content_type == "table" and isinstance(self.content, DataFrame):
v = dataframe_to_list(self.content)
k = k if k not in inv_field_map else inv_field_map[k]
_doc[k] = v
return _doc
@classmethod
def from_dict(cls, dict: Dict[str, Any], field_map: Optional[Dict[str, Any]] = None) -> Document:
"""
Create Document from dict. An optional `field_map` parameter can be supplied to adjust for custom names of the keys in the
input dict. This way you can work with standardized Document objects in Haystack, but adjust the format in which
they are serialized / stored in other places (e.g. Elasticsearch).
Example:
```python
my_dict = {"custom_content_field": "some text", "content_type": "text"}
Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
```
:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
:return: A Document object
"""
if not field_map:
field_map = {}
_doc = dict.copy()
init_args = ["content", "content_type", "id", "score", "id_hash_keys", "question", "meta", "embedding"]
if "meta" not in _doc.keys():
_doc["meta"] = {}
# copy additional fields into "meta"
for k, v in _doc.items():
# Exclude internal (Pydantic, ...) fields from the conversion process
if k.startswith("__"):
continue
if k not in init_args and k not in field_map:
_doc["meta"][k] = v
# remove additional fields from top level
_new_doc = {}
for k, v in _doc.items():
if k in init_args:
_new_doc[k] = v
elif k in field_map:
k = field_map[k]
_new_doc[k] = v
# Convert list of rows to DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
_new_doc["content"] = dataframe_from_list(_new_doc["content"])
return cls(**_new_doc)
def to_json(self, field_map: Optional[Dict[str, Any]] = None) -> str:
if not field_map:
field_map = {}
dictionary = self.to_dict(field_map=field_map)
return json.dumps(dictionary, cls=NumpyEncoder)
@classmethod
def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document:
if not field_map:
field_map = {}
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data, field_map=field_map)
def __eq__(self, other):
content = getattr(other, "content", None)
if isinstance(content, pd.DataFrame):
is_content_equal = content.equals(self.content)
else:
is_content_equal = content == self.content
return (
isinstance(other, self.__class__)
and is_content_equal
and getattr(other, "content_type", None) == self.content_type
and getattr(other, "id", None) == self.id
and getattr(other, "id_hash_keys", None) == self.id_hash_keys
and getattr(other, "score", None) == self.score
and getattr(other, "meta", None) == self.meta
and np.array_equal(getattr(other, "embedding", None), self.embedding)
)
def __repr__(self):
doc_dict = self.to_dict()
embedding = doc_dict.get("embedding", None)
if embedding is not None:
doc_dict["embedding"] = f"<embedding of shape {getattr(embedding, 'shape', '[no shape]')}>"
return f"<Document: {str(doc_dict)}>"
def __str__(self):
# In some cases, self.content is None (therefore not subscriptable)
if self.content is None:
return f"<Document: id={self.id}, content=None>"
return f"<Document: id={self.id}, content='{self.content[:100]}{'...' if len(self.content) > 100 else ''}'>"
def __lt__(self, other):
"""Enable sorting of Documents by score"""
return self.score < other.score
@dataclass
class Span:
start: int
end: int
"""
Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
For extractive QA: Character where answer starts/ends
:param start: Position where the span starts
:param end: Position where the span ends
"""
def __contains__(self, value):
"""
Checks for inclusion of the given value into the interval defined by Span.
```
assert 10 in Span(5, 15) # True
assert 20 in Span(1, 15) # False
```
Includes the left edge, but not the right edge.
```
assert 5 in Span(5, 15) # True
assert 15 in Span(5, 15) # False
```
Works for numbers and all values that can be safely converted into floats.
```
assert 10.0 in Span(5, 15) # True
assert "10" in Span(5, 15) # True
```
It also works for Span objects, returning True only if the given
Span is fully contained into the original Span.
As for numerical values, the left edge is included, the right edge is not.
```
assert Span(10, 11) in Span(5, 15) # True
assert Span(5, 10) in Span(5, 15) # True
assert Span(10, 15) in Span(5, 15) # False
assert Span(5, 15) in Span(5, 15) # False
assert Span(5, 14) in Span(5, 15) # True
assert Span(0, 1) in Span(5, 15) # False
assert Span(0, 10) in Span(5, 15) # False
assert Span(10, 20) in Span(5, 15) # False
```
"""
if isinstance(value, Span):
return self.start <= value.start and self.end > value.end
try:
value = float(value)
return self.start <= value < self.end
except Exception as e:
raise ValueError(
f"Cannot use 'in' with a value of type {type(value)}. Use numeric values or Span objects."
) from e
@dataclass
class TableCell:
row: int
col: int
"""
Defining a table cell via the row and column index.
:param row: Row index of the cell
:param col: Column index of the cell
"""
@dataclass
class Answer:
answer: str
type: Literal["generative", "extractive", "other"] = "extractive"
score: Optional[float] = None
context: Optional[Union[str, DataFrame]] = None
offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None
offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None
document_ids: Optional[List[str]] = None
meta: Optional[Dict[str, Any]] = None
"""
The fundamental object in Haystack to represent any type of Answer (e.g. from extractive QA, generative QA or TableQA).
For example, it's used within some Nodes like the Reader, but also in the REST API.
:param answer: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string.
:param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
(i.e. we can locate an exact answer string in one of the documents) or from a generative model
(i.e. no pointer to a specific document, no offsets ...).
:param score: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
In the range of [0,1], where 1 means extremely relevant.
:param context: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
:param offsets_in_document: List of `Span` objects with start and end positions of the answer **in the
document** (as stored in the document store).
For extractive QA: Character where the answer starts => `Answer.offsets_in_document[0].start`
For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start`
(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Span` objects here)
:param offsets_in_context: List of `Span` objects with start and end positions of the answer **in the
context** (i.e. the surrounding text/table of a certain window size).
For extractive QA: Character where the answer starts => `Answer.offsets_in_context[0].start`
For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start`
(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Span` objects here)
:param document_ids: IDs of the documents the answer came from (if any).
For extractive QA, this will be a list of length 1.
For generative QA, this will be a list of length > 0.
:param meta: Dict that can be used to associate any kind of custom meta data with the answer.
In extractive QA, this will carry the meta data of the document where the answer was found.
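Example (a sketch of a typical extractive answer; all values are illustrative):
```python
answer = Answer(
    answer="Berlin",
    type="extractive",
    score=0.87,
    context="Berlin is the capital of Germany.",
    offsets_in_context=[Span(start=0, end=6)],
    offsets_in_document=[Span(start=142, end=148)],
    document_ids=["doc-1"],
)
```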
"""
def __post_init__(self):
# In case offsets are passed as dicts rather than Span or TableCell objects we convert them here
# For example, this is used when instantiating an object via from_json()
if self.offsets_in_document is not None:
self.offsets_in_document = self._from_dict_offsets(self.offsets_in_document)
if self.offsets_in_context is not None:
self.offsets_in_context = self._from_dict_offsets(self.offsets_in_context)
if self.meta is None:
self.meta = {}
# In case the context is a list of lists for a table document that is instantiated by from_json() or from_dict()
if isinstance(self.context, list):
self.context = dataframe_from_list(self.context)
def __lt__(self, other):
"""Enable sorting of Answers by score"""
return self.score < other.score
def __str__(self):
# self.context might be None (therefore not subscriptable)
if self.context is None:
return f"<Answer: answer='{self.answer}', score={self.score}, context=None>"
return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
def __repr__(self):
return f"<Answer {self.to_dict()}>"
def to_dict(self) -> Dict:
return asdict(self, dict_factory=_dict_factory)
@classmethod
def from_dict(cls, dict: Dict) -> Answer:
# backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]`
if "document_id" in dict:
dict = dict.copy()
document_id = dict.pop("document_id")
dict["document_ids"] = [document_id] if document_id is not None else None
return cls(**dict)
def to_json(self):
return json.dumps(self.to_dict(), cls=NumpyEncoder)
@classmethod
def from_json(cls, data: Union[str, Dict[str, Any]]):
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data)
@staticmethod
def _from_dict_offsets(offsets):
converted_offsets = []
for e in offsets:
if isinstance(e, dict):
if "row" in e: # is a TableCell
converted_offsets.append(TableCell(**e))
else:
converted_offsets.append(Span(**e))
else:
converted_offsets.append(e)
return converted_offsets
def __eq__(self, other):
context = getattr(other, "context", None)
if isinstance(context, pd.DataFrame):
is_content_equal = context.equals(self.context)
else:
is_content_equal = context == self.context
return (
isinstance(other, self.__class__)
and is_content_equal
and getattr(other, "answer", None) == self.answer
and getattr(other, "type", None) == self.type
and getattr(other, "score", None) == self.score
and getattr(other, "offsets_in_document", None) == self.offsets_in_document
and getattr(other, "offsets_in_context", None) == self.offsets_in_context
and getattr(other, "document_ids", None) == self.document_ids
and getattr(other, "meta", None) == self.meta
)
@dataclass
class Label:
id: str
query: str
document: Document
is_correct_answer: bool
is_correct_document: bool
origin: Literal["user-feedback", "gold-label"]
answer: Optional[Answer] = None
pipeline_id: Optional[str] = None
created_at: Optional[str] = None
updated_at: Optional[str] = None
meta: Optional[dict] = None
# Note that filters cannot be of type Optional[FilterType] as assignments like `filters = {"name": "file_name"}`
# won't work due to Dict's covariance. See https://github.com/python/mypy/issues/9418.
filters: Optional[Dict[str, Any]] = None
# We use a custom init here as we want some custom logic. The annotations above are however still needed in order
# to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
def __init__(
self,
query: str,
document: Document,
is_correct_answer: bool,
is_correct_document: bool,
origin: Literal["user-feedback", "gold-label"],
answer: Optional[Answer],
id: Optional[str] = None,
pipeline_id: Optional[str] = None,
created_at: Optional[str] = None,
updated_at: Optional[str] = None,
meta: Optional[dict] = None,
filters: Optional[Dict[str, Any]] = None,
):
"""
Object used to represent label/feedback in a standardized way within Haystack.
This includes labels from datasets like SQuAD, annotations from labeling tools,
or user feedback from the Haystack REST API.
:param query: the question (or query) for finding answers.
:param document: the Document this label refers to (for example, the document expected to contain the answer).
:param answer: the answer object.
:param is_correct_answer: whether the sample is positive or negative.
:param is_correct_document: in case of a negative sample (is_correct_answer is False), there are two possibilities:
an incorrect answer but a correct document, or an incorrect document. This flag denotes whether
the returned document was correct.
:param origin: the source for the labels. It can be used later for filtering.
:param id: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically.
:param pipeline_id: pipeline identifier (any str) that was involved for generating this label (in-case of user feedback).
:param created_at: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
:param updated_at: Timestamp of the last update with format yyyy-MM-dd HH:mm:ss.
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
:param meta: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed).
:param filters: filters that should be applied to the query to rule out non-relevant documents. Use them, for example, if there are different correct answers
in a DocumentStore depending on the retrieved document, and the answer in this label is correct only under these filters.
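Example (a sketch of a gold label; query, document, and answer are illustrative):
```python
label = Label(
    query="What is the capital of Germany?",
    document=Document(content="Berlin is the capital of Germany."),
    answer=Answer(answer="Berlin", type="extractive"),
    is_correct_answer=True,
    is_correct_document=True,
    origin="gold-label",
)
```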
"""
# Create a unique ID (either new one, or one from user input)
if id:
self.id = str(id)
else:
self.id = str(uuid4())
if created_at is None:
created_at = time.strftime("%Y-%m-%d %H:%M:%S")
self.created_at = created_at
self.updated_at = updated_at
self.query = query
self.answer = answer
self.document = document
self.is_correct_answer = is_correct_answer
self.is_correct_document = is_correct_document
self.origin = origin
# TODO autofill answer.document_id if Document is provided
self.pipeline_id = pipeline_id
if not meta:
self.meta = {}
else:
self.meta = meta
self.filters = filters
@property
def no_answer(self) -> Optional[bool]:
no_answer = None
if self.answer is not None:
no_answer = self.answer.answer is None or self.answer.answer.strip() == ""
return no_answer
def to_dict(self):
return asdict(self, dict_factory=_dict_factory)
@classmethod
def from_dict(cls, dict: Dict):
answer = dict.get("answer")
if answer and isinstance(answer, Dict):
dict["answer"] = Answer.from_dict(dict["answer"])
doc = dict.get("document")
if isinstance(doc, Dict):
dict["document"] = Document.from_dict(dict["document"])
return cls(**dict)
def to_json(self):
return json.dumps(self.to_dict(), cls=NumpyEncoder)
@classmethod
def from_json(cls, data: Union[str, Dict[str, Any]]):
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data)
# define __eq__ and __hash__ functions to deduplicate Label Objects
def __eq__(self, other):
return (
isinstance(other, self.__class__)
and getattr(other, "query", None) == self.query
and getattr(other, "answer", None) == self.answer
and getattr(other, "is_correct_answer", None) == self.is_correct_answer
and getattr(other, "is_correct_document", None) == self.is_correct_document
and getattr(other, "origin", None) == self.origin
and getattr(other, "document", None) == self.document
and getattr(other, "no_answer", None) == self.no_answer
and getattr(other, "pipeline_id", None) == self.pipeline_id
)
def __hash__(self):
return hash(
self.query
+ str(self.answer)
+ str(self.is_correct_answer)
+ str(self.is_correct_document)
+ str(self.origin)
+ str(self.document)
+ str(self.no_answer)
+ str(self.pipeline_id)
)
def __repr__(self):
return f"<Label: {self.to_dict()}>"
def __str__(self):
return f"<Label: {self.to_dict()}>"
def is_positive_label(label):
return (label.is_correct_answer and label.is_correct_document) or (
label.answer is None and label.is_correct_document
)
class MultiLabel:
def __init__(self, labels: List[Label], drop_negative_labels: bool = False, drop_no_answers: bool = False):
"""
There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
answers for one question or multiple documents contain the information you want for a query.
This class is "syntactic sugar" that simplifies the work with such a list of related Labels.
It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are
automatically created at init time. For example, MultiLabel.no_answer tells you whether all underlying Labels
are no-answer labels, or whether at least one of them provides a text answer (and thus a possible answer exists).
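Example (a minimal sketch; `labels` is assumed to be a list of Label objects that share the same query and filters):
```python
multi_label = MultiLabel(labels=labels, drop_negative_labels=True)
print(multi_label.query, multi_label.answers, multi_label.document_ids)
```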
:param labels: A list of labels that belong to a similar query and shall be "grouped" together
:param drop_negative_labels: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI)
:param drop_no_answers: Whether to drop labels that specify the answer is impossible
"""
# drop duplicate labels and remove negative labels if needed.
labels = list(dict.fromkeys(labels))
if drop_negative_labels:
labels = [l for l in labels if is_positive_label(l)]
if drop_no_answers:
labels = [l for l in labels if l.no_answer is False]
self._labels = labels
self._query = self._aggregate_labels(key="query", must_be_single_value=True)[0]
self._filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0]
self.id = hashlib.md5((self.query + json.dumps(self.filters, sort_keys=True)).encode()).hexdigest()
# Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let
# users decide which aggregation logic they want
self._no_answer = all(l.no_answer for l in self._labels)
# Answer strings and offsets cleaned for no_answers:
# If there are only no_answers, offsets are empty and answers will be a single empty string
# which equals the no_answers representation of reader nodes.
if self._no_answer:
self._answers = [""]
self._offsets_in_documents: List[dict] = []
self._offsets_in_contexts: List[dict] = []
else:
answered = [l.answer for l in self._labels if not l.no_answer and l.answer is not None]
self._answers = [answer.answer for answer in answered]
self._offsets_in_documents = []
self._offsets_in_contexts = []
for answer in answered:
if answer.offsets_in_document is not None:
for span in answer.offsets_in_document:
self._offsets_in_documents.append(self._to_dict_offsets(span))
if answer.offsets_in_context is not None:
for span in answer.offsets_in_context:
self._offsets_in_contexts.append(self._to_dict_offsets(span))
# There are two options here to represent document_ids:
# taking the id from the document of each label or taking the document_id of each label's answer.
# We take the former as labels without answers are allowed.
#
# For no_answer cases document_store.add_eval_data() currently adds all documents coming from the SQuAD paragraph's context
# as separate no_answer labels, and thus with document.id but without answer.document_id.
# If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer.
# Hence, we exclude them here as well.
self._document_ids = [l.document.id for l in self._labels if not l.no_answer]
self._contexts = [str(l.document.content) for l in self._labels if not l.no_answer]
@staticmethod
def _to_dict_offsets(offset: Union[Span, TableCell]) -> Dict:
if isinstance(offset, TableCell):
return {"row": offset.row, "col": offset.col}
else:
return {"start": offset.start, "end": offset.end}
@property
def labels(self):
return self._labels
@property
def query(self):
return self._query
@property
def filters(self):
return self._filters
@property
def document_ids(self):
return self._document_ids
@property
def contexts(self):
return self._contexts
@property
def no_answer(self):
return self._no_answer
@property
def answers(self):
return self._answers
@property
def offsets_in_documents(self):
return self._offsets_in_documents
@property
def offsets_in_contexts(self):
return self._offsets_in_contexts
def _aggregate_labels(self, key, must_be_single_value=True) -> List[Any]:
if any(isinstance(getattr(l, key), dict) for l in self.labels):
# dict is not hashable so we collect unique filters via looping through all labels
unique_values = []
for l in self.labels:
if l.filters not in unique_values:
unique_values.append(l.filters)
else:
unique_values = list({getattr(l, key) for l in self.labels})
if must_be_single_value and len(unique_values) > 1:
raise ValueError(
f"Tried to combine attribute '{key}' of Labels, but found multiple different values: {unique_values}"
)
return unique_values
def to_dict(self):
# convert internal attribute names to property names
result = {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()}
# convert Label object to dict
result["labels"] = [label.to_dict() for label in result["labels"]]
return result
@classmethod
def from_dict(cls, dict: Dict):
# exclude extra arguments
inputs = {k: v for k, v in dict.items() if k in inspect.signature(cls).parameters}
inputs["labels"] = [Label.from_dict(label) for label in inputs["labels"]]
return cls(**inputs)
def to_json(self):
return json.dumps(self.to_dict(), default=pydantic_encoder)
@classmethod
def from_json(cls, data: Union[str, Dict[str, Any]]):
if isinstance(data, str):
dict_data = json.loads(data)
else:
dict_data = data
return cls.from_dict(dict_data)
def __eq__(self, other):
return isinstance(other, self.__class__) and self.labels == other.labels
def __repr__(self):
return f"<MultiLabel: {self.to_dict()}>"
def __str__(self):
return f"<MultiLabel: {self.to_dict()}>"
def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any:
"""
Constructs a pydantic dataclass from a dict incl. other nested dataclasses.
This allows simple de-serialization of pydantic dataclasses from json.
:param dict: Dict containing all attributes and values for the dataclass.
:param pydantic_dataclass_type: The class of the dataclass that should be constructed (e.g. Document)
"""
base_model = pydantic_dataclass_type.__pydantic_model__.parse_obj(dict)
base_model_fields = base_model.__fields__
values = {}
for base_model_field_name in base_model_fields.keys():
value = getattr(base_model, base_model_field_name)
values[base_model_field_name] = value
dataclass_object = pydantic_dataclass_type(**values)
return dataclass_object
def _dict_factory(data):
"""Meant to be as the dict_factory for `asdict`. This function is called within `asdict` to convert a list of tuples
into a dictionary object. This handles the conversion of pandas Dataframes into a list of lists.
:param data: list of (key, value) pairs
"""
def convert_value(v):
if isinstance(v, pd.DataFrame):
return dataframe_to_list(v)
return v
return {k: convert_value(v) for k, v in data}
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.ndarray):
return obj.tolist()
return json.JSONEncoder.default(self, obj)
def dataframe_to_list(df: pd.DataFrame) -> List[List]:
return [df.columns.tolist()] + df.values.tolist()
def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
return pd.DataFrame(columns=list_df[0], data=list_df[1:])
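# A round-trip sketch (illustrative): dataframe_to_list keeps the column names as the first row,
# so dataframe_from_list can rebuild an equivalent DataFrame from that list of lists:
#   table = pd.DataFrame(columns=["city", "population"], data=[["Berlin", 3_700_000]])
#   rows = dataframe_to_list(table)      # [["city", "population"], ["Berlin", 3700000]]
#   rebuilt = dataframe_from_list(rows)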
class EvaluationResult:
def __init__(self, node_results: Optional[Dict[str, DataFrame]] = None) -> None:
"""
A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
convenience methods to work with them.
For example, you can calculate eval metrics, get detailed reports, or simulate different top_k settings:
```python
eval_results = pipeline.eval(...)
# derive detailed metrics
eval_results.calculate_metrics()
# show summary of incorrect queries
eval_results.wrong_examples()
```
Each row of the underlying DataFrames contains either an answer or a document that has been retrieved during evaluation.
Rows are enriched with basic information like rank, query, type, or node.
Additional answer or document-specific evaluation information, like gold labels
and metrics showing whether the row matches the gold labels, are included, too.
The DataFrames have the following schema:
- multilabel_id: The ID of the multilabel, which is unique for the pair of query and filters.
- query: The actual query string.
- filters: The filters used with the query.
- gold_answers (answers only): The expected answers.
- answer (answers only): The actual answer.
- context: The content of the document (the surrounding context of the answer for QA).
- exact_match (answers only): A metric showing if the answer exactly matches the gold label.
- f1 (answers only): A metric showing how well the answer overlaps with the gold label on a token basis.
- sas (answers only, optional): A metric showing how well the answer matches the gold label on a semantic basis.
- exact_match_context_scope (answers only): exact_match with enforced context match.
- f1_context_scope (answers only): f1 with enforced context scope match.
- sas_context_scope (answers only): sas with enforced context scope match.
- exact_match_document_scope (answers only): exact_match with enforced document scope match.
- f1_document_scope (answers only): f1 with enforced document scope match.
- sas_document_scope (answers only): sas with enforced document scope match.
- exact_match_document_id_and_context_scope (answers only): exact_match with enforced document and context scope match.
- f1_document_id_and_context_scope (answers only): f1 with enforced document and context scope match.
- sas_document_id_and_context_scope (answers only): sas with enforced document and context scope match.
- gold_contexts: The contents of the gold documents.
- gold_id_match (documents only): A metric showing whether one of the gold document IDs matches the document.
- context_match (documents only): A metric showing whether one of the gold contexts matches the document content.
- answer_match (documents only): A metric showing whether the document contains the answer.
- gold_id_or_answer_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'answer_match'`.
- gold_id_and_answer_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'answer_match'`.
- gold_id_or_context_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'context_match'`.
- gold_id_and_context_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'context_match'`.
- gold_id_and_context_and_answer_match (documents only): A Boolean operation specifying that there should be `'gold_id_match' AND 'context_match' AND 'answer_match'`.
- context_and_answer_match (documents only): A Boolean operation specifying that there should be both `'context_match' AND 'answer_match'`.
- rank: A rank or 1-based-position in the result list.
- document_id: The ID of the document that has been retrieved or that contained the answer.
- gold_document_ids: The IDs of the documents to be retrieved.
- custom_document_id: The custom ID of the document (specified by `custom_document_id_field`) that has been retrieved or that contained the answer.
- gold_custom_document_ids: The custom document IDs (specified by `custom_document_id_field`) to be retrieved.
- offsets_in_document (answers only): The position or offsets within the document where the answer was found.
- gold_offsets_in_documents (answers only): The position or offsets of the gold answer within the document.
- gold_answers_exact_match (answers only): exact_match values per gold_answer.
- gold_answers_f1 (answers only): f1 values per gold_answer.
- gold_answers_sas (answers only): sas values per gold answer.
- gold_documents_id_match: The document ID match per gold label (if `custom_document_id_field` has been specified, custom IDs are used).
- gold_contexts_similarity: Context similarity per gold label.
- gold_answers_match (documents only): Specifies whether the document contains an answer per gold label.
- type: Possible values: 'answer' or 'document'.
- node: The node name
- eval_mode: Specifies whether the evaluation was executed in integrated or isolated mode.
Check pipeline.eval()'s add_isolated_node_eval parameter for more information.
:param node_results: The evaluation Dataframes per pipeline node.
"""
self.node_results: Dict[str, DataFrame] = {} if node_results is None else node_results
def __getitem__(self, key: str):
return self.node_results.__getitem__(key)
def __delitem__(self, key: str):
self.node_results.__delitem__(key)
def __setitem__(self, key: str, value: DataFrame):
self.node_results.__setitem__(key, value)
def __contains__(self, key: str):
return self.node_results.keys().__contains__(key)
def __len__(self):
return self.node_results.__len__()
def append(self, key: str, value: DataFrame):
if value is not None and len(value) > 0:
if key in self.node_results:
self.node_results[key] = pd.concat([self.node_results[key], value]).reset_index(drop=True)
else:
self.node_results[key] = value
def calculate_metrics(
self,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer",
eval_mode: Literal["integrated", "isolated"] = "integrated",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
) -> Dict[str, Dict[str, float]]:
"""
Calculates proper metrics for each node.
For Nodes that return Documents, the default metrics are:
- mrr (`Mean Reciprocal Rank <https://en.wikipedia.org/wiki/Mean_reciprocal_rank>`_)
- map (`Mean Average Precision <https://en.wikipedia.org/wiki/Evaluation_measures_%28information_retrieval%29#Mean_average_precision>`_)
- ndcg (`Normalized Discounted Cumulative Gain <https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_)
- precision (Precision: How many of the returned documents were relevant?)
- recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?)
- recall_single_hit (Recall for Question Answering: How many of the queries returned at least one relevant document?)
For Nodes that return answers, the default metrics are:
- exact_match (How many of the queries returned the exact answer?)
- f1 (How well do the returned results overlap with any gold answer on a token basis?)
- sas, if a SAS model has been provided when calling `pipeline.eval()` (How semantically similar is the prediction to the gold answers?)
During the eval run, you can simulate lower top_k values for Reader and Retriever than the actual values.
For example, you can calculate `top_1_f1` for Reader nodes by setting `simulated_top_k_reader=1`.
If you applied `simulated_top_k_retriever` to a Reader node, treat the results with caution as they can differ heavily from an actual eval run with the corresponding `top_k_retriever` value.
:param simulated_top_k_reader: Simulates the `top_k` parameter of the Reader.
:param simulated_top_k_retriever: Simulates the `top_k` parameter of the Retriever.
Note: There might be a discrepancy between simulated Reader metrics and an actual Pipeline run with Retriever `top_k`.
:param eval_mode: The input the Node was evaluated on.
Usually a Node gets evaluated on the prediction provided by its predecessor Nodes in the Pipeline (`value='integrated'`).
However, as the quality of the Node can heavily depend on the Node's input and thus the predecessor's quality,
you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your Node.
For example, when evaluating the Reader, use `value='isolated'` to simulate a perfect Retriever in an ExtractiveQAPipeline.
Possible values are: `integrated`, `isolated`.
The default value is `integrated`.
:param document_scope: A criterion for deciding whether documents are relevant or not.
You can select between:
- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
A typical use case is Document Retrieval.
- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
A typical use case is Document-Independent Passage Retrieval.
- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
A typical use case is Document-Specific Passage Retrieval.
- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
A typical use case is Document Retrieval having sparse context labels.
- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
A typical use case is Question Answering.
- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
This is intended to be a proper default value in order to support both main use cases:
- Document Retrieval
- Question Answering
The default value is 'document_id_or_answer'.
:param answer_scope: Specifies the scope in which a matching answer is considered correct.
You can select between:
- 'any' (default): Any matching answer is considered correct.
- 'context': The answer is only considered correct if its context matches as well.
Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
- 'document_id': The answer is only considered correct if its document ID matches as well.
You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
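Example (a sketch; `eval_result` is assumed to come from a previous `pipeline.eval()` call and the node names are illustrative):
```python
metrics = eval_result.calculate_metrics(simulated_top_k_retriever=5, answer_scope="context")
retriever_recall = metrics["Retriever"]["recall_single_hit"]
reader_f1 = metrics["Reader"]["f1"]
```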
"""
return {
node: self._calculate_node_metrics(
df,
simulated_top_k_reader=simulated_top_k_reader,
simulated_top_k_retriever=simulated_top_k_retriever,
document_scope=document_scope,
answer_scope=answer_scope,
eval_mode=eval_mode,
)
for node, df in self.node_results.items()
}
def wrong_examples(
self,
node: str,
n: int = 3,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer",
document_metric: str = "recall_single_hit",
answer_metric: str = "f1",
document_metric_threshold: float = 0.5,
answer_metric_threshold: float = 0.5,
eval_mode: Literal["integrated", "isolated"] = "integrated",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
) -> List[Dict]:
"""
Returns the worst performing queries.
Worst performing queries are identified based on either a document metric or an answer metric,
depending on the node type.
You can simulate lower top_k values for the Reader and Retriever than the ones used in the actual eval run.
See calculate_metrics() for more information.
:param simulated_top_k_reader: simulates top_k param of reader
:param simulated_top_k_retriever: simulates top_k param of retriever.
remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k
:param document_metric: the document metric worst queries are calculated with.
values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision'
:param answer_metric: the answer metric worst queries are calculated with.
values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model.
:param document_metric_threshold: the threshold for the document metric (only samples below selected metric
threshold will be considered)
:param answer_metric_threshold: the threshold for the answer metric (only samples below selected metric
threshold will be considered)
:param eval_mode: the input on which the node was evaluated.
Usually nodes get evaluated on the predictions provided by their predecessor nodes in the pipeline (value='integrated').
However, as the quality of the node itself can heavily depend on the node's input and thus the predecessor's quality,
you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your node.
For example when evaluating the reader use value='isolated' to simulate a perfect retriever in an ExtractiveQAPipeline.
Values can be 'integrated', 'isolated'.
Default value is 'integrated'.
:param document_scope: A criterion for deciding whether documents are relevant or not.
You can select between:
- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
A typical use case is Document Retrieval.
- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
A typical use case is Document-Independent Passage Retrieval.
- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
A typical use case is Document-Specific Passage Retrieval.
- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
A typical use case is Document Retrieval having sparse context labels.
- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
A typical use case is Question Answering.
- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
This is intended to be a proper default value in order to support both main use cases:
- Document Retrieval
- Question Answering
The default value is 'document_id_or_answer'.
:param answer_scope: Specifies the scope in which a matching answer is considered correct.
You can select between:
- 'any' (default): Any matching answer is considered correct.
- 'context': The answer is only considered correct if its context matches as well.
Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
- 'document_id': The answer is only considered correct if its document ID matches as well.
You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
The default value is 'any'.
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
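Example (a sketch; the node name "Reader" is illustrative):
```python
worst = eval_result.wrong_examples(node="Reader", n=3, answer_metric="f1", answer_metric_threshold=0.5)
for example in worst:
    print(example["query"], example["gold_answers"])
```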
"""
node_df = self.node_results[node]
node_df = self._filter_eval_mode(node_df, eval_mode)
answers = node_df[node_df["type"] == "answer"]
if len(answers) > 0:
metrics_df = self._build_answer_metrics_df(
answers,
simulated_top_k_reader=simulated_top_k_reader,
simulated_top_k_retriever=simulated_top_k_retriever,
answer_scope=answer_scope,
)
worst_df = metrics_df.sort_values(by=[answer_metric]).head(n)
wrong_examples = []
for multilabel_id, metrics in worst_df.iterrows():
query_answers = answers[answers["multilabel_id"] == multilabel_id]
if answer_metric not in metrics:
logger.warning(
"You specified an answer_metric=%s not available in calculated metrics=%s."
"Skipping collection of worst performing samples.",
answer_metric,
metrics.keys(),
)
break
if metrics[answer_metric] <= answer_metric_threshold:
query_dict = {
"multilabel_id": query_answers["multilabel_id"].iloc[0],
"query": query_answers["query"].iloc[0],
"filters": query_answers["filters"].iloc[0],
"metrics": metrics.to_dict(),
"answers": query_answers.drop(
["node", "query", "type", "gold_answers", "gold_offsets_in_documents", "gold_document_ids"],
axis=1,
).to_dict(orient="records"),
"gold_answers": query_answers["gold_answers"].iloc[0],
"gold_document_ids": query_answers["gold_document_ids"].iloc[0],
}
wrong_examples.append(query_dict)
return wrong_examples
documents = node_df[node_df["type"] == "document"]
if len(documents) > 0:
document_relevance_criterion = self._get_document_relevance_criterion(
document_scope=document_scope, answer_scope=answer_scope
)
metrics_df = self._build_document_metrics_df(
documents,
simulated_top_k_retriever=simulated_top_k_retriever,
document_relevance_criterion=document_relevance_criterion,
)
worst_df = metrics_df.sort_values(by=[document_metric]).head(n)
wrong_examples = []
for multilabel_id, metrics in worst_df.iterrows():
if document_metric not in metrics:
logger.warning(
"You specified a document_metric=%s not available in calculated metrics=%s."
"Skipping collection of worst performing samples.",
document_metric,
metrics.keys(),
)
break
if metrics[document_metric] <= document_metric_threshold:
query_documents = documents[documents["multilabel_id"] == multilabel_id]
query_dict = {
"multilabel_id": query_documents["multilabel_id"].iloc[0],
"query": query_documents["query"].iloc[0],
"filters": query_documents["filters"].iloc[0],
"metrics": metrics.to_dict(),
"documents": query_documents.drop(
["node", "query", "multilabel_id", "filters", "type", "gold_document_ids", "gold_contexts"],
axis=1,
).to_dict(orient="records"),
"gold_document_ids": query_documents["gold_document_ids"].iloc[0],
}
wrong_examples.append(query_dict)
return wrong_examples
return []
def _get_document_relevance_criterion(
self,
document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
) -> Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"context_and_answer",
"document_id_and_answer",
"document_id_and_context_and_answer",
"document_id_or_answer",
]:
"""
Combines document_scope and answer_scope to create the document_relevance_criterion.
"""
answer_scope_to_doc_relevance_crit = {
"context": "context_and_answer",
"document_id": "document_id_and_answer",
"document_id_and_context": "document_id_and_context_and_answer",
}
document_relevance_criterion: str = document_scope
if document_scope in ["answer", "document_id_or_answer"]:
document_relevance_criterion = answer_scope_to_doc_relevance_crit.get(answer_scope, document_scope)
elif answer_scope in answer_scope_to_doc_relevance_crit.keys():
logger.warning(
"You specified a non-answer document_scope together with a non-default answer_scope. "
"This may result in inconsistencies between answer and document metrics. "
"To enforce the same definition of correctness for both, document_scope must be one of 'answer', 'document_id_or_answer'."
)
return document_relevance_criterion # type: ignore[return-value]
def _calculate_node_metrics(
self,
df: DataFrame,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer",
eval_mode: str = "integrated",
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
) -> Dict[str, float]:
df = self._filter_eval_mode(df, eval_mode)
answer_metrics = self._calculate_answer_metrics(
df,
simulated_top_k_reader=simulated_top_k_reader,
simulated_top_k_retriever=simulated_top_k_retriever,
answer_scope=answer_scope,
)
document_relevance_criterion = self._get_document_relevance_criterion(
document_scope=document_scope, answer_scope=answer_scope
)
document_metrics = self._calculate_document_metrics(
df,
simulated_top_k_retriever=simulated_top_k_retriever,
document_relevance_criterion=document_relevance_criterion,
)
return {**answer_metrics, **document_metrics}
def _filter_eval_mode(self, df: DataFrame, eval_mode: str) -> DataFrame:
if "eval_mode" in df.columns:
df = df[df["eval_mode"] == eval_mode]
else:
logger.warning("eval dataframe has no eval_mode column. eval_mode param will be ignored.")
return df
def _calculate_answer_metrics(
self,
df: DataFrame,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
) -> Dict[str, float]:
answers = df[df["type"] == "answer"]
if len(answers) == 0:
return {}
metrics_df = self._build_answer_metrics_df(
answers,
simulated_top_k_reader=simulated_top_k_reader,
simulated_top_k_retriever=simulated_top_k_retriever,
answer_scope=answer_scope,
)
num_examples_for_eval = len(answers["multilabel_id"].unique())
result = {metric: metrics_df[metric].mean().tolist() for metric in metrics_df.columns}
result["num_examples_for_eval"] = float(num_examples_for_eval) # formatter requires float
return result
def _build_answer_metrics_df(
self,
answers: DataFrame,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
) -> DataFrame:
"""
Builds a dataframe containing answer metrics (columns) per multilabel (index).
Answer metrics are:
- exact_match (Did the query exactly return any gold answer? -> 1.0 or 0.0)
- f1 (How well does the best matching returned result overlap with any gold answer on a token basis?)
- sas if a SAS model has been provided during pipeline.eval() (How semantically similar is the prediction to the gold answers?)
"""
multilabel_ids = answers["multilabel_id"].unique()
# simulate top k retriever
if simulated_top_k_retriever != -1:
documents = self._get_documents_df()
top_k_documents = documents[documents["rank"] <= simulated_top_k_retriever]
simulated_answers = []
for multilabel_id in multilabel_ids:
top_k_document_ids = top_k_documents[top_k_documents["multilabel_id"] == multilabel_id][
"document_id"
].unique()
query_answers = answers[answers["multilabel_id"] == multilabel_id]
# consider only the answers within simulated_top_k_retriever documents
simulated_query_answers = query_answers[
query_answers["document_ids"].apply(
lambda document_ids, top_k_document_ids=top_k_document_ids: all(
document_id in top_k_document_ids for document_id in document_ids
)
)
]
# simulate top k reader
if simulated_top_k_reader != -1:
# consider only the simulated_top_k_reader answers within simulated_query_answers
simulated_query_answers = simulated_query_answers.nsmallest(simulated_top_k_reader, "rank")
simulated_query_answers["rank"] = np.arange(1, len(simulated_query_answers) + 1)
simulated_answers.append(simulated_query_answers)
answers = pd.concat(simulated_answers)
# simulate top k reader
elif simulated_top_k_reader != -1:
answers = answers[answers["rank"] <= simulated_top_k_reader]
# build metrics df
answer_metrics = ["exact_match", "f1", "sas"]
df_records = []
for multilabel_id in multilabel_ids:
query_df = answers[answers["multilabel_id"] == multilabel_id]
metric_to_scoped_col = {
metric: f"{metric}_{answer_scope}_scope" if answer_scope != "any" else metric
for metric in answer_metrics
if metric in query_df.columns
}
query_metrics = {
metric: query_df[col].max() if any(query_df) else 0.0 for metric, col in metric_to_scoped_col.items()
}
df_records.append(query_metrics)
metrics_df = DataFrame.from_records(df_records, index=multilabel_ids)
return metrics_df
def _get_documents_df(self):
document_dfs = [
node_df for node_df in self.node_results.values() if len(node_df[node_df["type"] == "document"]) > 0
]
if len(document_dfs) != 1:
raise ValueError("cannot detect retriever dataframe")
documents_df = document_dfs[0]
documents_df = documents_df[documents_df["type"] == "document"]
return documents_df
def _calculate_document_metrics(
self,
df: DataFrame,
simulated_top_k_retriever: int = -1,
document_relevance_criterion: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"context_and_answer",
"document_id_and_answer",
"document_id_and_context_and_answer",
"document_id_or_answer",
] = "document_id_or_answer",
) -> Dict[str, float]:
documents = df[df["type"] == "document"]
if len(documents) == 0:
return {}
metrics_df = self._build_document_metrics_df(
documents,
simulated_top_k_retriever=simulated_top_k_retriever,
document_relevance_criterion=document_relevance_criterion,
)
return {metric: metrics_df[metric].mean().tolist() for metric in metrics_df.columns}
def _build_document_metrics_df(
self,
documents: DataFrame,
simulated_top_k_retriever: int = -1,
document_relevance_criterion: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"context_and_answer",
"document_id_and_answer",
"document_id_and_context_and_answer",
"document_id_or_answer",
] = "document_id_or_answer",
) -> DataFrame:
"""
Builds a dataframe containing document metrics (columns) per pair of query and gold document ids (index).
Document metrics are:
- mrr (Mean Reciprocal Rank: see https://en.wikipedia.org/wiki/Mean_reciprocal_rank)
- map (Mean Average Precision: see https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision)
- precision (Precision: How many of the returned documents were relevant?)
- recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?)
        - recall_single_hit (Recall for Question Answering: Did the query return at least one relevant document? -> 1.0 or 0.0)

        :param documents: document eval dataframe
:param simulated_top_k_retriever: simulates top_k param of retriever.
:param document_relevance_criterion: criterion for deciding whether documents are relevant or not.
You can select between:
- 'document_id': Document's id or custom id must match.
Typical use case: Document Retrieval
- 'context': Document's content must match.
Typical use case: Document-independent Passage Retrieval
- 'document_id_and_context': boolean operation `'document_id' AND 'context'`.
Typical use case: Document-specific Passage Retrieval
- 'document_id_or_context': boolean operation `'document_id' OR 'context'`.
Typical use case: Document Retrieval having sparse context labels
- 'answer': Document's content must include the answer.
Typical use case: Question Answering
- 'document_id_or_answer' (default): boolean operation `'document_id' OR 'answer'`.
This is intended to be a proper default value in order to support both main use cases:
- Document Retrieval
- Question Answering
- 'context_and_answer': boolean operation `'context' AND 'answer'`.
Typical use case: Question Answering with context-specific answers (see answer_scope='context')
- 'document_id_and_answer': boolean operation `'document_id' AND 'answer'`.
Typical use case: Question Answering with document-specific answers (see answer_scope='document_id')
            - 'document_id_and_context_and_answer': boolean operation `'document_id' AND 'context' AND 'answer'`.
Typical use case: Question Answering with document-and-context-specific answers (see answer_scope='document_id_and_context')
Default value is 'document_id_or_answer'.
"""
if simulated_top_k_retriever != -1:
documents = documents[documents["rank"] <= simulated_top_k_retriever]
# find out which label matched
def find_matched_label_idxs(row) -> List[int]: # pylint: disable=too-many-return-statements
id_matches = [idx for idx, val in enumerate(row["gold_documents_id_match"]) if val == 1.0]
context_matches = [
idx for idx, val in enumerate(row["gold_contexts_similarity"]) if val > 65.0
] # TODO: hardcoded threshold for now, will be param of calculate_metrics
answer_matches = [idx for idx, val in enumerate(row["gold_answers_match"]) if val == 1.0]
if document_relevance_criterion == "document_id":
return id_matches
elif document_relevance_criterion == "context":
return context_matches
elif document_relevance_criterion == "answer":
return answer_matches
elif document_relevance_criterion == "document_id_and_context":
return list(set(id_matches) & set(context_matches))
elif document_relevance_criterion == "document_id_or_context":
return list(set(id_matches) | set(context_matches))
elif document_relevance_criterion == "document_id_and_answer":
return list(set(id_matches) & set(answer_matches))
elif document_relevance_criterion == "document_id_or_answer":
return list(set(id_matches) | set(answer_matches))
elif document_relevance_criterion == "context_and_answer":
return list(set(context_matches) & set(answer_matches))
elif document_relevance_criterion == "document_id_and_context_and_answer":
return list(set(id_matches) & set(context_matches) & set(answer_matches))
else:
raise ValueError(f"document_relevance_criterion '{document_relevance_criterion}' not supported.")
documents["matched_label_idxs"] = documents.apply(find_matched_label_idxs, axis=1)
metrics = []
for multilabel_id in documents["multilabel_id"].unique():
query_df = documents[documents["multilabel_id"] == multilabel_id]
# Note: Metrics are always calculated on document_ids.
# For some document relevance criteria (e.g. context), the gold_document_ids are not enough or not useful at all.
# So, we have to adjust the relevant ids according to the document_relevance_criterion.
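            # e.g. document_relevance_criterion="document_id_and_context" reads the "gold_id_and_context_match" column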
relevance_criterion_col = f"{document_relevance_criterion.replace('document_id', 'gold_id')}_match"
relevant_rows = query_df[query_df[relevance_criterion_col] == 1]
# all labels without no_answers
# we need to match all (except for single hit recall)
gold_document_ids = (
list(query_df["gold_custom_document_ids"].iloc[0])
if "gold_custom_document_ids" in query_df
else list(query_df["gold_document_ids"].iloc[0])
)
# remove no_answer label
            gold_document_ids = [doc_id for doc_id in gold_document_ids if doc_id != "00"]
num_labels = len(gold_document_ids)
num_matched_labels = len({idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs})
num_missing_labels = num_labels - num_matched_labels
relevance_criterion_ids = list(relevant_rows["document_id"].values)
num_relevants = len(set(relevance_criterion_ids)) + num_missing_labels
num_retrieved = len(query_df["document_id"])
num_retrieved_relevants = len(relevant_rows)
rank_retrieved_relevants = relevant_rows["rank"].values
if num_labels == 0:
# For no_answer queries, we set all metrics to 1.0, to indicate that the retriever cannot improve the pipeline.
# This behavior is different from pytrec_eval, which sets the metrics to 0.0 if there is no relevant document in the evalset.
rr = 1.0
avg_precision = 1.0
recall_multi_hit = 1.0
recall_single_hit = 1.0
precision = 1.0
ndcg = 1.0
elif num_retrieved_relevants == 0:
# Set all metrics to 0.0 if no relevant document has been retrieved to avoid undefined metrics.
rr = 0.0
avg_precision = 0.0
recall_multi_hit = 0.0
recall_single_hit = 0.0
precision = 0.0
ndcg = 0.0
else:
# The previous checks ensure:
# - `num_labels` > 0
# - `num_retrieved_relevants` > 0
# - `num_relevants` > 0 (`num_relevants` is always >= `num_labels`)
# - `num_retrieved` > 0 (`num_retrieved` is always >= `num_retrieved_relevants`)
# - `len(rank_retrieved_relevants)` > 0 (`len(rank_retrieved_relevants)` is always == `num_retrieved_relevants`)
avp_retrieved_relevants = [
len(relevant_rows[relevant_rows["rank"] <= rank]) / rank for rank in rank_retrieved_relevants
]
avg_precision = np.sum(avp_retrieved_relevants) / num_relevants
recall_multi_hit = num_matched_labels / num_labels
recall_single_hit = 1.0
precision = num_retrieved_relevants / num_retrieved
rr = 1.0 / rank_retrieved_relevants.min()
dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)])
ndcg = dcg / idcg
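                # Illustrative example (hypothetical numbers): with 5 retrieved documents, relevant hits at
                # ranks [1, 3] and num_relevants == 2:
                #   precision = 2/5 = 0.4, rr = 1/1 = 1.0, avg_precision = (1/1 + 2/3) / 2 ≈ 0.83,
                #   dcg = 1/log2(2) + 1/log2(4) = 1.5, idcg = 1/log2(2) + 1/log2(3) ≈ 1.63, ndcg ≈ 0.92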
metrics.append(
{
"recall_multi_hit": recall_multi_hit,
"recall_single_hit": recall_single_hit,
"precision": precision,
"map": avg_precision,
"mrr": rr,
"ndcg": ndcg,
}
)
metrics_df = DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
        return metrics_df

    def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
"""
Saves the evaluation result.
The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.
:param out_dir: Path to the target folder the csvs will be saved.
:param to_csv_kwargs: kwargs to be passed to DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
This method uses different default values than DataFrame.to_csv() for the following parameters:
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
"""
out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
logger.info("Saving evaluation results to %s", out_dir)
if not out_dir.exists():
out_dir.mkdir(parents=True)
for node_name, df in self.node_results.items():
target_path = out_dir / f"{node_name}.csv"
default_to_csv_kwargs = {
"index": False,
"quoting": csv.QUOTE_NONNUMERIC, # avoids problems with \r chars in texts by enclosing all string values in quotes
}
to_csv_kwargs = {**default_to_csv_kwargs, **to_csv_kwargs}
            df.to_csv(target_path, **to_csv_kwargs)

    @classmethod
def load(cls, load_dir: Union[str, Path], **read_csv_kwargs):
"""
Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
:param load_dir: The directory containing the csv files.
:param read_csv_kwargs: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
This method uses different default values than pd.read_csv() for the following parameters:
header=0, converters=CONVERTERS
where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
"""
load_dir = load_dir if isinstance(load_dir, Path) else Path(load_dir)
csv_files = [file for file in load_dir.iterdir() if file.is_file() and file.suffix == ".csv"]
cols_to_convert = [
"filters",
"gold_document_ids",
"gold_custom_document_ids",
"gold_contexts",
"gold_answers",
"gold_documents_id_match",
"gold_offsets_in_documents",
"gold_offsets_in_contexts",
"gold_answers_exact_match",
"gold_answers_f1",
"gold_answers_sas",
"gold_answers_match",
"gold_contexts_similarity",
"offsets_in_document",
"offsets_in_context",
"document_ids",
"custom_document_ids",
"gold_document_contents",
]
def safe_literal_eval(x: str) -> Any:
if x == "":
return None
return ast.literal_eval(x)
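        # e.g. the cell string "['doc1', 'doc2']" is parsed back into the list ['doc1', 'doc2'] (values illustrative);
        # empty cells become None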
converters = dict.fromkeys(cols_to_convert, safe_literal_eval)
default_read_csv_kwargs = {"converters": converters, "header": 0}
read_csv_kwargs = {**default_read_csv_kwargs, **read_csv_kwargs}
node_results = {file.stem: pd.read_csv(file, **read_csv_kwargs) for file in csv_files}
# backward compatibility mappings
for df in node_results.values():
df.replace(to_replace=np.nan, value=None, inplace=True)
df.rename(columns={"gold_document_contents": "gold_contexts", "content": "context"}, inplace=True)
# convert single document_id to list
if "answer" in df.columns and "document_id" in df.columns and not "document_ids" in df.columns:
df["document_ids"] = df["document_id"].apply(lambda x: [x] if x not in [None, "None"] else [])
df.drop(columns=["document_id"], inplace=True)
if (
"answer" in df.columns
and "custom_document_id" in df.columns
and not "custom_document_ids" in df.columns
):
df["custom_document_ids"] = df["custom_document_id"].apply(
lambda x: [x] if x not in [None, "None"] else []
)
df.drop(columns=["custom_document_id"], inplace=True)
result = cls(node_results)
return result