mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-18 13:37:55 +00:00

* ci: Simplify Python code with ruff rules SIM * Revert #5828 * ruff --select=I --fix haystack/modeling/infer.py --------- Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
1634 lines
80 KiB
Python
1634 lines
80 KiB
Python
from __future__ import annotations
|
|
import csv
|
|
import hashlib
|
|
import inspect
|
|
|
|
from typing import Any, Optional, Dict, List, Union, Literal
|
|
|
|
from pathlib import Path
|
|
from uuid import uuid4
|
|
import logging
|
|
import time
|
|
import json
|
|
import ast
|
|
from dataclasses import asdict
|
|
|
|
import numpy as np
|
|
from numpy import ndarray
|
|
import pandas as pd
|
|
from pandas import DataFrame
|
|
|
|
from pydantic import BaseConfig, Field
|
|
from pydantic.json import pydantic_encoder
|
|
|
|
# We are using Pydantic dataclasses instead of vanilla Python's
|
|
# See #1598 for the reasons behind this choice & performance considerations
|
|
from pydantic.dataclasses import dataclass
|
|
|
|
from haystack.mmh3 import hash128
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
BaseConfig.arbitrary_types_allowed = True
|
|
|
|
|
|
#: Types of content_types supported
|
|
ContentTypes = Literal["text", "table", "image", "audio"]
|
|
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
|
|
|
|
|
|
@dataclass
|
|
class Document:
|
|
id: str
|
|
content: Union[str, DataFrame]
|
|
content_type: ContentTypes = Field(default="text")
|
|
meta: Dict[str, Any] = Field(default={})
|
|
id_hash_keys: List[str] = Field(default=["content"])
|
|
score: Optional[float] = None
|
|
embedding: Optional[ndarray] = None
|
|
|
|
# We use a custom init here as we want some custom logic. The annotations above are however still needed in order
|
|
# to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
|
|
# They also help in annotating which object attributes will always be present (e.g. "id") even though they
|
|
# don't need to passed by the user in init and are rather initialized automatically in the init
|
|
def __init__(
|
|
self,
|
|
content: Union[str, DataFrame],
|
|
content_type: ContentTypes = "text",
|
|
id: Optional[str] = None,
|
|
score: Optional[float] = None,
|
|
meta: Optional[Dict[str, Any]] = None,
|
|
embedding: Optional[ndarray] = None,
|
|
id_hash_keys: Optional[List[str]] = None,
|
|
):
|
|
"""
|
|
One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
|
|
Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
|
|
many other places that manipulate or interact with document-level data.
|
|
Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
|
|
into smaller passages. We'll have one Document per passage in this case.
|
|
Each document has a unique ID. This can be supplied by the user or generated automatically.
|
|
It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
|
|
There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
|
|
:param content: Content of the document. For most cases, this will be text, but it can be a table or image.
|
|
:param content_type: One of "text", "table", "image" or "audio". Haystack components can use this to adjust their
|
|
handling of Documents and check compatibility.
|
|
:param id: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
|
|
creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
|
|
:param score: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker).
|
|
If model's `scale_score` was set to True (default) score is in the unit interval (range of [0,1]), where 1 means extremely relevant.
|
|
:param meta: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
|
|
:param embedding: Vector encoding of the text
|
|
:param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes.
|
|
To ensure you don't have duplicate documents in your DocumentStore if texts are
|
|
not unique, modify the metadata and pass, for example, "meta" to this field (example: ["content", "meta"]).
|
|
In this case, the id is generated by using the content and the defined metadata.
|
|
If you specify a custom ID for the `id` parameter, the `id_hash_keys` parameter is
|
|
ignored and the custom ID is used.
|
|
|
|
Note that you can use even nested fields of the `meta` as id_hash_keys. For example, if you
|
|
have a key in `meta` called `url` and you want to use it as part of the id, you can pass
|
|
this parameter as `["meta.url"]`. Haystack supports a maximum depth of 1. For example, if you
|
|
use `meta.url.path`, it looks for the `url.path` key in the `meta` dict, for example `meta['url.path']`.
|
|
|
|
|
|
"""
|
|
|
|
if content is None:
|
|
raise ValueError("Can't create 'Document': Mandatory 'content' field is None")
|
|
|
|
self.content = content
|
|
self.content_type = content_type
|
|
self.score = score
|
|
self.meta = meta or {}
|
|
|
|
allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding"]
|
|
|
|
if id_hash_keys is not None and not all(
|
|
key in allowed_hash_key_attributes or key.startswith("meta.") for key in id_hash_keys
|
|
):
|
|
raise ValueError(
|
|
f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a "
|
|
f"list of Document's attribute names (like {', '.join(allowed_hash_key_attributes)}) or "
|
|
f"a key of meta with a maximum depth of 1 (like meta.url). "
|
|
"See [Custom id hashing on documentstore level](https://github.com/deepset-ai/haystack/pull/1910) and "
|
|
"[Allow more flexible Document id hashing](https://github.com/deepset-ai/haystack/issues/4317) for details"
|
|
)
|
|
# We store id_hash_keys to be able to clone documents, for example when splitting them during pre-processing
|
|
self.id_hash_keys = id_hash_keys or ["content"]
|
|
|
|
if embedding is not None:
|
|
embedding = np.asarray(embedding)
|
|
self.embedding = embedding
|
|
|
|
# Create a unique ID (either new one, or one from user input)
|
|
if id is not None:
|
|
self.id: str = str(id)
|
|
else:
|
|
self.id: str = self._get_id(id_hash_keys=id_hash_keys)
|
|
|
|
def _get_id(self, id_hash_keys: Optional[List[str]] = None):
|
|
"""
|
|
Generate the id of a document by creating the hash of strings. By default the content of a document is
|
|
used to generate the hash. There are two ways of modifying the generated id of a document. Either static keys
|
|
or a selection of the content.
|
|
:param id_hash_keys: Optional list of fields that should be dynamically used to generate the hash.
|
|
"""
|
|
|
|
if id_hash_keys is None:
|
|
return "{:02x}".format(hash128(str(self.content)))
|
|
|
|
final_hash_key = ""
|
|
for attr in id_hash_keys:
|
|
if attr.startswith("meta."):
|
|
meta_key = attr.split(".", maxsplit=1)[1]
|
|
if meta_key in self.meta:
|
|
final_hash_key += ":" + str(self.meta[meta_key])
|
|
else:
|
|
final_hash_key += ":" + str(getattr(self, attr))
|
|
|
|
if final_hash_key == "":
|
|
raise ValueError(
|
|
"Can't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta'] or be set to None."
|
|
)
|
|
|
|
return "{:02x}".format(hash128(final_hash_key))
|
|
|
|
def to_dict(self, field_map: Optional[Dict[str, Any]] = None) -> Dict:
|
|
"""
|
|
Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
|
|
resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
|
|
they are serialized / stored in other places (e.g. elasticsearch)
|
|
Example:
|
|
|
|
```python
|
|
doc = Document(content="some text", content_type="text")
|
|
doc.to_dict(field_map={"custom_content_field": "content"})
|
|
|
|
# Returns {"custom_content_field": "some text", content_type": "text"}
|
|
```
|
|
|
|
:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
|
|
:return: dict with content of the Document
|
|
"""
|
|
if not field_map:
|
|
field_map = {}
|
|
|
|
inv_field_map = {v: k for k, v in field_map.items()}
|
|
_doc: Dict[str, str] = {}
|
|
for k, v in self.__dict__.items():
|
|
# Exclude internal fields (Pydantic, ...) fields from the conversion process
|
|
if k.startswith("__"):
|
|
continue
|
|
# Convert pd.DataFrame to list of rows for serialization
|
|
if k == "content" and self.content_type == "table" and isinstance(self.content, DataFrame):
|
|
v = dataframe_to_list(self.content)
|
|
k = k if k not in inv_field_map else inv_field_map[k]
|
|
_doc[k] = v
|
|
return _doc
|
|
|
|
@classmethod
|
|
def from_dict(cls, dict: Dict[str, Any], field_map: Optional[Dict[str, Any]] = None) -> Document:
|
|
"""
|
|
Create Document from dict. An optional `field_map` parameter can be supplied to adjust for custom names of the keys in the
|
|
input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
|
|
they are serialized / stored in other places (e.g. elasticsearch).
|
|
|
|
Example:
|
|
|
|
```python
|
|
my_dict = {"custom_content_field": "some text", "content_type": "text"}
|
|
Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
|
|
```
|
|
|
|
:param field_map: Dict with keys being the custom target keys and values being the standard Document attributes
|
|
:return: A Document object
|
|
"""
|
|
if not field_map:
|
|
field_map = {}
|
|
|
|
_doc = dict.copy()
|
|
init_args = ["content", "content_type", "id", "score", "id_hash_keys", "question", "meta", "embedding"]
|
|
if "meta" not in _doc.keys():
|
|
_doc["meta"] = {}
|
|
# copy additional fields into "meta"
|
|
for k, v in _doc.items():
|
|
# Exclude internal fields (Pydantic, ...) fields from the conversion process
|
|
if k.startswith("__"):
|
|
continue
|
|
if k not in init_args and k not in field_map:
|
|
_doc["meta"][k] = v
|
|
# remove additional fields from top level
|
|
_new_doc = {}
|
|
for k, v in _doc.items():
|
|
if k in init_args:
|
|
_new_doc[k] = v
|
|
elif k in field_map:
|
|
k = field_map[k]
|
|
_new_doc[k] = v
|
|
|
|
# Convert list of rows to DataFrame
|
|
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
|
|
_new_doc["content"] = dataframe_from_list(_new_doc["content"])
|
|
|
|
return cls(**_new_doc)
|
|
|
|
def to_json(self, field_map: Optional[Dict[str, Any]] = None) -> str:
|
|
if not field_map:
|
|
field_map = {}
|
|
dictionary = self.to_dict(field_map=field_map)
|
|
return json.dumps(dictionary, cls=NumpyEncoder)
|
|
|
|
@classmethod
|
|
def from_json(cls, data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document:
|
|
if not field_map:
|
|
field_map = {}
|
|
if isinstance(data, str):
|
|
dict_data = json.loads(data)
|
|
else:
|
|
dict_data = data
|
|
return cls.from_dict(dict_data, field_map=field_map)
|
|
|
|
def __eq__(self, other):
|
|
content = getattr(other, "content", None)
|
|
if isinstance(content, pd.DataFrame):
|
|
is_content_equal = content.equals(self.content)
|
|
else:
|
|
is_content_equal = content == self.content
|
|
return (
|
|
isinstance(other, self.__class__)
|
|
and is_content_equal
|
|
and getattr(other, "content_type", None) == self.content_type
|
|
and getattr(other, "id", None) == self.id
|
|
and getattr(other, "id_hash_keys", None) == self.id_hash_keys
|
|
and getattr(other, "score", None) == self.score
|
|
and getattr(other, "meta", None) == self.meta
|
|
and np.array_equal(getattr(other, "embedding", None), self.embedding)
|
|
)
|
|
|
|
def __repr__(self):
|
|
doc_dict = self.to_dict()
|
|
embedding = doc_dict.get("embedding", None)
|
|
if embedding is not None:
|
|
doc_dict["embedding"] = f"<embedding of shape {getattr(embedding, 'shape', '[no shape]')}>"
|
|
return f"<Document: {str(doc_dict)}>"
|
|
|
|
def __str__(self):
|
|
# In some cases, self.content is None (therefore not subscriptable)
|
|
if self.content is None:
|
|
return f"<Document: id={self.id}, content=None>"
|
|
return f"<Document: id={self.id}, content='{self.content[:100]}{'...' if len(self.content) > 100 else ''}'>"
|
|
|
|
def __lt__(self, other):
|
|
"""Enable sorting of Documents by score"""
|
|
return self.score < other.score
|
|
|
|
|
|
@dataclass
|
|
class Span:
|
|
start: int
|
|
end: int
|
|
"""
|
|
Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
|
|
For extractive QA: Character where answer starts/ends
|
|
|
|
:param start: Position where the span starts
|
|
:param end: Position where the span ends
|
|
"""
|
|
|
|
def __contains__(self, value):
|
|
"""
|
|
Checks for inclusion of the given value into the interval defined by Span.
|
|
```
|
|
assert 10 in Span(5, 15) # True
|
|
assert 20 in Span(1, 15) # False
|
|
```
|
|
Includes the left edge, but not the right edge.
|
|
```
|
|
assert 5 in Span(5, 15) # True
|
|
assert 15 in Span(5, 15) # False
|
|
```
|
|
Works for numbers and all values that can be safely converted into floats.
|
|
```
|
|
assert 10.0 in Span(5, 15) # True
|
|
assert "10" in Span(5, 15) # True
|
|
```
|
|
It also works for Span objects, returning True only if the given
|
|
Span is fully contained into the original Span.
|
|
As for numerical values, the left edge is included, the right edge is not.
|
|
```
|
|
assert Span(10, 11) in Span(5, 15) # True
|
|
assert Span(5, 10) in Span(5, 15) # True
|
|
assert Span(10, 15) in Span(5, 15) # False
|
|
assert Span(5, 15) in Span(5, 15) # False
|
|
assert Span(5, 14) in Span(5, 15) # True
|
|
assert Span(0, 1) in Span(5, 15) # False
|
|
assert Span(0, 10) in Span(5, 15) # False
|
|
assert Span(10, 20) in Span(5, 15) # False
|
|
```
|
|
"""
|
|
if isinstance(value, Span):
|
|
return self.start <= value.start and self.end > value.end
|
|
try:
|
|
value = float(value)
|
|
return self.start <= value < self.end
|
|
except Exception as e:
|
|
raise ValueError(
|
|
f"Cannot use 'in' with a value of type {type(value)}. Use numeric values or Span objects."
|
|
) from e
|
|
|
|
|
|
@dataclass
|
|
class TableCell:
|
|
row: int
|
|
col: int
|
|
"""
|
|
Defining a table cell via the row and column index.
|
|
|
|
:param row: Row index of the cell
|
|
:param col: Column index of the cell
|
|
"""
|
|
|
|
|
|
@dataclass
|
|
class Answer:
|
|
answer: str
|
|
type: Literal["generative", "extractive", "other"] = "extractive"
|
|
score: Optional[float] = None
|
|
context: Optional[Union[str, DataFrame]] = None
|
|
offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None
|
|
offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None
|
|
document_ids: Optional[List[str]] = None
|
|
meta: Optional[Dict[str, Any]] = None
|
|
|
|
"""
|
|
The fundamental object in Haystack to represent any type of Answers (e.g. extractive QA, generative QA or TableQA).
|
|
For example, it's used within some Nodes like the Reader, but also in the REST API.
|
|
|
|
:param answer: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible) this will be an empty string.
|
|
:param type: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
|
|
(i.e. we can locate an exact answer string in one of the documents) or from a generative model
|
|
(i.e. no pointer to a specific document, no offsets ...).
|
|
:param score: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
|
|
In the range of [0,1], where 1 means extremely relevant.
|
|
:param context: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
|
|
:param offsets_in_document: List of `Span` objects with start and end positions of the answer **in the
|
|
document** (as stored in the document store).
|
|
For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
|
|
For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
|
|
(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
|
|
:param offsets_in_context: List of `Span` objects with start and end positions of the answer **in the
|
|
context** (i.e. the surrounding text/table of a certain window size).
|
|
For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start
|
|
For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start
|
|
(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
|
|
:param document_ids: IDs of the documents the answer came from (if any).
|
|
For extractive QA, this will be a list of length 1.
|
|
For generative QA, this will be a list of length > 0.
|
|
:param meta: Dict that can be used to associate any kind of custom meta data with the answer.
|
|
In extractive QA, this will carry the meta data of the document where the answer was found.
|
|
"""
|
|
|
|
def __post_init__(self):
|
|
# In case offsets are passed as dicts rather than Span or TableCell objects we convert them here
|
|
# For example, this is used when instantiating an object via from_json()
|
|
if self.offsets_in_document is not None:
|
|
self.offsets_in_document = self._from_dict_offsets(self.offsets_in_document)
|
|
|
|
if self.offsets_in_context is not None:
|
|
self.offsets_in_context = self._from_dict_offsets(self.offsets_in_context)
|
|
|
|
if self.meta is None:
|
|
self.meta = {}
|
|
|
|
# In case the context is a list of lists for a table document that is instantiated by from_json() or from_dict()
|
|
if isinstance(self.context, list):
|
|
self.context = dataframe_from_list(self.context)
|
|
|
|
def __lt__(self, other):
|
|
"""Enable sorting of Answers by score"""
|
|
return self.score < other.score
|
|
|
|
def __str__(self):
|
|
# self.context might be None (therefore not subscriptable)
|
|
if self.context is None:
|
|
return f"<Answer: answer='{self.answer}', score={self.score}, context=None>"
|
|
return f"<Answer: answer='{self.answer}', score={self.score}, context='{self.context[:50]}{'...' if len(self.context) > 50 else ''}'>"
|
|
|
|
def __repr__(self):
|
|
return f"<Answer {self.to_dict()}>"
|
|
|
|
def to_dict(self) -> Dict:
|
|
return asdict(self, dict_factory=_dict_factory)
|
|
|
|
@classmethod
|
|
def from_dict(cls, dict: Dict) -> Answer:
|
|
# backwards compatibility: `document_id: Optional[str]` was changed to `document_ids: Optional[List[str]]`
|
|
if "document_id" in dict:
|
|
dict = dict.copy()
|
|
document_id = dict.pop("document_id")
|
|
dict["document_ids"] = [document_id] if document_id is not None else None
|
|
return cls(**dict)
|
|
|
|
def to_json(self):
|
|
return json.dumps(self.to_dict(), cls=NumpyEncoder)
|
|
|
|
@classmethod
|
|
def from_json(cls, data: Union[str, Dict[str, Any]]):
|
|
if isinstance(data, str):
|
|
dict_data = json.loads(data)
|
|
else:
|
|
dict_data = data
|
|
return cls.from_dict(dict_data)
|
|
|
|
@staticmethod
|
|
def _from_dict_offsets(offsets):
|
|
converted_offsets = []
|
|
for e in offsets:
|
|
if isinstance(e, dict):
|
|
if "row" in e: # is a TableCell
|
|
converted_offsets.append(TableCell(**e))
|
|
else:
|
|
converted_offsets.append(Span(**e))
|
|
else:
|
|
converted_offsets.append(e)
|
|
return converted_offsets
|
|
|
|
def __eq__(self, other):
|
|
context = getattr(other, "context", None)
|
|
if isinstance(context, pd.DataFrame):
|
|
is_content_equal = context.equals(self.context)
|
|
else:
|
|
is_content_equal = context == self.context
|
|
return (
|
|
isinstance(other, self.__class__)
|
|
and is_content_equal
|
|
and getattr(other, "answer", None) == self.answer
|
|
and getattr(other, "type", None) == self.type
|
|
and getattr(other, "score", None) == self.score
|
|
and getattr(other, "offsets_in_document", None) == self.offsets_in_document
|
|
and getattr(other, "offsets_in_context", None) == self.offsets_in_context
|
|
and getattr(other, "document_ids", None) == self.document_ids
|
|
and getattr(other, "meta", None) == self.meta
|
|
)
|
|
|
|
|
|
@dataclass
|
|
class Label:
|
|
id: str
|
|
query: str
|
|
document: Document
|
|
is_correct_answer: bool
|
|
is_correct_document: bool
|
|
origin: Literal["user-feedback", "gold-label"]
|
|
answer: Optional[Answer] = None
|
|
pipeline_id: Optional[str] = None
|
|
created_at: Optional[str] = None
|
|
updated_at: Optional[str] = None
|
|
meta: Optional[dict] = None
|
|
# Note that filters cannot be of type Optional[FilterType] as assignments like `filters = {"name": "file_name"}`
|
|
# won't work due to Dict's covariance. See https://github.com/python/mypy/issues/9418.
|
|
filters: Optional[Dict[str, Any]] = None
|
|
|
|
# We use a custom init here as we want some custom logic. The annotations above are however still needed in order
|
|
# to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
|
|
def __init__(
|
|
self,
|
|
query: str,
|
|
document: Document,
|
|
is_correct_answer: bool,
|
|
is_correct_document: bool,
|
|
origin: Literal["user-feedback", "gold-label"],
|
|
answer: Optional[Answer],
|
|
id: Optional[str] = None,
|
|
pipeline_id: Optional[str] = None,
|
|
created_at: Optional[str] = None,
|
|
updated_at: Optional[str] = None,
|
|
meta: Optional[dict] = None,
|
|
filters: Optional[Dict[str, Any]] = None,
|
|
):
|
|
"""
|
|
Object used to represent label/feedback in a standardized way within Haystack.
|
|
This includes labels from dataset like SQuAD, annotations from labeling tools,
|
|
or, user-feedback from the Haystack REST API.
|
|
|
|
:param query: the question (or query) for finding answers.
|
|
:param document:
|
|
:param answer: the answer object.
|
|
:param is_correct_answer: whether the sample is positive or negative.
|
|
:param is_correct_document: in case of negative sample(is_correct_answer is False), there could be two cases;
|
|
incorrect answer but correct document & incorrect document. This flag denotes if
|
|
the returned document was correct.
|
|
:param origin: the source for the labels. It can be used to later for filtering.
|
|
:param id: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically.
|
|
:param pipeline_id: pipeline identifier (any str) that was involved for generating this label (in-case of user feedback).
|
|
:param created_at: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
|
|
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
|
|
:param created_at: Timestamp of update with format yyyy-MM-dd HH:mm:ss.
|
|
Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
|
|
:param meta: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed).
|
|
:param filters: filters that should be applied to the query to rule out non-relevant documents. For example, if there are different correct answers
|
|
in a DocumentStore depending on the retrieved document and the answer in this label is correct only on condition of the filters.
|
|
"""
|
|
|
|
# Create a unique ID (either new one, or one from user input)
|
|
if id:
|
|
self.id = str(id)
|
|
else:
|
|
self.id = str(uuid4())
|
|
|
|
if created_at is None:
|
|
created_at = time.strftime("%Y-%m-%d %H:%M:%S")
|
|
self.created_at = created_at
|
|
|
|
self.updated_at = updated_at
|
|
self.query = query
|
|
|
|
self.answer = answer
|
|
self.document = document
|
|
|
|
self.is_correct_answer = is_correct_answer
|
|
self.is_correct_document = is_correct_document
|
|
self.origin = origin
|
|
|
|
# TODO autofill answer.document_id if Document is provided
|
|
|
|
self.pipeline_id = pipeline_id
|
|
if not meta:
|
|
self.meta = {}
|
|
else:
|
|
self.meta = meta
|
|
self.filters = filters
|
|
|
|
@property
|
|
def no_answer(self) -> Optional[bool]:
|
|
no_answer = None
|
|
if self.answer is not None:
|
|
no_answer = self.answer.answer is None or self.answer.answer.strip() == ""
|
|
return no_answer
|
|
|
|
def to_dict(self):
|
|
return asdict(self, dict_factory=_dict_factory)
|
|
|
|
@classmethod
|
|
def from_dict(cls, dict: Dict):
|
|
answer = dict.get("answer")
|
|
if answer and isinstance(answer, Dict):
|
|
dict["answer"] = Answer.from_dict(dict["answer"])
|
|
doc = dict.get("document")
|
|
if isinstance(doc, Dict):
|
|
dict["document"] = Document.from_dict(dict["document"])
|
|
return cls(**dict)
|
|
|
|
def to_json(self):
|
|
return json.dumps(self.to_dict(), cls=NumpyEncoder)
|
|
|
|
@classmethod
|
|
def from_json(cls, data: Union[str, Dict[str, Any]]):
|
|
if isinstance(data, str):
|
|
dict_data = json.loads(data)
|
|
else:
|
|
dict_data = data
|
|
return cls.from_dict(dict_data)
|
|
|
|
# define __eq__ and __hash__ functions to deduplicate Label Objects
|
|
def __eq__(self, other):
|
|
return (
|
|
isinstance(other, self.__class__)
|
|
and getattr(other, "query", None) == self.query
|
|
and getattr(other, "answer", None) == self.answer
|
|
and getattr(other, "is_correct_answer", None) == self.is_correct_answer
|
|
and getattr(other, "is_correct_document", None) == self.is_correct_document
|
|
and getattr(other, "origin", None) == self.origin
|
|
and getattr(other, "document", None) == self.document
|
|
and getattr(other, "no_answer", None) == self.no_answer
|
|
and getattr(other, "pipeline_id", None) == self.pipeline_id
|
|
)
|
|
|
|
def __hash__(self):
|
|
return hash(
|
|
self.query
|
|
+ str(self.answer)
|
|
+ str(self.is_correct_answer)
|
|
+ str(self.is_correct_document)
|
|
+ str(self.origin)
|
|
+ str(self.document)
|
|
+ str(self.no_answer)
|
|
+ str(self.pipeline_id)
|
|
)
|
|
|
|
def __repr__(self):
|
|
return f"<Label: {self.to_dict()}>"
|
|
|
|
def __str__(self):
|
|
return f"<Label: {self.to_dict()}>"
|
|
|
|
|
|
def is_positive_label(label):
|
|
return (label.is_correct_answer and label.is_correct_document) or (
|
|
label.answer is None and label.is_correct_document
|
|
)
|
|
|
|
|
|
class MultiLabel:
|
|
def __init__(self, labels: List[Label], drop_negative_labels: bool = False, drop_no_answers: bool = False):
|
|
"""
|
|
There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
|
|
answers for one question or multiple documents contain the information you want for a query.
|
|
This class is "syntactic sugar" that simplifies the work with such a list of related Labels.
|
|
It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are
|
|
automatically created at init time. For example, MultiLabel.no_answer allows you to easily access if any of the
|
|
underlying Labels provided a text answer and therefore demonstrates that there is indeed a possible answer.
|
|
|
|
:param labels: A list of labels that belong to a similar query and shall be "grouped" together
|
|
:param drop_negative_labels: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI)
|
|
:param drop_no_answers: Whether to drop labels that specify the answer is impossible
|
|
"""
|
|
# drop duplicate labels and remove negative labels if needed.
|
|
labels = list(dict.fromkeys(labels))
|
|
if drop_negative_labels:
|
|
labels = [l for l in labels if is_positive_label(l)]
|
|
if drop_no_answers:
|
|
labels = [l for l in labels if l.no_answer is False]
|
|
|
|
self._labels = labels
|
|
self._query = self._aggregate_labels(key="query", must_be_single_value=True)[0]
|
|
self._filters = self._aggregate_labels(key="filters", must_be_single_value=True)[0]
|
|
self.id = hashlib.md5((self.query + json.dumps(self.filters, sort_keys=True)).encode()).hexdigest()
|
|
|
|
# Currently no_answer is only true if all labels are "no_answers", we could later introduce a param here to let
|
|
# users decided which aggregation logic they want
|
|
self._no_answer = all(l.no_answer for l in self._labels)
|
|
|
|
# Answer strings and offsets cleaned for no_answers:
|
|
# If there are only no_answers, offsets are empty and answers will be a single empty string
|
|
# which equals the no_answers representation of reader nodes.
|
|
if self._no_answer:
|
|
self._answers = [""]
|
|
self._offsets_in_documents: List[dict] = []
|
|
self._offsets_in_contexts: List[dict] = []
|
|
else:
|
|
answered = [l.answer for l in self._labels if not l.no_answer and l.answer is not None]
|
|
self._answers = [answer.answer for answer in answered]
|
|
self._offsets_in_documents = []
|
|
self._offsets_in_contexts = []
|
|
for answer in answered:
|
|
if answer.offsets_in_document is not None:
|
|
for span in answer.offsets_in_document:
|
|
self._offsets_in_documents.append(self._to_dict_offsets(span))
|
|
if answer.offsets_in_context is not None:
|
|
for span in answer.offsets_in_context:
|
|
self._offsets_in_contexts.append(self._to_dict_offsets(span))
|
|
|
|
# There are two options here to represent document_ids:
|
|
# taking the id from the document of each label or taking the document_id of each label's answer.
|
|
# We take the former as labels without answers are allowed.
|
|
#
|
|
# For no_answer cases document_store.add_eval_data() currently adds all documents coming from the SQuAD paragraph's context
|
|
# as separate no_answer labels, and thus with document.id but without answer.document_id.
|
|
# If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer.
|
|
# Hence, we exclude them here as well.
|
|
self._document_ids = [l.document.id for l in self._labels if not l.no_answer]
|
|
self._contexts = [str(l.document.content) for l in self._labels if not l.no_answer]
|
|
|
|
@staticmethod
|
|
def _to_dict_offsets(offset: Union[Span, TableCell]) -> Dict:
|
|
if isinstance(offset, TableCell):
|
|
return {"row": offset.row, "col": offset.col}
|
|
else:
|
|
return {"start": offset.start, "end": offset.end}
|
|
|
|
@property
|
|
def labels(self):
|
|
return self._labels
|
|
|
|
@property
|
|
def query(self):
|
|
return self._query
|
|
|
|
@property
|
|
def filters(self):
|
|
return self._filters
|
|
|
|
@property
|
|
def document_ids(self):
|
|
return self._document_ids
|
|
|
|
@property
|
|
def contexts(self):
|
|
return self._contexts
|
|
|
|
@property
|
|
def no_answer(self):
|
|
return self._no_answer
|
|
|
|
@property
|
|
def answers(self):
|
|
return self._answers
|
|
|
|
@property
|
|
def offsets_in_documents(self):
|
|
return self._offsets_in_documents
|
|
|
|
@property
|
|
def offsets_in_contexts(self):
|
|
return self._offsets_in_contexts
|
|
|
|
def _aggregate_labels(self, key, must_be_single_value=True) -> List[Any]:
|
|
if any(isinstance(getattr(l, key), dict) for l in self.labels):
|
|
# dict is not hashable so we collect unique filters via looping through all labels
|
|
unique_values = []
|
|
for l in self.labels:
|
|
if l.filters not in unique_values:
|
|
unique_values.append(l.filters)
|
|
else:
|
|
unique_values = list({getattr(l, key) for l in self.labels})
|
|
if must_be_single_value and len(unique_values) > 1:
|
|
raise ValueError(
|
|
f"Tried to combine attribute '{key}' of Labels, but found multiple different values: {unique_values}"
|
|
)
|
|
return unique_values
|
|
|
|
def to_dict(self):
|
|
# convert internal attribute names to property names
|
|
result = {k[1:] if k[0] == "_" else k: v for k, v in vars(self).items()}
|
|
# convert Label object to dict
|
|
result["labels"] = [label.to_dict() for label in result["labels"]]
|
|
return result
|
|
|
|
@classmethod
|
|
def from_dict(cls, dict: Dict):
|
|
# exclude extra arguments
|
|
inputs = {k: v for k, v in dict.items() if k in inspect.signature(cls).parameters}
|
|
inputs["labels"] = [Label.from_dict(label) for label in inputs["labels"]]
|
|
return cls(**inputs)
|
|
|
|
def to_json(self):
|
|
return json.dumps(self.to_dict(), default=pydantic_encoder)
|
|
|
|
@classmethod
|
|
def from_json(cls, data: Union[str, Dict[str, Any]]):
|
|
if isinstance(data, str):
|
|
dict_data = json.loads(data)
|
|
else:
|
|
dict_data = data
|
|
return cls.from_dict(dict_data)
|
|
|
|
def __eq__(self, other):
|
|
return isinstance(other, self.__class__) and self.labels == other.labels
|
|
|
|
def __repr__(self):
|
|
return f"<MultiLabel: {self.to_dict()}>"
|
|
|
|
def __str__(self):
|
|
return f"<MultiLabel: {self.to_dict()}>"
|
|
|
|
|
|
def _pydantic_dataclass_from_dict(dict: Dict, pydantic_dataclass_type) -> Any:
|
|
"""
|
|
Constructs a pydantic dataclass from a dict incl. other nested dataclasses.
|
|
This allows simple de-serialization of pydantic dataclasses from json.
|
|
:param dict: Dict containing all attributes and values for the dataclass.
|
|
:param pydantic_dataclass_type: The class of the dataclass that should be constructed (e.g. Document)
|
|
"""
|
|
base_model = pydantic_dataclass_type.__pydantic_model__.parse_obj(dict)
|
|
base_mode_fields = base_model.__fields__
|
|
|
|
values = {}
|
|
for base_model_field_name in base_mode_fields.keys():
|
|
value = getattr(base_model, base_model_field_name)
|
|
values[base_model_field_name] = value
|
|
|
|
dataclass_object = pydantic_dataclass_type(**values)
|
|
return dataclass_object
|
|
|
|
|
|
def _dict_factory(data):
|
|
"""Meant to be as the dict_factory for `asdict`. This function is called within `asdict` to convert a list of tuples
|
|
into a dictionary object. This handles the conversion of pandas Dataframes into a list of lists.
|
|
|
|
:param data: list of (key, value) pairs
|
|
"""
|
|
|
|
def convert_value(v):
|
|
if isinstance(v, pd.DataFrame):
|
|
return dataframe_to_list(v)
|
|
return v
|
|
|
|
return {k: convert_value(v) for k, v in data}
|
|
|
|
|
|
class NumpyEncoder(json.JSONEncoder):
|
|
def default(self, obj):
|
|
if isinstance(obj, np.ndarray):
|
|
return obj.tolist()
|
|
return json.JSONEncoder.default(self, obj)
|
|
|
|
|
|
def dataframe_to_list(df: pd.DataFrame) -> List[List]:
|
|
return [df.columns.tolist()] + df.values.tolist()
|
|
|
|
|
|
def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
|
|
return pd.DataFrame(columns=list_df[0], data=list_df[1:])
|
|
|
|
|
|
class EvaluationResult:
|
|
def __init__(self, node_results: Optional[Dict[str, DataFrame]] = None) -> None:
|
|
"""
|
|
A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
|
|
Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
|
|
convenience methods to work with them.
|
|
For example, you can calculate eval metrics, get detailed reports, or simulate different top_k settings:
|
|
|
|
```python
|
|
eval_results = pipeline.eval(...)
|
|
|
|
# derive detailed metrics
|
|
eval_results.calculate_metrics()
|
|
|
|
# show summary of incorrect queries
|
|
eval_results.wrong_examples()
|
|
```
|
|
|
|
Each row of the underlying DataFrames contains either an answer or a document that has been retrieved during evaluation.
|
|
Rows are enriched with basic information like rank, query, type, or node.
|
|
Additional answer or document-specific evaluation information, like gold labels
|
|
and metrics showing whether the row matches the gold labels, are included, too.
|
|
The DataFrames have the following schema:
|
|
- multilabel_id: The ID of the multilabel, which is unique for the pair of query and filters.
|
|
- query: The actual query string.
|
|
- filters: The filters used with the query.
|
|
- gold_answers (answers only): The expected answers.
|
|
- answer (answers only): The actual answer.
|
|
- context: The content of the document (the surrounding context of the answer for QA).
|
|
- exact_match (answers only): A metric showing if the answer exactly matches the gold label.
|
|
- f1 (answers only): A metric showing how well the answer overlaps with the gold label on a token basis.
|
|
- sas (answers only, optional): A metric showing how well the answer matches the gold label on a semantic basis.
|
|
- exact_match_context_scope (answers only): exact_match with enforced context match.
|
|
- f1_context_scope (answers only): f1 with enforced context scope match.
|
|
- sas_context_scope (answers only): sas with enforced context scope match.
|
|
- exact_match_document_scope (answers only): exact_match with enforced document scope match.
|
|
- f1_document_scope (answers only): f1 with enforced document scope match.
|
|
- sas_document_scope (answers only): sas with enforced document scope match.
|
|
- exact_match_document_id_and_context_scope: (answers only): exact_match with enforced document and context scope match.
|
|
- f1_document_id_and_context_scope (answers only): f1 with enforced document and context scope match.
|
|
- sas_document_id_and_context_scope (answers only): sas with enforced document and context scope match.
|
|
- gold_contexts: The contents of the gold documents.
|
|
- gold_id_match (documents only): A metric showing whether one of the gold document IDs matches the document.
|
|
- context_match (documents only): A metric showing whether one of the gold contexts matches the document content.
|
|
- answer_match (documents only): A metric showing whether the document contains the answer.
|
|
- gold_id_or_answer_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'answer_match'`.
|
|
- gold_id_and_answer_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'answer_match'`.
|
|
- gold_id_or_context_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'context_match'`.
|
|
- gold_id_and_context_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'context_match'`.
|
|
- gold_id_and_context_and_answer_match (documents only): A Boolean operation specifying that there should be `'gold_id_match' AND 'context_match' AND 'answer_match'`.
|
|
- context_and_answer_match (documents only): A Boolean operation specifying that there should be both `'context_match' AND 'answer_match'`.
|
|
- rank: A rank or 1-based-position in the result list.
|
|
- document_id: The ID of the document that has been retrieved or that contained the answer.
|
|
- gold_document_ids: The IDs of the documents to be retrieved.
|
|
- custom_document_id: The custom ID of the document (specified by `custom_document_id_field`) that has been retrieved or that contained the answer.
|
|
- gold_custom_document_ids: The custom documents IDs (specified by `custom_document_id_field`) to be retrieved.
|
|
- offsets_in_document (answers only): The position or offsets within the document where the answer was found.
|
|
- gold_offsets_in_documents (answers only): The position or offsets of the gold answer within the document.
|
|
- gold_answers_exact_match (answers only): exact_match values per gold_answer.
|
|
- gold_answers_f1 (answers only): f1 values per gold_answer.
|
|
- gold_answers_sas (answers only): sas values per gold answer.
|
|
- gold_documents_id_match: The document ID match per gold label (if `custom_document_id_field` has been specified, custom IDs are used).
|
|
- gold_contexts_similarity: Context similarity per gold label.
|
|
- gold_answers_match (documents only): Specifies whether the document contains an answer per gold label.
|
|
- type: Possible values: 'answer' or 'document'.
|
|
- node: The node name
|
|
- eval_mode: Specifies whether the evaluation was executed in integrated or isolated mode.
|
|
Check pipeline.eval()'s add_isolated_node_eval parameter for more information.
|
|
|
|
:param node_results: The evaluation Dataframes per pipeline node.
|
|
"""
|
|
self.node_results: Dict[str, DataFrame] = {} if node_results is None else node_results
|
|
|
|
def __getitem__(self, key: str):
|
|
return self.node_results.__getitem__(key)
|
|
|
|
def __delitem__(self, key: str):
|
|
self.node_results.__delitem__(key)
|
|
|
|
def __setitem__(self, key: str, value: DataFrame):
|
|
self.node_results.__setitem__(key, value)
|
|
|
|
def __contains__(self, key: str):
|
|
return self.node_results.keys().__contains__(key)
|
|
|
|
def __len__(self):
|
|
return self.node_results.__len__()
|
|
|
|
def append(self, key: str, value: DataFrame):
|
|
if value is not None and len(value) > 0:
|
|
if key in self.node_results:
|
|
self.node_results[key] = pd.concat([self.node_results[key], value]).reset_index(drop=True)
|
|
else:
|
|
self.node_results[key] = value
|
|
|
|
def calculate_metrics(
|
|
self,
|
|
simulated_top_k_reader: int = -1,
|
|
simulated_top_k_retriever: int = -1,
|
|
document_scope: Literal[
|
|
"document_id",
|
|
"context",
|
|
"document_id_and_context",
|
|
"document_id_or_context",
|
|
"answer",
|
|
"document_id_or_answer",
|
|
] = "document_id_or_answer",
|
|
eval_mode: Literal["integrated", "isolated"] = "integrated",
|
|
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
|
|
) -> Dict[str, Dict[str, float]]:
|
|
"""
|
|
Calculates proper metrics for each node.
|
|
|
|
For Nodes that return Documents, the default metrics are:
|
|
- mrr (`Mean Reciprocal Rank <https://en.wikipedia.org/wiki/Mean_reciprocal_rank>`_)
|
|
- map (`Mean Average Precision <https://en.wikipedia.org/wiki/Evaluation_measures_%28information_retrieval%29#Mean_average_precision>`_)
|
|
- ndcg (`Normalized Discounted Cumulative Gain <https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_)
|
|
- precision (Precision: How many of the returned documents were relevant?)
|
|
- recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?)
|
|
- recall_single_hit (Recall for Question Answering: How many of the queries returned at least one relevant document?)
|
|
|
|
For Nodes that return answers, the default metrics are:
|
|
- exact_match (How many of the queries returned the exact answer?)
|
|
- f1 (How well do the returned results overlap with any gold answer on a token basis?)
|
|
- sas, if a SAS model has been provided when calling `pipeline.eval()` (How semantically similar is the prediction to the gold answers?)
|
|
|
|
During the eval run, you can simulate lower top_k values for Reader and Retriever than the actual values.
|
|
For example, you can calculate `top_1_f1` for Reader nodes by setting `simulated_top_k_reader=1`.
|
|
|
|
If you applied `simulated_top_k_retriever` to a Reader node, you should treat the results with caution as they can differ from an actual eval run with a corresponding `top_k_retriever` heavily.
|
|
|
|
:param simulated_top_k_reader: Simulates the `top_k` parameter of the Reader.
|
|
:param simulated_top_k_retriever: Simulates the `top_k` parameter of the Retriever.
|
|
Note: There might be a discrepancy between simulated Reader metrics and an actual Pipeline run with Retriever `top_k`.
|
|
:param eval_mode: The input the Node was evaluated on.
|
|
Usually a Node gets evaluated on the prediction provided by its predecessor Nodes in the Pipeline (`value='integrated'`).
|
|
However, as the quality of the Node can heavily depend on the Node's input and thus the predecessor's quality,
|
|
you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your Node.
|
|
For example, when evaluating the Reader, use `value='isolated'` to simulate a perfect Retriever in an ExtractiveQAPipeline.
|
|
Possible values are: `integrated`, `isolated`.
|
|
The default value is `integrated`.
|
|
:param document_scope: A criterion for deciding whether documents are relevant or not.
|
|
You can select between:
|
|
- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
|
|
A typical use case is Document Retrieval.
|
|
- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
|
|
A typical use case is Document-Independent Passage Retrieval.
|
|
- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
|
|
A typical use case is Document-Specific Passage Retrieval.
|
|
- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
|
|
A typical use case is Document Retrieval having sparse context labels.
|
|
- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
|
|
A typical use case is Question Answering.
|
|
- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
|
|
This is intended to be a proper default value in order to support both main use cases:
|
|
- Document Retrieval
|
|
- Question Answering
|
|
The default value is 'document_id_or_answer'.
|
|
:param answer_scope: Specifies the scope in which a matching answer is considered correct.
|
|
You can select between:
|
|
- 'any' (default): Any matching answer is considered correct.
|
|
- 'context': The answer is only considered correct if its context matches as well.
|
|
Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
|
|
- 'document_id': The answer is only considered correct if its document ID matches as well.
|
|
You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
|
|
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
|
|
The default value is 'any'.
|
|
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
|
|
"""
|
|
return {
|
|
node: self._calculate_node_metrics(
|
|
df,
|
|
simulated_top_k_reader=simulated_top_k_reader,
|
|
simulated_top_k_retriever=simulated_top_k_retriever,
|
|
document_scope=document_scope,
|
|
answer_scope=answer_scope,
|
|
eval_mode=eval_mode,
|
|
)
|
|
for node, df in self.node_results.items()
|
|
}
|
|
|
|
def wrong_examples(
|
|
self,
|
|
node: str,
|
|
n: int = 3,
|
|
simulated_top_k_reader: int = -1,
|
|
simulated_top_k_retriever: int = -1,
|
|
document_scope: Literal[
|
|
"document_id",
|
|
"context",
|
|
"document_id_and_context",
|
|
"document_id_or_context",
|
|
"answer",
|
|
"document_id_or_answer",
|
|
] = "document_id_or_answer",
|
|
document_metric: str = "recall_single_hit",
|
|
answer_metric: str = "f1",
|
|
document_metric_threshold: float = 0.5,
|
|
answer_metric_threshold: float = 0.5,
|
|
eval_mode: Literal["integrated", "isolated"] = "integrated",
|
|
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
|
|
) -> List[Dict]:
|
|
"""
|
|
Returns the worst performing queries.
|
|
Worst performing queries are calculated based on the metric
|
|
that is either a document metric or an answer metric according to the node type.
|
|
|
|
Lower top_k values for reader and retriever than the actual values during the eval run can be simulated.
|
|
See calculate_metrics() for more information.
|
|
|
|
:param simulated_top_k_reader: simulates top_k param of reader
|
|
:param simulated_top_k_retriever: simulates top_k param of retriever.
|
|
remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k
|
|
:param document_metric: the document metric worst queries are calculated with.
|
|
values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision'
|
|
:param answer_metric: the answer metric worst queries are calculated with.
|
|
values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model.
|
|
:param document_metric_threshold: the threshold for the document metric (only samples below selected metric
|
|
threshold will be considered)
|
|
:param answer_metric_threshold: the threshold for the answer metric (only samples below selected metric
|
|
threshold will be considered)
|
|
:param eval_mode: the input on which the node was evaluated on.
|
|
Usually nodes get evaluated on the prediction provided by its predecessor nodes in the pipeline (value='integrated').
|
|
However, as the quality of the node itself can heavily depend on the node's input and thus the predecessor's quality,
|
|
you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your node.
|
|
For example when evaluating the reader use value='isolated' to simulate a perfect retriever in an ExtractiveQAPipeline.
|
|
Values can be 'integrated', 'isolated'.
|
|
Default value is 'integrated'.
|
|
:param document_scope: A criterion for deciding whether documents are relevant or not.
|
|
You can select between:
|
|
- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
|
|
A typical use case is Document Retrieval.
|
|
- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
|
|
A typical use case is Document-Independent Passage Retrieval.
|
|
- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
|
|
A typical use case is Document-Specific Passage Retrieval.
|
|
- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
|
|
A typical use case is Document Retrieval having sparse context labels.
|
|
- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
|
|
A typical use case is Question Answering.
|
|
- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match.
|
|
This is intended to be a proper default value in order to support both main use cases:
|
|
- Document Retrieval
|
|
- Question Answering
|
|
The default value is 'document_id_or_answer'.
|
|
:param answer_scope: Specifies the scope in which a matching answer is considered correct.
|
|
You can select between:
|
|
- 'any' (default): Any matching answer is considered correct.
|
|
- 'context': The answer is only considered correct if its context matches as well.
|
|
Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
|
|
- 'document_id': The answer is only considered correct if its document ID matches as well.
|
|
You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
|
|
- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well.
|
|
The default value is 'any'.
|
|
In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
|
|
"""
|
|
node_df = self.node_results[node]
|
|
node_df = self._filter_eval_mode(node_df, eval_mode)
|
|
|
|
answers = node_df[node_df["type"] == "answer"]
|
|
if len(answers) > 0:
|
|
metrics_df = self._build_answer_metrics_df(
|
|
answers,
|
|
simulated_top_k_reader=simulated_top_k_reader,
|
|
simulated_top_k_retriever=simulated_top_k_retriever,
|
|
answer_scope=answer_scope,
|
|
)
|
|
worst_df = metrics_df.sort_values(by=[answer_metric]).head(n)
|
|
wrong_examples = []
|
|
for multilabel_id, metrics in worst_df.iterrows():
|
|
query_answers = answers[answers["multilabel_id"] == multilabel_id]
|
|
if answer_metric not in metrics:
|
|
logger.warning(
|
|
"You specified an answer_metric=%s not available in calculated metrics=%s."
|
|
"Skipping collection of worst performing samples.",
|
|
answer_metric,
|
|
metrics.keys(),
|
|
)
|
|
break
|
|
if metrics[answer_metric] <= answer_metric_threshold:
|
|
query_dict = {
|
|
"multilabel_id": query_answers["multilabel_id"].iloc[0],
|
|
"query": query_answers["query"].iloc[0],
|
|
"filters": query_answers["filters"].iloc[0],
|
|
"metrics": metrics.to_dict(),
|
|
"answers": query_answers.drop(
|
|
["node", "query", "type", "gold_answers", "gold_offsets_in_documents", "gold_document_ids"],
|
|
axis=1,
|
|
).to_dict(orient="records"),
|
|
"gold_answers": query_answers["gold_answers"].iloc[0],
|
|
"gold_document_ids": query_answers["gold_document_ids"].iloc[0],
|
|
}
|
|
wrong_examples.append(query_dict)
|
|
return wrong_examples
|
|
|
|
documents = node_df[node_df["type"] == "document"]
|
|
if len(documents) > 0:
|
|
document_relevance_criterion = self._get_document_relevance_criterion(
|
|
document_scope=document_scope, answer_scope=answer_scope
|
|
)
|
|
metrics_df = self._build_document_metrics_df(
|
|
documents,
|
|
simulated_top_k_retriever=simulated_top_k_retriever,
|
|
document_relevance_criterion=document_relevance_criterion,
|
|
)
|
|
worst_df = metrics_df.sort_values(by=[document_metric]).head(n)
|
|
wrong_examples = []
|
|
for multilabel_id, metrics in worst_df.iterrows():
|
|
if document_metric not in metrics:
|
|
logger.warning(
|
|
"You specified a document_metric=%s not available in calculated metrics=%s."
|
|
"Skipping collection of worst performing samples.",
|
|
document_metric,
|
|
metrics.keys(),
|
|
)
|
|
break
|
|
if metrics[document_metric] <= document_metric_threshold:
|
|
query_documents = documents[documents["multilabel_id"] == multilabel_id]
|
|
query_dict = {
|
|
"multilabel_id": query_documents["multilabel_id"].iloc[0],
|
|
"query": query_documents["query"].iloc[0],
|
|
"filters": query_documents["filters"].iloc[0],
|
|
"metrics": metrics.to_dict(),
|
|
"documents": query_documents.drop(
|
|
["node", "query", "multilabel_id", "filters", "type", "gold_document_ids", "gold_contexts"],
|
|
axis=1,
|
|
).to_dict(orient="records"),
|
|
"gold_document_ids": query_documents["gold_document_ids"].iloc[0],
|
|
}
|
|
wrong_examples.append(query_dict)
|
|
return wrong_examples
|
|
|
|
return []
|
|
|
|
def _get_document_relevance_criterion(
|
|
self,
|
|
document_scope: Literal[
|
|
"document_id",
|
|
"context",
|
|
"document_id_and_context",
|
|
"document_id_or_context",
|
|
"answer",
|
|
"document_id_or_answer",
|
|
] = "document_id_or_answer",
|
|
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
|
|
) -> Literal[
|
|
"document_id",
|
|
"context",
|
|
"document_id_and_context",
|
|
"document_id_or_context",
|
|
"answer",
|
|
"context_and_answer",
|
|
"document_id_and_answer",
|
|
"document_id_and_context_and_answer",
|
|
"document_id_or_answer",
|
|
]:
|
|
"""
|
|
Combines document_scope and answer_scope to create the document_relevance_criterion.
|
|
"""
|
|
answer_scope_to_doc_relevance_crit = {
|
|
"context": "context_and_answer",
|
|
"document_id": "document_id_and_answer",
|
|
"document_id_and_context": "document_id_and_context_and_answer",
|
|
}
|
|
|
|
document_relevance_criterion: str = document_scope
|
|
if document_scope in ["answer", "document_id_or_answer"]:
|
|
document_relevance_criterion = answer_scope_to_doc_relevance_crit.get(answer_scope, document_scope)
|
|
elif answer_scope in answer_scope_to_doc_relevance_crit.keys():
|
|
logger.warning(
|
|
"You specified a non-answer document_scope together with a non-default answer_scope. "
|
|
"This may result in inconsistencies between answer and document metrics. "
|
|
"To enforce the same definition of correctness for both, document_scope must be one of 'answer', 'document_id_or_answer'."
|
|
)
|
|
|
|
return document_relevance_criterion # type: ignore[return-value]
|
|
|
|
def _calculate_node_metrics(
|
|
self,
|
|
df: DataFrame,
|
|
simulated_top_k_reader: int = -1,
|
|
simulated_top_k_retriever: int = -1,
|
|
document_scope: Literal[
|
|
"document_id",
|
|
"context",
|
|
"document_id_and_context",
|
|
"document_id_or_context",
|
|
"answer",
|
|
"document_id_or_answer",
|
|
] = "document_id_or_answer",
|
|
eval_mode: str = "integrated",
|
|
answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
|
|
) -> Dict[str, float]:
        df = self._filter_eval_mode(df, eval_mode)

        answer_metrics = self._calculate_answer_metrics(
            df,
            simulated_top_k_reader=simulated_top_k_reader,
            simulated_top_k_retriever=simulated_top_k_retriever,
            answer_scope=answer_scope,
        )

        document_relevance_criterion = self._get_document_relevance_criterion(
            document_scope=document_scope, answer_scope=answer_scope
        )
        document_metrics = self._calculate_document_metrics(
            df,
            simulated_top_k_retriever=simulated_top_k_retriever,
            document_relevance_criterion=document_relevance_criterion,
        )

        return {**answer_metrics, **document_metrics}

    def _filter_eval_mode(self, df: DataFrame, eval_mode: str) -> DataFrame:
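        # Keep only rows produced under the requested eval_mode (e.g. "integrated");
        # if the dataframe has no eval_mode column, it is returned unchanged.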
        if "eval_mode" in df.columns:
            df = df[df["eval_mode"] == eval_mode]
        else:
            logger.warning("eval dataframe has no eval_mode column. eval_mode param will be ignored.")
        return df

    def _calculate_answer_metrics(
        self,
        df: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> Dict[str, float]:
        answers = df[df["type"] == "answer"]
        if len(answers) == 0:
            return {}

        metrics_df = self._build_answer_metrics_df(
            answers,
            simulated_top_k_reader=simulated_top_k_reader,
            simulated_top_k_retriever=simulated_top_k_retriever,
            answer_scope=answer_scope,
        )
        num_examples_for_eval = len(answers["multilabel_id"].unique())
        result = {metric: metrics_df[metric].mean().tolist() for metric in metrics_df.columns}
        result["num_examples_for_eval"] = float(num_examples_for_eval)  # formatter requires float
        return result

    def _build_answer_metrics_df(
        self,
        answers: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
    ) -> DataFrame:
        """
        Builds a dataframe containing answer metrics (columns) per multilabel (index).
        Answer metrics are:
        - exact_match (Did the query exactly return any gold answer? -> 1.0 or 0.0)
        - f1 (How well does the best matching returned result overlap with any gold answer on token basis?)
        - sas, if a SAS model has been provided during pipeline.eval() (How semantically similar is the prediction to the gold answers?)
        """
        multilabel_ids = answers["multilabel_id"].unique()
        # simulate top k retriever
        if simulated_top_k_retriever != -1:
            documents = self._get_documents_df()

            top_k_documents = documents[documents["rank"] <= simulated_top_k_retriever]
            simulated_answers = []
            for multilabel_id in multilabel_ids:
                top_k_document_ids = top_k_documents[top_k_documents["multilabel_id"] == multilabel_id][
                    "document_id"
                ].unique()
                query_answers = answers[answers["multilabel_id"] == multilabel_id]
                # consider only the answers within simulated_top_k_retriever documents
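                # (an answer is only kept if every document it was extracted from is among the
                # simulated top_k documents for this query)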
                simulated_query_answers = query_answers[
                    query_answers["document_ids"].apply(
                        lambda document_ids, top_k_document_ids=top_k_document_ids: all(
                            document_id in top_k_document_ids for document_id in document_ids
                        )
                    )
                ]
                # simulate top k reader
                if simulated_top_k_reader != -1:
                    # consider only the simulated_top_k_reader answers within simulated_query_answers
                    simulated_query_answers = simulated_query_answers.nsmallest(simulated_top_k_reader, "rank")
                simulated_query_answers["rank"] = np.arange(1, len(simulated_query_answers) + 1)
                simulated_answers.append(simulated_query_answers)
            answers = pd.concat(simulated_answers)
        # simulate top k reader
        elif simulated_top_k_reader != -1:
            answers = answers[answers["rank"] <= simulated_top_k_reader]

        # build metrics df
        answer_metrics = ["exact_match", "f1", "sas"]
        df_records = []

        for multilabel_id in multilabel_ids:
            query_df = answers[answers["multilabel_id"] == multilabel_id]
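            # For non-default answer scopes, metrics are read from scoped columns:
            # e.g. answer_scope="context" uses "exact_match_context_scope" instead of "exact_match".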
            metric_to_scoped_col = {
                metric: f"{metric}_{answer_scope}_scope" if answer_scope != "any" else metric
                for metric in answer_metrics
                if metric in query_df.columns
            }
            query_metrics = {
                metric: query_df[col].max() if any(query_df) else 0.0 for metric, col in metric_to_scoped_col.items()
            }
            df_records.append(query_metrics)

        metrics_df = DataFrame.from_records(df_records, index=multilabel_ids)
        return metrics_df

    def _get_documents_df(self):
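        # Exactly one node is expected to produce document rows (typically the retriever);
        # anything else makes the retriever dataframe ambiguous and raises below.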
        document_dfs = [
            node_df for node_df in self.node_results.values() if len(node_df[node_df["type"] == "document"]) > 0
        ]
        if len(document_dfs) != 1:
            raise ValueError("cannot detect retriever dataframe")
        documents_df = document_dfs[0]
        documents_df = documents_df[documents_df["type"] == "document"]
        return documents_df

    def _calculate_document_metrics(
        self,
        df: DataFrame,
        simulated_top_k_retriever: int = -1,
        document_relevance_criterion: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "context_and_answer",
            "document_id_and_answer",
            "document_id_and_context_and_answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
    ) -> Dict[str, float]:
        documents = df[df["type"] == "document"]
        if len(documents) == 0:
            return {}

        metrics_df = self._build_document_metrics_df(
            documents,
            simulated_top_k_retriever=simulated_top_k_retriever,
            document_relevance_criterion=document_relevance_criterion,
        )

        return {metric: metrics_df[metric].mean().tolist() for metric in metrics_df.columns}

    def _build_document_metrics_df(
        self,
        documents: DataFrame,
        simulated_top_k_retriever: int = -1,
        document_relevance_criterion: Literal[
            "document_id",
            "context",
            "document_id_and_context",
            "document_id_or_context",
            "answer",
            "context_and_answer",
            "document_id_and_answer",
            "document_id_and_context_and_answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
    ) -> DataFrame:
        """
        Builds a dataframe containing document metrics (columns) per pair of query and gold document ids (index).
        Document metrics are:
        - mrr (Mean Reciprocal Rank: see https://en.wikipedia.org/wiki/Mean_reciprocal_rank)
        - map (Mean Average Precision: see https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision)
        - precision (Precision: How many of the returned documents were relevant?)
        - recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?)
        - recall_single_hit (Recall for Question Answering: Did the query return at least one relevant document? -> 1.0 or 0.0)
        - ndcg (Normalized Discounted Cumulative Gain: see https://en.wikipedia.org/wiki/Discounted_cumulative_gain)

        :param documents: document eval dataframe
        :param simulated_top_k_retriever: simulates top_k param of retriever.
        :param document_relevance_criterion: criterion for deciding whether documents are relevant or not.
            You can select between:
            - 'document_id': Document's id or custom id must match.
                Typical use case: Document Retrieval
            - 'context': Document's content must match.
                Typical use case: Document-independent Passage Retrieval
            - 'document_id_and_context': boolean operation `'document_id' AND 'context'`.
                Typical use case: Document-specific Passage Retrieval
            - 'document_id_or_context': boolean operation `'document_id' OR 'context'`.
                Typical use case: Document Retrieval having sparse context labels
            - 'answer': Document's content must include the answer.
                Typical use case: Question Answering
            - 'document_id_or_answer' (default): boolean operation `'document_id' OR 'answer'`.
                This is intended to be a proper default value in order to support both main use cases:
                - Document Retrieval
                - Question Answering
            - 'context_and_answer': boolean operation `'context' AND 'answer'`.
                Typical use case: Question Answering with context-specific answers (see answer_scope='context')
            - 'document_id_and_answer': boolean operation `'document_id' AND 'answer'`.
                Typical use case: Question Answering with document-specific answers (see answer_scope='document_id')
            - 'document_id_and_context_and_answer': boolean operation `'document_id' AND 'context' AND 'answer'`.
                Typical use case: Question Answering with document-and-context-specific answers (see answer_scope='document_id_and_context')
            Default value is 'document_id_or_answer'.
        """
        if simulated_top_k_retriever != -1:
            documents = documents[documents["rank"] <= simulated_top_k_retriever]

        # find out which label matched
        def find_matched_label_idxs(row) -> List[int]:  # pylint: disable=too-many-return-statements
            id_matches = [idx for idx, val in enumerate(row["gold_documents_id_match"]) if val == 1.0]
            context_matches = [
                idx for idx, val in enumerate(row["gold_contexts_similarity"]) if val > 65.0
            ]  # TODO: hardcoded threshold for now, will be param of calculate_metrics
            answer_matches = [idx for idx, val in enumerate(row["gold_answers_match"]) if val == 1.0]
            if document_relevance_criterion == "document_id":
                return id_matches
            elif document_relevance_criterion == "context":
                return context_matches
            elif document_relevance_criterion == "answer":
                return answer_matches
            elif document_relevance_criterion == "document_id_and_context":
                return list(set(id_matches) & set(context_matches))
            elif document_relevance_criterion == "document_id_or_context":
                return list(set(id_matches) | set(context_matches))
            elif document_relevance_criterion == "document_id_and_answer":
                return list(set(id_matches) & set(answer_matches))
            elif document_relevance_criterion == "document_id_or_answer":
                return list(set(id_matches) | set(answer_matches))
            elif document_relevance_criterion == "context_and_answer":
                return list(set(context_matches) & set(answer_matches))
            elif document_relevance_criterion == "document_id_and_context_and_answer":
                return list(set(id_matches) & set(context_matches) & set(answer_matches))
            else:
                raise ValueError(f"document_relevance_criterion '{document_relevance_criterion}' not supported.")

        documents["matched_label_idxs"] = documents.apply(find_matched_label_idxs, axis=1)

        metrics = []

        for multilabel_id in documents["multilabel_id"].unique():
            query_df = documents[documents["multilabel_id"] == multilabel_id]

            # Note: Metrics are always calculated on document_ids.
            # For some document relevance criteria (e.g. context), the gold_document_ids are not enough or not useful at all.
            # So, we have to adjust the relevant ids according to the document_relevance_criterion.
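            # e.g. document_relevance_criterion="document_id_or_answer" reads the precomputed
            # column "gold_id_or_answer_match".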
            relevance_criterion_col = f"{document_relevance_criterion.replace('document_id', 'gold_id')}_match"
            relevant_rows = query_df[query_df[relevance_criterion_col] == 1]

            # all labels without no_answers
            # we need to match all (except for single hit recall)
            gold_document_ids = (
                list(query_df["gold_custom_document_ids"].iloc[0])
                if "gold_custom_document_ids" in query_df
                else list(query_df["gold_document_ids"].iloc[0])
            )
            # remove no_answer label
            gold_document_ids = [id for id in gold_document_ids if id != "00"]

            num_labels = len(gold_document_ids)
            num_matched_labels = len({idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs})
            num_missing_labels = num_labels - num_matched_labels

            relevance_criterion_ids = list(relevant_rows["document_id"].values)
            num_relevants = len(set(relevance_criterion_ids)) + num_missing_labels

            num_retrieved = len(query_df["document_id"])
            num_retrieved_relevants = len(relevant_rows)
            rank_retrieved_relevants = relevant_rows["rank"].values

            if num_labels == 0:
                # For no_answer queries, we set all metrics to 1.0, to indicate that the retriever cannot improve the pipeline.
                # This behavior is different from pytrec_eval, which sets the metrics to 0.0 if there is no relevant document in the evalset.
                rr = 1.0
                avg_precision = 1.0
                recall_multi_hit = 1.0
                recall_single_hit = 1.0
                precision = 1.0
                ndcg = 1.0
            elif num_retrieved_relevants == 0:
                # Set all metrics to 0.0 if no relevant document has been retrieved to avoid undefined metrics.
                rr = 0.0
                avg_precision = 0.0
                recall_multi_hit = 0.0
                recall_single_hit = 0.0
                precision = 0.0
                ndcg = 0.0
            else:
                # The previous checks ensure:
                # - `num_labels` > 0
                # - `num_retrieved_relevants` > 0
                # - `num_relevants` > 0 (`num_relevants` is always >= `num_labels`)
                # - `num_retrieved` > 0 (`num_retrieved` is always >= `num_retrieved_relevants`)
                # - `len(rank_retrieved_relevants)` > 0 (`len(rank_retrieved_relevants)` is always == `num_retrieved_relevants`)
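                # Rank-based metrics for this query:
                # - average precision sums precision@rank over the ranks of the retrieved relevant
                #   documents and divides by the total number of relevant documents
                # - rr is the reciprocal rank of the first retrieved relevant document
                # - ndcg uses 1 / log2(rank + 1) gains; the ideal DCG assumes all relevant documents
                #   are ranked at positions 1..num_relevants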
                avp_retrieved_relevants = [
                    len(relevant_rows[relevant_rows["rank"] <= rank]) / rank for rank in rank_retrieved_relevants
                ]
                avg_precision = np.sum(avp_retrieved_relevants) / num_relevants
                recall_multi_hit = num_matched_labels / num_labels
                recall_single_hit = 1.0
                precision = num_retrieved_relevants / num_retrieved
                rr = 1.0 / rank_retrieved_relevants.min()
                dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
                idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)])
                ndcg = dcg / idcg

            metrics.append(
                {
                    "recall_multi_hit": recall_multi_hit,
                    "recall_single_hit": recall_single_hit,
                    "precision": precision,
                    "map": avg_precision,
                    "mrr": rr,
                    "ndcg": ndcg,
                }
            )

        metrics_df = DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
        return metrics_df

    def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
        """
        Saves the evaluation result.
        The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.

        :param out_dir: Path to the target folder the csvs will be saved to.
        :param to_csv_kwargs: kwargs to be passed to DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
            This method uses different default values than DataFrame.to_csv() for the following parameters:
            index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
        """
        out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
        logger.info("Saving evaluation results to %s", out_dir)
        if not out_dir.exists():
            out_dir.mkdir(parents=True)
        for node_name, df in self.node_results.items():
            target_path = out_dir / f"{node_name}.csv"
            default_to_csv_kwargs = {
                "index": False,
                "quoting": csv.QUOTE_NONNUMERIC,  # avoids problems with \r chars in texts by enclosing all string values in quotes
            }
            to_csv_kwargs = {**default_to_csv_kwargs, **to_csv_kwargs}
            df.to_csv(target_path, **to_csv_kwargs)

    @classmethod
    def load(cls, load_dir: Union[str, Path], **read_csv_kwargs):
        """
        Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.

        :param load_dir: The directory containing the csv files.
        :param read_csv_kwargs: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
            This method uses different default values than pd.read_csv() for the following parameters:
            header=0, converters=CONVERTERS
            where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
        """
        load_dir = load_dir if isinstance(load_dir, Path) else Path(load_dir)
        csv_files = [file for file in load_dir.iterdir() if file.is_file() and file.suffix == ".csv"]
        cols_to_convert = [
            "filters",
            "gold_document_ids",
            "gold_custom_document_ids",
            "gold_contexts",
            "gold_answers",
            "gold_documents_id_match",
            "gold_offsets_in_documents",
            "gold_offsets_in_contexts",
            "gold_answers_exact_match",
            "gold_answers_f1",
            "gold_answers_sas",
            "gold_answers_match",
            "gold_contexts_similarity",
            "offsets_in_document",
            "offsets_in_context",
            "document_ids",
            "custom_document_ids",
            "gold_document_contents",
        ]
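
        # These columns were written by DataFrame.to_csv() as stringified Python literals (lists/dicts);
        # parse them back with ast.literal_eval and treat empty cells as None.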
        def safe_literal_eval(x: str) -> Any:
            if x == "":
                return None
            return ast.literal_eval(x)

        converters = dict.fromkeys(cols_to_convert, safe_literal_eval)
        default_read_csv_kwargs = {"converters": converters, "header": 0}
        read_csv_kwargs = {**default_read_csv_kwargs, **read_csv_kwargs}
        node_results = {file.stem: pd.read_csv(file, **read_csv_kwargs) for file in csv_files}
        # backward compatibility mappings
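        # e.g. older result files stored a single "document_id"/"custom_document_id" per answer row and
        # used "gold_document_contents"/"content" as column names; newer files use list-valued
        # "document_ids"/"custom_document_ids" and the "gold_contexts"/"context" columns.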
        for df in node_results.values():
            df.replace(to_replace=np.nan, value=None, inplace=True)
            df.rename(columns={"gold_document_contents": "gold_contexts", "content": "context"}, inplace=True)
            # convert single document_id to list
            if "answer" in df.columns and "document_id" in df.columns and "document_ids" not in df.columns:
                df["document_ids"] = df["document_id"].apply(lambda x: [x] if x not in [None, "None"] else [])
                df.drop(columns=["document_id"], inplace=True)
            if (
                "answer" in df.columns
                and "custom_document_id" in df.columns
                and "custom_document_ids" not in df.columns
            ):
                df["custom_document_ids"] = df["custom_document_id"].apply(
                    lambda x: [x] if x not in [None, "None"] else []
                )
                df.drop(columns=["custom_document_id"], inplace=True)
        result = cls(node_results)
        return result