Show warning in reader.eval() about differences compared to pipeline.eval() (#2477)

* deprecate reader.eval

* Update Documentation & Code Style

* update warning to describe differences compared to pipeline.eval()

* remove empty lines

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Author: tstadel (committed by GitHub)
Date: 2022-06-23 18:40:17 +02:00
parent e69492a28f
commit a084a982c4

@@ -818,6 +818,14 @@ class FARMReader(BaseReader):
Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
or use the Reader's device by default.
"""
logger.warning(
    "FARMReader.eval_on_file() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
    "- instead of giving you full control over which labels to use, this method always returns three types of metrics: combined (no suffix), text_answer ('_text_answer' suffix) and no_answer ('_no_answer' suffix) metrics.\n"
    "- instead of comparing predictions with labels on a string level, this method compares them on a token-ID level. This makes it unable to do any string normalization (e.g. normalize whitespaces) beforehand.\n"
    "Hence, results might differ slightly from those of `Pipeline.eval()`.\n"
    "If you are just starting to evaluate your model, consider using `Pipeline.eval()` instead."
)
if device is None:
    device = self.devices[0]
else:
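
For context, a minimal sketch of the `eval_on_file()` call this hunk touches, so the new warning can be seen in situ. The model name and SQuAD-format file paths are illustrative assumptions, not taken from the diff:

from haystack.nodes import FARMReader

# Illustrative model and data paths (assumptions, not from this commit).
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# As of this change, eval_on_file() logs the warning above before evaluating.
# Per that warning, the returned metrics come in three variants: combined
# (no suffix), '_text_answer'-suffixed, and '_no_answer'-suffixed.
metrics = reader.eval_on_file(data_dir="data/squad", test_filename="dev-v2.0.json")
print(metrics)
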
@@ -887,6 +895,14 @@ class FARMReader(BaseReader):
:param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
"""
logger.warning(
    "FARMReader.eval() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
    "- instead of giving you full control over which labels to use, this method always returns three types of metrics: combined (no suffix), text_answer ('_text_answer' suffix) and no_answer ('_no_answer' suffix) metrics.\n"
    "- instead of comparing predictions with labels on a string level, this method compares them on a token-ID level. This makes it unable to do any string normalization (e.g. normalize whitespaces) beforehand.\n"
    "Hence, results might differ slightly from those of `Pipeline.eval()`.\n"
    "If you are just starting to evaluate your model, consider using `Pipeline.eval()` instead."
)
if device is None:
    device = self.devices[0]
else:
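
For comparison, a minimal sketch of the `Pipeline.eval()` route that both warnings recommend. The document store, retriever, and label set below are assumptions for illustration; as the warning describes, `Pipeline.eval()` compares answers on string level and leaves the choice of labels to you:

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, TfidfRetriever
from haystack.pipelines import ExtractiveQAPipeline

# Assumes documents and labels have already been written to the store.
document_store = InMemoryDocumentStore()
retriever = TfidfRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# You pick the labels yourself; metrics are computed per pipeline node.
eval_labels = document_store.get_all_labels_aggregated()
eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
print(eval_result.calculate_metrics()["Reader"])
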