From a084a982c4e5a3e7b9a96ccd6bbae1a703721019 Mon Sep 17 00:00:00 2001
From: tstadel <60758086+tstadel@users.noreply.github.com>
Date: Thu, 23 Jun 2022 18:40:17 +0200
Subject: [PATCH] Show warning in reader.eval() about differences compared to
 pipeline.eval() (#2477)

* deprecate reader.eval

* Update Documentation & Code Style

* update warning to describe differences between pipeline.eval()

* remove empty lines

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 haystack/nodes/reader/farm.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py
index 260528d0a..b4355672d 100644
--- a/haystack/nodes/reader/farm.py
+++ b/haystack/nodes/reader/farm.py
@@ -818,6 +818,14 @@ class FARMReader(BaseReader):
                        Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
                        or use the Reader's device by default.
         """
+        logger.warning(
+            "FARMReader.eval_on_file() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
+            "- Instead of giving you full control over which labels to use, this method always returns three types of metrics: combined (no suffix), text_answer ('_text_answer' suffix) and no_answer ('_no_answer' suffix) metrics.\n"
+            "- Instead of comparing predictions with labels on a string level, this method compares them on a token-ID level. This makes it unable to apply any string normalization (e.g. normalizing whitespace) beforehand.\n"
+            "Hence, results might differ slightly from those of `Pipeline.eval()`.\n"
+            "If you are just starting to evaluate your model, consider using `Pipeline.eval()` instead."
+        )
+
         if device is None:
             device = self.devices[0]
         else:
@@ -887,6 +895,14 @@ class FARMReader(BaseReader):
         :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
                                                 Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
         """
+        logger.warning(
+            "FARMReader.eval() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
+            "- Instead of giving you full control over which labels to use, this method always returns three types of metrics: combined (no suffix), text_answer ('_text_answer' suffix) and no_answer ('_no_answer' suffix) metrics.\n"
+            "- Instead of comparing predictions with labels on a string level, this method compares them on a token-ID level. This makes it unable to apply any string normalization (e.g. normalizing whitespace) beforehand.\n"
+            "Hence, results might differ slightly from those of `Pipeline.eval()`.\n"
+            "If you are just starting to evaluate your model, consider using `Pipeline.eval()` instead."
+        )
+
         if device is None:
             device = self.devices[0]
         else:
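
For context, a minimal sketch of evaluating with `Pipeline.eval()` as the new warnings recommend. This is not part of the patch; `document_store`, the retriever choice, the model name, and the `top_k` parameter are illustrative assumptions based on the Haystack v1 API.

    from haystack.nodes import BM25Retriever, FARMReader
    from haystack.pipelines import ExtractiveQAPipeline

    # document_store is assumed to be an initialized DocumentStore that already
    # holds your documents and evaluation labels.
    retriever = BM25Retriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

    # Aggregate labels into MultiLabel objects; unlike FARMReader.eval(),
    # Pipeline.eval() gives you full control over which labels to use.
    eval_labels = document_store.get_all_labels_aggregated(
        drop_negative_labels=True, drop_no_answers=False
    )

    eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
    metrics = eval_result.calculate_metrics()
    print(metrics["Reader"]["exact_match"], metrics["Reader"]["f1"])

Because `Pipeline.eval()` compares predictions with labels on a string level (with normalization) rather than on a token-ID level, its metrics may differ slightly from those returned by `FARMReader.eval()` or `FARMReader.eval_on_file()`.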