mirror of https://github.com/deepset-ai/haystack.git
synced 2025-08-23 07:58:36 +00:00

Resolving issue 2853: no answer logic in FARMReader (#2856)

* Update FARMReader.eval_on_file to be consistent with FARMReader.eval
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

parent fc8ecbf20c
commit 44e2b1beed
@@ -88,12 +88,14 @@ want to debug the Language Model, you might need to disable multiprocessing!
 Can be helpful to disable in production deployments to keep the logs clean.
 - `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered.
 The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
-- `use_confidence_scores`: Sets the type of score that is returned with every predicted answer.
+- `use_confidence_scores`: Determines the type of score that is used for ranking a predicted answer.
 `True` => a scaled confidence / relevance score between [0, 1].
 This score can also be further calibrated on your dataset via self.eval()
-(see https://haystack.deepset.ai/components/reader#confidence-scores) .
+(see https://haystack.deepset.ai/components/reader#confidence-scores).
 `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit
 from the model for the predicted span.
+Using confidence scores can change the ranking of no_answer compared to using the
+unscaled raw scores.
 - `confidence_threshold`: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default.
 - `proxies`: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'}
 - `local_files_only`: Whether to force checking for local files only (and forbid downloads)
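For reference, a minimal sketch of how the parameters documented above are passed to the reader; the values and the model checkpoint are purely illustrative:

```python
from haystack.nodes import FARMReader

# Illustrative values only; "deepset/roberta-base-squad2" is the usual example checkpoint.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_confidence_scores=True,   # rank answers (incl. no_answer) by scaled [0, 1] confidence
    confidence_threshold=0.3,     # drop predictions below this confidence; disabled by default
    duplicate_filtering=0,        # remove exact positional duplicates only; -1 turns filtering off
)
```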
@@ -427,7 +429,7 @@ Dict containing query and answers
 #### FARMReader.eval\_on\_file
 
 ```python
-def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None)
+def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False)
 ```
 
 Performs evaluation on a SQuAD-formatted file.
@@ -444,13 +446,14 @@ Returns a dict containing the following metrics:
 - `device`: The device on which the tensors should be processed.
 Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
 or use the Reader's device by default.
+- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores.
 
 <a id="farm.FARMReader.eval"></a>
 
 #### FARMReader.eval
 
 ```python
-def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False, use_no_answer_legacy_confidence=False)
+def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False)
 ```
 
 Performs evaluation on evaluation documents in the DocumentStore.
@@ -469,9 +472,7 @@ or use the Reader's device by default.
 - `label_index`: Index/Table name where labeled questions are stored
 - `doc_index`: Index/Table name where documents that are used for evaluation are stored
 - `label_origin`: Field name where the gold labels are stored
-- `calibrate_conf_scores`: Whether to calibrate the temperature for temperature scaling of the confidence scores
-- `use_no_answer_legacy_confidence`: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
-Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
+- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores.
 
 <a id="farm.FARMReader.calibrate_confidence_scores"></a>
 
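With this change, `eval_on_file` exposes the same `calibrate_conf_scores` flag as `eval`. A hedged usage sketch; paths, index names, and the in-memory store are placeholders, and labels still have to be written to the store before calling `eval`:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_confidence_scores=True)

# Evaluate on a SQuAD-formatted file and calibrate the confidence-scaling temperature on it.
file_metrics = reader.eval_on_file(
    data_dir="data/squad20",        # placeholder directory
    test_filename="dev-v2.0.json",  # placeholder SQuAD-format file
    calibrate_conf_scores=True,
)

# Evaluate against labels in a DocumentStore; the signature now mirrors eval_on_file
# (no separate use_no_answer_legacy_confidence argument anymore).
document_store = InMemoryDocumentStore()
# ... write evaluation documents and labels into doc_index / label_index first ...
store_metrics = reader.eval(
    document_store=document_store,
    label_index="label",
    doc_index="eval_document",
    calibrate_conf_scores=True,
)
print(file_metrics, store_metrics)
```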
@@ -38,18 +38,20 @@ class Evaluator:
         model: AdaptiveModel,
         return_preds_and_labels: bool = False,
         calibrate_conf_scores: bool = False,
-        use_confidence_scores_for_ranking=True,
-        use_no_answer_legacy_confidence=False,
+        use_confidence_scores_for_ranking: bool = True,
+        use_no_answer_legacy_confidence: bool = False,
     ) -> List[Dict]:
         """
         Performs evaluation on a given model.
 
         :param model: The model on which to perform evaluation
         :param return_preds_and_labels: Whether to add preds and labels in the returned dicts of the
-        :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
+        :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores.
         :param use_confidence_scores_for_ranking: Whether to sort answers by confidence score (normalized between 0 and 1)(default) or by standard score (unbounded).
-        :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
-        Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
+        :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference
+                                                between the best overall answer confidence and the no_answer gap confidence.
+                                                Otherwise, we use the no_answer score normalized to a range of [0,1] by
+                                                an expit function (default).
         :return: all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
                  and reports generated during evaluation.
         """
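The docstring above describes the default (non-legacy) no_answer confidence as the raw no_answer score squashed into [0, 1] by an expit function. A minimal illustration of that mapping, assuming a hypothetical scaling divisor (Haystack applies its own, possibly calibrated, temperature):

```python
from scipy.special import expit  # expit(x) = 1 / (1 + exp(-x))

def no_answer_confidence(no_answer_score: float, scale: float = 8.0) -> float:
    """Map an unbounded no_answer score to [0, 1] via logistic squashing.

    `scale` is an illustrative divisor, not the value used in the Haystack source.
    """
    return float(expit(no_answer_score / scale))

print(no_answer_confidence(-4.2))  # low confidence that the question is unanswerable
print(no_answer_confidence(6.0))   # high confidence in no_answer
```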
@@ -108,12 +108,14 @@ class FARMReader(BaseReader):
         Can be helpful to disable in production deployments to keep the logs clean.
         :param duplicate_filtering: Answers are filtered based on their position. Both start and end position of the answers are considered.
                                     The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
-        :param use_confidence_scores: Sets the type of score that is returned with every predicted answer.
+        :param use_confidence_scores: Determines the type of score that is used for ranking a predicted answer.
                                       `True` => a scaled confidence / relevance score between [0, 1].
                                       This score can also be further calibrated on your dataset via self.eval()
-                                      (see https://haystack.deepset.ai/components/reader#confidence-scores) .
+                                      (see https://haystack.deepset.ai/components/reader#confidence-scores).
                                       `False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit
                                       from the model for the predicted span.
+                                      Using confidence scores can change the ranking of no_answer compared to using the
+                                      unscaled raw scores.
         :param confidence_threshold: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default.
         :param proxies: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'}
         :param local_files_only: Whether to force checking for local files only (and forbid downloads)
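The `duplicate_filtering` behaviour documented above (drop answers whose start and end positions both lie within the given distance of a better answer) can be sketched roughly as below; this is an illustration of the idea, not the actual Haystack implementation:

```python
from typing import List, Tuple

def filter_positional_duplicates(
    spans: List[Tuple[int, int]], duplicate_filtering: int
) -> List[Tuple[int, int]]:
    """Keep a span only if no already-kept span has both start and end within
    `duplicate_filtering` positions; -1 disables filtering, 0 removes exact duplicates.
    Spans are assumed to be ordered best-first."""
    if duplicate_filtering == -1:
        return spans
    kept: List[Tuple[int, int]] = []
    for start, end in spans:
        is_duplicate = any(
            abs(start - s) <= duplicate_filtering and abs(end - e) <= duplicate_filtering
            for s, e in kept
        )
        if not is_duplicate:
            kept.append((start, end))
    return kept

# Exact duplicates and near-duplicates within 2 positions collapse to one span:
print(filter_positional_duplicates([(10, 20), (10, 20), (11, 21), (40, 55)], duplicate_filtering=2))
# -> [(10, 20), (40, 55)]
```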
@@ -873,7 +875,11 @@ class FARMReader(BaseReader):
         return result
 
     def eval_on_file(
-        self, data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None
+        self,
+        data_dir: Union[Path, str],
+        test_filename: str,
+        device: Optional[Union[str, torch.device]] = None,
+        calibrate_conf_scores: bool = False,
     ):
         """
         Performs evaluation on a SQuAD-formatted file.
@@ -887,6 +893,7 @@ class FARMReader(BaseReader):
         :param device: The device on which the tensors should be processed.
                        Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda")
                        or use the Reader's device by default.
+        :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores.
         """
         logger.warning(
             "FARMReader.eval_on_file() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
@@ -918,7 +925,11 @@ class FARMReader(BaseReader):
 
         evaluator = Evaluator(data_loader=data_loader, tasks=eval_processor.tasks, device=device)
 
-        eval_results = evaluator.eval(self.inferencer.model)
+        eval_results = evaluator.eval(
+            self.inferencer.model,
+            calibrate_conf_scores=calibrate_conf_scores,
+            use_confidence_scores_for_ranking=self.use_confidence_scores,
+        )
         results = {
             "EM": eval_results[0]["EM"] * 100,
             "f1": eval_results[0]["f1"] * 100,
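`eval_on_file` now forwards `calibrate_conf_scores` to `Evaluator.eval`, so an evaluation run can also fit the temperature used to scale confidences. For background, a generic temperature-scaling sketch using a crude grid search; this stands in for the idea only and is not Haystack's internal calibration code:

```python
import numpy as np
from scipy.special import expit

def calibrated_confidence(logit: float, temperature: float) -> float:
    """Confidence after temperature scaling: squash (logit / T) into [0, 1]."""
    return float(expit(logit / temperature))

def fit_temperature(logits: np.ndarray, correct: np.ndarray,
                    candidates=np.linspace(0.5, 10, 96)) -> float:
    """Pick the temperature that minimizes negative log-likelihood on held-out
    (logit, is_correct) pairs - a grid search standing in for the gradient-based
    fit a real implementation would use."""
    def nll(t: float) -> float:
        p = np.clip(expit(logits / t), 1e-9, 1 - 1e-9)
        return float(-(correct * np.log(p) + (1 - correct) * np.log(1 - p)).mean())
    return float(min(candidates, key=nll))

# Toy data: over-confident raw logits paired with whether the prediction was right.
logits = np.array([9.0, 7.5, 6.0, -2.0, 8.0, -5.0])
correct = np.array([1, 0, 1, 0, 1, 0])
T = fit_temperature(logits, correct)
print(T, calibrated_confidence(7.5, T))
```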
@@ -945,7 +956,6 @@ class FARMReader(BaseReader):
         doc_index: str = "eval_document",
         label_origin: str = "gold-label",
         calibrate_conf_scores: bool = False,
-        use_no_answer_legacy_confidence=False,
     ):
         """
         Performs evaluation on evaluation documents in the DocumentStore.
@@ -961,9 +971,7 @@ class FARMReader(BaseReader):
         :param label_index: Index/Table name where labeled questions are stored
         :param doc_index: Index/Table name where documents that are used for evaluation are stored
         :param label_origin: Field name where the gold labels are stored
-        :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
-        :param use_no_answer_legacy_confidence: Whether to use the legacy confidence definition for no_answer: difference between the best overall answer confidence and the no_answer gap confidence.
-        Otherwise we use the no_answer score normalized to a range of [0,1] by an expit function (default).
+        :param calibrate_conf_scores: Whether to calibrate the temperature for scaling of the confidence scores.
         """
         logger.warning(
             "FARMReader.eval() uses a slightly different evaluation approach than `Pipeline.eval()`:\n"
@@ -1082,7 +1090,6 @@ class FARMReader(BaseReader):
             self.inferencer.model,
             calibrate_conf_scores=calibrate_conf_scores,
             use_confidence_scores_for_ranking=self.use_confidence_scores,
-            use_no_answer_legacy_confidence=use_no_answer_legacy_confidence,
         )
         toc = perf_counter()
         reader_time = toc - tic