diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 54cdc9a6e..ad96fc821 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -297,7 +297,7 @@ Runs the Pipeline, one node at a time. - `labels`: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline. - `documents`: A list of Document objects to be processed by the Pipeline Nodes. - `meta`: Files' metadata. Used in indexing pipelines in combination with `file_paths`. -- `params`: Dictionary of parameters to be dispatched to the nodes. +- `params`: A dictionary of parameters that you want to pass to the nodes. To pass a parameter to all Nodes, use: `{"top_k": 10}`. To pass a parameter to targeted Nodes, run: `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}` @@ -332,7 +332,7 @@ Here's what this method returns for Reader-only pipelines: - `labels`: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline. - `documents`: A list of Document objects or a list of lists of Document objects to be processed by the Pipeline Nodes. - `meta`: Files' metadata. Used in indexing pipelines in combination with `file_paths`. -- `params`: Dictionary of parameters to be dispatched to the nodes. +- `params`: A dictionary of parameters that you want to pass to the nodes. To pass a parameter to all Nodes, use: `{"top_k": 10}`. To pass a parameter to targeted Nodes, run: `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}` @@ -375,7 +375,7 @@ Each metric is represented by a dictionary containing the scores for each top_k ```python @classmethod -def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, document_scope: Literal[ +def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, use_batch_mode: bool = False, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, document_scope: Literal[ "document_id", "context", "document_id_and_context", @@ -442,11 +442,11 @@ Note, that artifact logging (e.g. Pipeline YAML or evaluation result CSVs) are c - `query_params`: The params to use during querying (see pipeline.run's params). 
 - `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
-Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture with a score close to 1.
 More info in the paper: https://arxiv.org/abs/2108.06130
-Models:
+Here are some guidelines regarding the models that you can use:
 - You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
-Not all cross encoders can be used because of different return types.
+The return type of the encoder needs to be a single prediction score (as opposed to multiple scores).
 If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class
 - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
 - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
@@ -454,8 +454,9 @@ If you use custom cross encoders please make sure they work with sentence_transf
 - `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
 - `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
 Falls back to CPU if no GPU is available.
+- `use_batch_mode`: Whether to evaluate the pipeline in batches (True) or one query at a time (False, the default).
 - `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
-This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
+The isolated mode shows you how each node is performing on its own and helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
 If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.
 If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance.
 The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node.
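For orientation, here is a minimal usage sketch of `execute_eval_run` with the new `use_batch_mode` flag. The YAML file names, the label and corpus variables, and the MLflow URI are placeholders for your own setup; only the parameter names are taken from the signature above.

```python
from haystack import Pipeline

# Placeholders: both YAML files, `my_labels` (List[MultiLabel]), and `my_corpus_files`
# (List[str]) are assumed to exist in your project.
index_pipeline = Pipeline.load_from_yaml("indexing_pipeline.yaml")
query_pipeline = Pipeline.load_from_yaml("query_pipeline.yaml")

eval_result = Pipeline.execute_eval_run(
    index_pipeline=index_pipeline,
    query_pipeline=query_pipeline,
    evaluation_set_labels=my_labels,
    corpus_file_paths=my_corpus_files,
    experiment_name="my-qa-experiment",
    experiment_run_name="batched-eval-run",
    experiment_tracking_tool="mlflow",
    experiment_tracking_uri="http://localhost:5000",
    query_params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    sas_model_name_or_path="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    use_batch_mode=True,  # evaluate with eval_batch() instead of eval()
    add_isolated_node_eval=True,
)
```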
@@ -509,7 +510,7 @@ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scori ```python @send_event -def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult +def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult ``` Evaluates the pipeline by running the pipeline once per query in debug mode @@ -527,17 +528,17 @@ Some of these scopes require additional information that already needs to be spe - `labels`: The labels to evaluate on - `documents`: List of List of Document that the first node in the pipeline should get as input per multilabel. Can be used to evaluate a pipeline that consists of a reader without a retriever. -- `params`: Dictionary of parameters to be dispatched to the nodes. +- `params`: A dictionary of parameters that you want to pass to the nodes. If you want to pass a param to all nodes, you can just use: {"top_k":10} If you want to pass it to targeted nodes, you can do: {"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}} - `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. -Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. +Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture with being close to 1. More info in the paper: https://arxiv.org/abs/2108.06130 -Models: +Here are some guidelines regarding the models that you can use: - You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. -Not all cross encoders can be used because of different return types. +The return type of the encoder needs to be a single prediction score (as opposed to multiple scores). If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" @@ -546,7 +547,7 @@ If you use custom cross encoders please make sure they work with sentence_transf - `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. Falls back to CPU if no GPU is available. 
 - `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
-This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
+The isolated mode shows you how each node is performing on its own and helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
 If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance.
 If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance.
 The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node.
@@ -569,6 +570,72 @@ If this parameter is set to `True`, then the token generated when running
 Additional information can be found here
 https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+
+
+#### Pipeline.eval\_batch
+
+```python
+@send_event
+def eval_batch(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
+```
+
+Evaluates the pipeline by running it in batches in debug mode
+
+and putting together all data that is needed for evaluation, for example, calculating metrics.
+
+To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_or_path`.
+
+You can control the scope within which an answer or a document is considered correct afterwards (see `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`).
+For some of these scopes, you need to add the following information during `eval()`:
+- `custom_document_id_field` parameter to select a custom document ID from document's metadata for ID matching (only affects 'document_id' scopes).
+- `context_matching_...` parameter to fine-tune the fuzzy matching mechanism that determines whether text contexts match each other (only affects 'context' scopes, default values should work most of the time).
+
+**Arguments**:
+
+- `labels`: The labels to evaluate on.
+- `documents`: List of List of Document that the first node in the pipeline gets as input per multilabel. You can use it to evaluate a pipeline that consists of a reader without a retriever.
+- `params`: Dictionary of parameters to be dispatched to the nodes.
+To pass a parameter to all nodes, just use: {"top_k":10}.
+To pass a parameter to targeted nodes, you can type:
+{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}
+- `sas_model_name_or_path`: Name or path of the SAS model. If you specify the path, the model is used to calculate the similarity between predictions and labels and to generate the SAS metric.
+The SAS metric correlates better with the human judgment of correct answers as it does not rely on string overlaps.
+Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+If you want to learn more, have a look at the [Semantic Answer Similarity for Evaluating Question Answering Models](https://arxiv.org/abs/2108.06130) paper.
+Models:
+- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
+The return type of the encoder needs to be a single prediction score (as opposed to multiple scores).
+When using custom cross encoders, ensure they work with the `sentence_transformers.CrossEncoder` class.
+- This is a good default model for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".
+- A large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large".
+- A large model for German only: "deepset/gbert-large-sts".
+- `sas_batch_size`: The number of prediction label pairs you want to encode at once by CrossEncoder or SentenceTransformer while calculating SAS.
+- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+It uses CPU if no GPU is available.
+- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode.
+The isolated mode shows you how each node is performing on its own and helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node.
+If a node performs much better in the isolated evaluation than in the integrated evaluation, it means you should optimize the preceding node to improve the pipeline's performance.
+If a node's performance is similar in both modes, it means you should optimize this node itself to improve the pipeline's performance.
+The isolated evaluation calculates the upper bound of each node's evaluation metrics, assuming it received perfect inputs from the previous node.
+To achieve this, the isolated evaluation uses labels as input to the node instead of the output of the previous node in the pipeline.
+The generated dataframes in the EvaluationResult then contain additional rows, which you can tell apart from the integrated evaluation results based on the
+values "integrated" or "isolated" in the column "eval_mode". The evaluation report then additionally lists the upper bound of each node's evaluation metrics.
+- `custom_document_id_field`: Custom field name within `Document`'s `meta` which identifies the document. This field is used as a criterion for matching documents to labels during evaluation.
+This is especially useful if you want to match documents on other criteria (for example, file names) than the default document IDs as these could be heavily influenced by preprocessing.
+If you don't set any value, the default `Document`'s `id` is used as a criterion for matching documents to labels.
+- `context_matching_min_length`: The minimum string length context and candidate need to have in order to be scored.
+Returns 0.0 otherwise.
+- `context_matching_boost_split_overlaps`: Whether to boost split overlaps (for example, [AB] <-> [BC]) that result from different preprocessing params.
+If we detect that the score is near a half match and the matching part of the candidate is at its boundaries,
+we cut the context on the same side, recalculate the score, and take the mean of both.
+Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
+- `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]. +- `use_auth_token`: The API token used to download private models from Huggingface. +If this parameter is set to `True`, then the token generated when running +`transformers-cli login` (stored in ~/.huggingface) will be used. +Additional information can be found here +https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + #### Pipeline.get\_nodes\_by\_class @@ -1246,6 +1313,46 @@ we cut the context on the same side, recalculate the score and take the mean of Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. - `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. Range: [0,100] + + +#### BaseStandardPipeline.eval\_batch + +```python +def eval_batch(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult +``` + +Evaluates the pipeline by running the pipeline once per query in the debug mode + +and putting together all data that is needed for evaluation, for example, calculating metrics. + +To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_or_path`. + + You can control the scope within which an Answer or a Document is considered correct afterwards (see `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`). + For some of these scopes, you need to add the following information during `eval()`: + - `custom_document_id_field` parameter to select a custom document ID from document's metadata for ID matching (only affects 'document_id' scopes). + - `context_matching_...` parameter to fine-tune the fuzzy matching mechanism that determines whether text contexts match each other (only affects 'context' scopes, default values should work most of the time). + + :param labels: The labels to evaluate on. + :param params: Parameters for the `retriever` and `reader`. For instance, + params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}. + :param sas_model_name_or_path: Sentence transformers semantic textual similarity model you want to use for the SAS value calculation. + It should be a path or a string pointing to downloadable models. + :param sas_batch_size: Number of prediction label pairs to encode at once by cross encoder or sentence transformer while calculating SAS. + :param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity. + Falls back to CPU if no GPU is available. + :param add_isolated_node_eval: Whether to additionally evaluate the reader based on labels as input, instead of the output of the previous node in the pipeline. + :param custom_document_id_field: Custom field name within `Document`'s `meta` which identifies the document and is used as a criterion for matching documents to labels during evaluation. + This is especially useful if you want to match documents on other criteria (for example, file names) than the default document IDs, as these could be heavily influenced by preprocessing. 
+ If not set, the default `Document`'s `id` is used as the criterion for matching documents to labels. + :param context_matching_min_length: The minimum string length context and candidate need to have to be scored. + Returns 0.0 otherwise. + :param context_matching_boost_split_overlaps: Whether to boost split overlaps (for example, [AB] <-> [BC]) that result from different preprocessing parameters. + If we detect that the score is near a half match and the matching part of the candidate is at its boundaries, + we cut the context on the same side, recalculate the score, and take the mean of both. + Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. + :param context_matching_threshold: Score threshold that candidates must surpass to be included into the result list. Range: [0,100] + + #### BaseStandardPipeline.print\_eval\_report diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 4f2ba8ca1..85334930b 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -466,7 +466,7 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### TfidfRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: Union[str, List[str]], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents diff --git a/haystack/document_stores/utils.py b/haystack/document_stores/utils.py index ddb16d3cd..ff14a5c74 100644 --- a/haystack/document_stores/utils.py +++ b/haystack/document_stores/utils.py @@ -146,7 +146,7 @@ def _extract_docs_and_labels_from_dict( ## Create Document cur_full_doc = Document(content=paragraph["context"], meta=cur_meta) if preprocessor is not None: - splits_docs = preprocessor.process(cur_full_doc) + splits_docs = preprocessor.process(documents=[cur_full_doc]) # we need to pull in _split_id into the document id for unique reference in labels splits: List[Document] = [] offset = 0 diff --git a/haystack/nodes/reader/base.py b/haystack/nodes/reader/base.py index ebf6b7d7f..b7c524c6e 100644 --- a/haystack/nodes/reader/base.py +++ b/haystack/nodes/reader/base.py @@ -109,8 +109,10 @@ class BaseReader(BaseComponent): documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None, + labels: Optional[List[MultiLabel]] = None, + add_isolated_node_eval: bool = False, ): - self.query_count += len(queries) if isinstance(queries, list) else 1 + self.query_count += len(queries) if not documents: return {"answers": []}, "output_1" @@ -121,17 +123,43 @@ class BaseReader(BaseComponent): # Add corresponding document_name and more meta data, if an answer contains the document_id answer_iterator = itertools.chain.from_iterable(results["answers"]) if isinstance(documents[0], Document): - if isinstance(queries, list): - answer_iterator = itertools.chain.from_iterable(itertools.chain.from_iterable(results["answers"])) + answer_iterator = 
itertools.chain.from_iterable(itertools.chain.from_iterable(results["answers"])) flattened_documents = [] for doc_list in documents: if isinstance(doc_list, list): flattened_documents.extend(doc_list) else: flattened_documents.append(doc_list) + for answer in answer_iterator: BaseReader.add_doc_meta_data_to_answer(documents=flattened_documents, answer=answer) + # run evaluation with labels as node inputs + if add_isolated_node_eval and labels is not None: + relevant_documents = [] + for labelx in labels: + relevant_documents.append([label.document for label in labelx.labels]) + results_label_input = predict_batch(queries=queries, documents=relevant_documents, top_k=top_k) + + # Add corresponding document_name and more meta data, if an answer contains the document_id + answer_iterator = itertools.chain.from_iterable(results_label_input["answers"]) + if isinstance(documents[0], Document): + if isinstance(queries, list): + answer_iterator = itertools.chain.from_iterable( + itertools.chain.from_iterable(results_label_input["answers"]) + ) + flattened_documents = [] + for doc_list in documents: + if isinstance(doc_list, list): + flattened_documents.extend(doc_list) + else: + flattened_documents.append(doc_list) + + for answer in answer_iterator: + BaseReader.add_doc_meta_data_to_answer(documents=flattened_documents, answer=answer) + + results["answers_isolated"] = results_label_input["answers"] + return results, "output_1" def timing(self, fn, attr_name): diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index e07d4c5a8..3f87e6879 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -532,7 +532,7 @@ class TfidfRetriever(BaseRetriever): def retrieve_batch( self, - queries: List[str], + queries: Union[str, List[str]], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index 661e5da86..a61945f1a 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -456,7 +456,7 @@ class Pipeline: :param labels: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline. :param documents: A list of Document objects to be processed by the Pipeline Nodes. :param meta: Files' metadata. Used in indexing pipelines in combination with `file_paths`. - :param params: Dictionary of parameters to be dispatched to the nodes. + :param params: A dictionary of parameters that you want to pass to the nodes. To pass a parameter to all Nodes, use: `{"top_k": 10}`. To pass a parameter to targeted Nodes, run: `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}` @@ -587,7 +587,7 @@ class Pipeline: :param labels: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline. :param documents: A list of Document objects or a list of lists of Document objects to be processed by the Pipeline Nodes. :param meta: Files' metadata. Used in indexing pipelines in combination with `file_paths`. - :param params: Dictionary of parameters to be dispatched to the nodes. + :param params: A dictionary of parameters that you want to pass to the nodes. To pass a parameter to all Nodes, use: `{"top_k": 10}`. 
To pass a parameter to targeted Nodes, run: `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}` @@ -804,6 +804,7 @@ class Pipeline: sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, + use_batch_mode: bool = False, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, @@ -875,11 +876,11 @@ class Pipeline: :param query_params: The params to use during querying (see pipeline.run's params). :param sas_model_name_or_path: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. - Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. + Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture with being close to 1. More info in the paper: https://arxiv.org/abs/2108.06130 - Models: + Here are some guidelines regarding the models that you can use: - You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. - Not all cross encoders can be used because of different return types. + The return type of the encoder needs to be a single prediction score (as opposed to multiple scores). If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" @@ -887,8 +888,9 @@ class Pipeline: :param sas_batch_size: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. :param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity. Falls back to CPU if no GPU is available. + :param use_batch_mode: Whether to use batches for pipeline executions or single queries (default). :param add_isolated_node_eval: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. - This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. + The isolated mode shows you how each node is performing on its own and helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance. If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance. The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node. 
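To make the integrated/isolated distinction concrete, here is a short sketch of how the returned `EvaluationResult` is typically inspected. `calculate_metrics`, `node_results`, and the `eval_mode` column come from the surrounding code and docstrings; the node and metric names ("Reader", "f1", "exact_match") are illustrative and depend on your pipeline.

```python
# `eval_result` is assumed to be the EvaluationResult returned by eval(), eval_batch(),
# or execute_eval_run().
metrics = eval_result.calculate_metrics(
    document_scope="document_id_or_answer",  # see EvaluationResult.calculate_metrics() for the accepted scopes
    answer_scope="any",
)
# Metric names depend on the nodes in your pipeline; a Reader typically reports "f1" and "exact_match".
print(metrics["Reader"]["f1"], metrics["Reader"]["exact_match"])

# Each node's dataframe marks rows as "integrated" or "isolated" in the "eval_mode" column,
# so the isolated upper-bound results can be separated from the integrated ones.
reader_df = eval_result.node_results["Reader"]
isolated_rows = reader_df[reader_df["eval_mode"] == "isolated"]
```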
@@ -990,18 +992,32 @@ class Pipeline: tracker.track_params({"pipeline_index_document_count": document_count}) - eval_result = query_pipeline.eval( - labels=evaluation_set_labels, - params=query_params, - sas_model_name_or_path=sas_model_name_or_path, - sas_batch_size=sas_batch_size, - sas_use_gpu=sas_use_gpu, - add_isolated_node_eval=add_isolated_node_eval, - custom_document_id_field=custom_document_id_field, - context_matching_boost_split_overlaps=context_matching_boost_split_overlaps, - context_matching_min_length=context_matching_min_length, - context_matching_threshold=context_matching_threshold, - ) + if use_batch_mode: + eval_result = query_pipeline.eval_batch( + labels=evaluation_set_labels, + params=query_params, + sas_model_name_or_path=sas_model_name_or_path, + sas_batch_size=sas_batch_size, + sas_use_gpu=sas_use_gpu, + add_isolated_node_eval=add_isolated_node_eval, + custom_document_id_field=custom_document_id_field, + context_matching_boost_split_overlaps=context_matching_boost_split_overlaps, + context_matching_min_length=context_matching_min_length, + context_matching_threshold=context_matching_threshold, + ) + else: + eval_result = query_pipeline.eval( + labels=evaluation_set_labels, + params=query_params, + sas_model_name_or_path=sas_model_name_or_path, + sas_batch_size=sas_batch_size, + sas_use_gpu=sas_use_gpu, + add_isolated_node_eval=add_isolated_node_eval, + custom_document_id_field=custom_document_id_field, + context_matching_boost_split_overlaps=context_matching_boost_split_overlaps, + context_matching_min_length=context_matching_min_length, + context_matching_threshold=context_matching_threshold, + ) integrated_metrics = eval_result.calculate_metrics(document_scope=document_scope, answer_scope=answer_scope) integrated_top_1_metrics = eval_result.calculate_metrics( @@ -1054,7 +1070,7 @@ class Pipeline: labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, - sas_model_name_or_path: str = None, + sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, @@ -1077,17 +1093,17 @@ class Pipeline: :param labels: The labels to evaluate on :param documents: List of List of Document that the first node in the pipeline should get as input per multilabel. Can be used to evaluate a pipeline that consists of a reader without a retriever. - :param params: Dictionary of parameters to be dispatched to the nodes. + :param params: A dictionary of parameters that you want to pass to the nodes. If you want to pass a param to all nodes, you can just use: {"top_k":10} If you want to pass it to targeted nodes, you can do: {"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}} :param sas_model_name_or_path: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. - Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. + Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture with being close to 1. 
More info in the paper: https://arxiv.org/abs/2108.06130 - Models: + Here are some guidelines regarding the models that you can use: - You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. - Not all cross encoders can be used because of different return types. + The return type of the encoder needs to be a single prediction score (as opposed to multiple scores). If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" @@ -1096,7 +1112,7 @@ class Pipeline: :param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity. Falls back to CPU if no GPU is available. :param add_isolated_node_eval: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. - This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. + The isolated mode shows you how each node is performing on its own and helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance. If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance. The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node. 
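As a quick illustration of the two evaluation entry points, here is a hypothetical call to `eval()` and its batched counterpart `eval_batch()`; `pipeline` and `eval_labels` are placeholders prepared elsewhere, and all keyword arguments come from the signatures documented above.

```python
# Single-query evaluation: each query is sent through run() separately, in debug mode.
eval_result = pipeline.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    sas_model_name_or_path="cross-encoder/stsb-roberta-large",
    add_isolated_node_eval=True,
)

# Batched evaluation: eval_batch() accepts the same arguments but sends all queries
# through run_batch() in one go.
eval_result_batched = pipeline.eval_batch(
    labels=eval_labels,
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    sas_model_name_or_path="cross-encoder/stsb-roberta-large",
    add_isolated_node_eval=True,
)
```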
@@ -1121,10 +1137,7 @@ class Pipeline: """ eval_result = EvaluationResult() if add_isolated_node_eval: - if params is None: - params = {} - else: - params = params.copy() + params = {} if params is None else params.copy() params["add_isolated_node_eval"] = True # if documents is None, set docs_per_label to None for each label @@ -1144,8 +1157,8 @@ class Pipeline: for node_name in predictions["_debug"].keys(): node_output = predictions["_debug"][node_name]["output"] df = self._build_eval_dataframe( - query=label.query, - query_labels=label, + queries=[label.query], + query_labels_per_query=[label], node_name=node_name, node_output=node_output, custom_document_id_field=custom_document_id_field, @@ -1155,6 +1168,135 @@ class Pipeline: ) eval_result.append(node_name, df) + eval_result = self._add_sas_to_eval_result( + sas_model_name_or_path=sas_model_name_or_path, + sas_batch_size=sas_batch_size, + sas_use_gpu=sas_use_gpu, + context_matching_threshold=context_matching_threshold, + eval_result=eval_result, + use_auth_token=use_auth_token, + ) + # reorder columns for better qualitative evaluation + eval_result = self._reorder_columns_in_eval_result(eval_result=eval_result) + + return eval_result + + @send_event + def eval_batch( + self, + labels: List[MultiLabel], + documents: Optional[List[List[Document]]] = None, + params: Optional[dict] = None, + sas_model_name_or_path: Optional[str] = None, + sas_batch_size: int = 32, + sas_use_gpu: bool = True, + add_isolated_node_eval: bool = False, + custom_document_id_field: Optional[str] = None, + context_matching_min_length: int = 100, + context_matching_boost_split_overlaps: bool = True, + context_matching_threshold: float = 65.0, + use_auth_token: Optional[Union[str, bool]] = None, + ) -> EvaluationResult: + """ + Evaluates the pipeline by running it in batches in the debug mode + and putting together all data that are needed for evaluation, for example, calculating metrics. + + To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_or_path`. + + You can control the scope within which an answer or a document is considered correct afterwards (see `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`). + For some of these scopes, you need to add the following information during `eval()`: + - `custom_document_id_field` parameter to select a custom document ID from document's metadata for ID matching (only affects 'document_id' scopes). + - `context_matching_...` parameter to fine-tune the fuzzy matching mechanism that determines whether text contexts match each other (only affects 'context' scopes, default values should work most of the time). + + :param labels: The labels to evaluate on. + :param documents: List of List of Document that the first node in the pipeline gets as input per multilabel. You can use it to evaluate a pipeline that consists of a reader without a retriever. + :param params: Dictionary of parameters to be dispatched to the nodes. + To pass a parameter to all nodes, just use: {"top_k":10}. + To pass a parametrer to targeted nodes, you can type: + {"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}} + :param sas_model_name_or_path: Name or path of the SAS model. If you specify the path, the model is used to calculate the similarity between predictions and labels and to generate the SAS metric. + The SAS metric correlates better with the human judgment of correct answers as it does not rely on string overlaps. 
+ Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. + If you want to learn more, have a look at the [Semantic Answer Similarity for Evaluating Question Answering Models](https://arxiv.org/abs/2108.06130) paper. + Models: + - You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. + The return type of the encoder needs to be a single prediction score (as opposed to multiple scores). + When using custom cross encoders, ensure they work with the `sentence_transformers.CrossEncoder` class. + - This is a good default model for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2." + - A large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large". + - A large model for German only: "deepset/gbert-large-sts". + :param sas_batch_size: The number of prediction label pairs you want to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. + :param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity. + It uses CPU if no GPU is available. + :param add_isolated_node_eval: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. + The isolated mode shows you how each node is performing on its own and helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. + If a node performs much better in the isolated evaluation than in the integrated evaluation, it means you should optimize the preceding node to improve the pipeline's performance. + If a node's performance is similar in both modes, it means you should optimize this node itself to improve the pipeline's performance. + The isolated evaluation calculates the upper bound of each node's evaluation metrics, assuming it received perfect inputs from the previous node. + To achieve this, the isolated evaluation uses labels as input to the node instead of the output of the previous node in the pipeline. + The generated dataframes in the EvaluationResult then contain additional rows, which you can tell apart from the integrated evaluation results based on the + values "integrated" or "isolated" in the column "eval_mode". The evaluation report then additionally lists the upper bound of each node's evaluation metrics. + :param custom_document_id_field: Custom field name within `Document`'s `meta` which identifies the document. This field is used as a criterion for matching documents to labels during evaluation. + This is especially useful if you want to match documents on other criteria (for example, file names) than the default document IDs as these could be heavily influenced by preprocessing. + If you don't set any value, the default `Document`'s `id` is used as a criterion for matching documents to labels. + :param context_matching_min_length: The minimum string length context and candidate need to have in order to be scored. + Returns 0.0 otherwise. + :param context_matching_boost_split_overlaps: Whether to boost split overlaps (for example, [AB] <-> [BC]) that result from different preprocessing params. + If we detect that the score is near a half match and the matching part of the candidate is at its boundaries, + we cut the context on the same side, recalculate the score and, take the mean of both. 
+ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. + :param context_matching_threshold: Score threshold that candidates must surpass to be included into the result list. Range: [0,100]. + :param use_auth_token: The API token used to download private models from Huggingface. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. + Additional information can be found here + https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + """ + eval_result = EvaluationResult() + if add_isolated_node_eval: + params = {} if params is None else params.copy() + params["add_isolated_node_eval"] = True + + predictions_batches = self.run_batch( + queries=[label.query for label in labels], labels=labels, documents=documents, params=params, debug=True + ) + + for node_name in predictions_batches["_debug"].keys(): + node_output = predictions_batches["_debug"][node_name]["output"] + df = self._build_eval_dataframe( + queries=predictions_batches["queries"], + query_labels_per_query=predictions_batches["labels"], + node_name=node_name, + node_output=node_output, + custom_document_id_field=custom_document_id_field, + context_matching_threshold=context_matching_threshold, + context_matching_boost_split_overlaps=context_matching_boost_split_overlaps, + context_matching_min_length=context_matching_min_length, + ) + eval_result.append(node_name, df) + + eval_result = self._add_sas_to_eval_result( + sas_model_name_or_path=sas_model_name_or_path, + sas_batch_size=sas_batch_size, + sas_use_gpu=sas_use_gpu, + context_matching_threshold=context_matching_threshold, + eval_result=eval_result, + use_auth_token=use_auth_token, + ) + # reorder columns for better qualitative evaluation + eval_result = self._reorder_columns_in_eval_result(eval_result=eval_result) + + return eval_result + + def _add_sas_to_eval_result( + self, + sas_model_name_or_path: Optional[str], + sas_batch_size: int, + sas_use_gpu: bool, + context_matching_threshold: float, + eval_result: EvaluationResult, + use_auth_token: Optional[Union[str, bool]] = None, + ) -> EvaluationResult: # add sas values in batch mode for whole Dataframe # this is way faster than if we calculate it for each query separately if sas_model_name_or_path is not None: @@ -1205,54 +1347,56 @@ class Pipeline: ) ) - # reorder columns for better qualitative evaluation + return eval_result + + def _reorder_columns_in_eval_result(self, eval_result: EvaluationResult) -> EvaluationResult: + desired_col_order = [ + "multilabel_id", # generic + "query", # generic + "filters", # generic + "gold_answers", # answer-specific + "answer", # answer-specific + "context", # generic + "exact_match", # answer-specific + "f1", # answer-specific + "sas", # answer-specific + "exact_match_context_scope", # answer-specific + "f1_context_scope", # answer-specific + "sas_context_scope", # answer-specific + "exact_match_document_id_scope", # answer-specific + "f1_document_id_scope", # answer-specific + "sas_document_id_scope", # answer-specific + "exact_match_document_id_and_context_scope", # answer-specific + "f1_document_id_and_context_scope", # answer-specific + "sas_document_id_and_context_scope", # answer-specific + "gold_contexts", # generic + "gold_id_match", # doc-specific + "context_match", # doc-specific + "answer_match", # doc-specific + "gold_id_or_answer_match", # doc-specific + "gold_id_and_answer_match", # 
doc-specific + "gold_id_or_context_match", # doc-specific + "gold_id_and_context_match", # doc-specific + "gold_id_and_context_and_answer_match", # doc-specific + "context_and_answer_match", # doc-specific + "rank", # generic + "document_id", # generic + "gold_document_ids", # generic + "custom_document_id", # generic + "gold_custom_document_ids", # generic + "offsets_in_document", # answer-specific + "gold_offsets_in_documents", # answer-specific + "gold_answers_exact_match", # answer-specific + "gold_answers_f1", # answer-specific + "gold_answers_sas", # answer-specific + "gold_documents_id_match", # generic + "gold_contexts_similarity", # generic + "gold_answers_match", # doc-specific + "type", # generic + "node", # generic + "eval_mode", # generic + ] for key, df in eval_result.node_results.items(): - desired_col_order = [ - "multilabel_id", # generic - "query", # generic - "filters", # generic - "gold_answers", # answer-specific - "answer", # answer-specific - "context", # generic - "exact_match", # answer-specific - "f1", # answer-specific - "sas", # answer-specific - "exact_match_context_scope", # answer-specific - "f1_context_scope", # answer-specific - "sas_context_scope", # answer-specific - "exact_match_document_id_scope", # answer-specific - "f1_document_id_scope", # answer-specific - "sas_document_id_scope", # answer-specific - "exact_match_document_id_and_context_scope", # answer-specific - "f1_document_id_and_context_scope", # answer-specific - "sas_document_id_and_context_scope", # answer-specific - "gold_contexts", # generic - "gold_id_match", # doc-specific - "context_match", # doc-specific - "answer_match", # doc-specific - "gold_id_or_answer_match", # doc-specific - "gold_id_and_answer_match", # doc-specific - "gold_id_or_context_match", # doc-specific - "gold_id_and_context_match", # doc-specific - "gold_id_and_context_and_answer_match", # doc-specific - "context_and_answer_match", # doc-specific - "rank", # generic - "document_id", # generic - "gold_document_ids", # generic - "custom_document_id", # generic - "gold_custom_document_ids", # generic - "offsets_in_document", # answer-specific - "gold_offsets_in_documents", # answer-specific - "gold_answers_exact_match", # answer-specific - "gold_answers_f1", # answer-specific - "gold_answers_sas", # answer-specific - "gold_documents_id_match", # generic - "gold_contexts_similarity", # generic - "gold_answers_match", # doc-specific - "type", # generic - "node", # generic - "eval_mode", # generic - ] eval_result.node_results[key] = self._reorder_columns(df, desired_col_order) return eval_result @@ -1266,8 +1410,8 @@ class Pipeline: def _build_eval_dataframe( self, - query: str, - query_labels: MultiLabel, + queries: List[str], + query_labels_per_query: List[MultiLabel], node_name: str, node_output: dict, custom_document_id_field: Optional[str] = None, @@ -1284,269 +1428,286 @@ class Pipeline: Additional answer or document specific evaluation infos like gold labels and metrics depicting whether the row matches the gold labels are included, too. """ - - if query_labels is None or query_labels.labels is None: - logger.warning(f"There is no label for query '{query}'. Query will be omitted.") - return pd.DataFrame() - - # remarks for no_answers: - # Single 'no_answer'-labels are not contained in MultiLabel aggregates. 
- # If all labels are no_answers, MultiLabel.answers will be [""] and the other aggregates [] - gold_answers = query_labels.answers - gold_offsets_in_documents = query_labels.offsets_in_documents - gold_document_ids = query_labels.document_ids - gold_custom_document_ids = ( - [l.document.meta[custom_document_id_field] for l in query_labels.labels if not l.no_answer] - if custom_document_id_field is not None - else [] - ) - gold_contexts = query_labels.contexts - - # if node returned answers, include answer specific info: - # - the answer returned itself - # - the document_id the answer was found in - # - the position or offsets within the document the answer was found - # - the surrounding context of the answer within the document - # - the gold answers - # - the position or offsets of the gold answer within the document - # - the gold document ids containing the answer - # - the exact_match metric depicting if the answer exactly matches the gold label - # - the f1 metric depicting how well the answer overlaps with the gold label on token basis - # - the sas metric depicting how well the answer matches the gold label on a semantic basis. - # this will be calculated on all queries in eval() for performance reasons if a sas model has been provided + # Disable all the cell-var-from-loop violations in this function + # pylint: disable=cell-var-from-loop partial_dfs = [] - for field_name in ["answers", "answers_isolated"]: - df_answers = pd.DataFrame() - answers = node_output.get(field_name, None) - if answers is not None: - if len(answers) == 0: - # add no_answer if there was no answer retrieved, so query does not get lost in dataframe - answers = [Answer(answer="", offsets_in_document=[Span(start=0, end=0)])] - answer_cols_to_keep = ["answer", "document_id", "offsets_in_document", "context"] - df_answers = pd.DataFrame(answers, columns=answer_cols_to_keep) - df_answers.map_rows = partial(df_answers.apply, axis=1) - df_answers["rank"] = np.arange(1, len(df_answers) + 1) - df_answers["gold_answers"] = [gold_answers] * len(df_answers) - df_answers["gold_offsets_in_documents"] = [gold_offsets_in_documents] * len(df_answers) - df_answers["gold_document_ids"] = [gold_document_ids] * len(df_answers) - df_answers["gold_contexts"] = [gold_contexts] * len(df_answers) - df_answers["gold_answers_exact_match"] = df_answers.map_rows( - lambda row: [calculate_em_str(gold_answer, row["answer"]) for gold_answer in gold_answers] - ) - df_answers["gold_answers_f1"] = df_answers.map_rows( - lambda row: [calculate_f1_str(gold_answer, row["answer"]) for gold_answer in gold_answers] - ) - df_answers["gold_contexts_similarity"] = df_answers.map_rows( - lambda row: [ - calculate_context_similarity( - str(gold_context), # could be dataframe - str(row["context"]) if row["context"] is not None else "", # could be dataframe - min_length=context_matching_min_length, - boost_split_overlaps=context_matching_boost_split_overlaps, - ) - for gold_context in gold_contexts - ] - ) - df_answers["gold_documents_id_match"] = df_answers.map_rows( - lambda row: [1.0 if row["document_id"] == gold_id else 0.0 for gold_id in gold_document_ids] - ) + for i, (query, query_labels) in enumerate(zip(queries, query_labels_per_query)): - if custom_document_id_field is not None: - df_answers["gold_custom_document_ids"] = [gold_custom_document_ids] * len(df_answers) - df_answers["custom_document_id"] = [ - answer.meta.get(custom_document_id_field, "") for answer in answers - ] + if query_labels is None or query_labels.labels is None: + 
logger.warning(f"There is no label for query '{query}'. Query will be omitted.") + continue + + # remarks for no_answers: + # Single 'no_answer'-labels are not contained in MultiLabel aggregates. + # If all labels are no_answers, MultiLabel.answers will be [""] and the other aggregates [] + gold_answers = query_labels.answers + gold_offsets_in_documents = query_labels.offsets_in_documents + gold_document_ids = query_labels.document_ids + gold_custom_document_ids = ( + [l.document.meta[custom_document_id_field] for l in query_labels.labels if not l.no_answer] + if custom_document_id_field is not None + else [] + ) + gold_contexts = query_labels.contexts + + # if node returned answers, include answer specific info: + # - the answer returned itself + # - the document_id the answer was found in + # - the position or offsets within the document the answer was found + # - the surrounding context of the answer within the document + # - the gold answers + # - the position or offsets of the gold answer within the document + # - the gold document ids containing the answer + # - the exact_match metric depicting if the answer exactly matches the gold label + # - the f1 metric depicting how well the answer overlaps with the gold label on token basis + # - the sas metric depicting how well the answer matches the gold label on a semantic basis. + # this will be calculated on all queries in eval() for performance reasons if a sas model has been provided + + for field_name in ["answers", "answers_isolated"]: + df_answers = pd.DataFrame() + answers = node_output.get(field_name, None) + if answers is not None: + if isinstance(answers[i], list): + # answers_isolated refers to only one relevant document and thus only a list of answers + # answers refers to multiple relevant documents and thus multiple lists of lists of answers + answers = answers[i] + if len(answers) == 0: + # add no_answer if there was no answer retrieved, so query does not get lost in dataframe + answers = [Answer(answer="", offsets_in_document=[Span(start=0, end=0)])] + answer_cols_to_keep = ["answer", "document_id", "offsets_in_document", "context"] + df_answers = pd.DataFrame(answers, columns=answer_cols_to_keep) + df_answers.map_rows = partial(df_answers.apply, axis=1) + df_answers["rank"] = np.arange(1, len(df_answers) + 1) + df_answers["gold_answers"] = [gold_answers] * len(df_answers) + df_answers["gold_offsets_in_documents"] = [gold_offsets_in_documents] * len(df_answers) + df_answers["gold_document_ids"] = [gold_document_ids] * len(df_answers) + df_answers["gold_contexts"] = [gold_contexts] * len(df_answers) + df_answers["gold_answers_exact_match"] = df_answers.map_rows( + lambda row: [calculate_em_str(gold_answer, row["answer"]) for gold_answer in gold_answers] + ) + df_answers["gold_answers_f1"] = df_answers.map_rows( + lambda row: [calculate_f1_str(gold_answer, row["answer"]) for gold_answer in gold_answers] + ) + df_answers["gold_contexts_similarity"] = df_answers.map_rows( + lambda row: [ + calculate_context_similarity( + str(gold_context), # could be dataframe + str(row["context"]) if row["context"] is not None else "", # could be dataframe + min_length=context_matching_min_length, + boost_split_overlaps=context_matching_boost_split_overlaps, + ) + for gold_context in gold_contexts + ] + ) df_answers["gold_documents_id_match"] = df_answers.map_rows( + lambda row: [1.0 if row["document_id"] == gold_id else 0.0 for gold_id in gold_document_ids] + ) + + if custom_document_id_field is not None: + 
df_answers["gold_custom_document_ids"] = [gold_custom_document_ids] * len(df_answers) + df_answers["custom_document_id"] = [ + answer.meta.get(custom_document_id_field, "") for answer in answers + ] + df_answers["gold_documents_id_match"] = df_answers.map_rows( + lambda row: [ + 1.0 if row["custom_document_id"] == gold_custom_id else 0.0 + for gold_custom_id in gold_custom_document_ids + ] + ) + + # answer_scope: any + df_answers["exact_match"] = df_answers.map_rows( + lambda row: max(row["gold_answers_exact_match"] + [0.0]) + ) + df_answers["f1"] = df_answers.map_rows(lambda row: max(row["gold_answers_f1"] + [0.0])) + + # answer_scope: context + df_answers["exact_match_context_scope"] = df_answers.map_rows( + lambda row: max( + em + for em, sim in zip( + row["gold_answers_exact_match"] + [0.0], row["gold_contexts_similarity"] + [100] + ) + if sim > context_matching_threshold + ) + ) + df_answers["f1_context_scope"] = df_answers.map_rows( + lambda row: max( + f1 + for f1, sim in zip(row["gold_answers_f1"] + [0.0], row["gold_contexts_similarity"] + [100]) + if sim > context_matching_threshold + ) + ) + + # answer_scope: document_id + df_answers["exact_match_document_id_scope"] = df_answers.map_rows( + lambda row: max( + em + for em, doc_match in zip( + row["gold_answers_exact_match"] + [0.0], row["gold_documents_id_match"] + [1.0] + ) + if doc_match == 1.0 + ) + ) + df_answers["f1_document_id_scope"] = df_answers.map_rows( + lambda row: max( + f1 + for f1, doc_match in zip( + row["gold_answers_f1"] + [0.0], row["gold_documents_id_match"] + [1.0] + ) + if doc_match == 1.0 + ) + ) + + # answer_scope: document_id_and_context + df_answers["exact_match_document_id_and_context_scope"] = df_answers.map_rows( + lambda row: max( + f1 + for f1, sim, doc_match in zip( + row["gold_answers_exact_match"] + [0.0], + row["gold_contexts_similarity"] + [100], + row["gold_documents_id_match"] + [1.0], + ) + if sim > context_matching_threshold and doc_match == 1.0 + ) + ) + df_answers["f1_document_id_and_context_scope"] = df_answers.map_rows( + lambda row: max( + f1 + for f1, sim, doc_match in zip( + row["gold_answers_f1"] + [0.0], + row["gold_contexts_similarity"] + [100], + row["gold_documents_id_match"] + [1.0], + ) + if sim > context_matching_threshold and doc_match == 1.0 + ) + ) + + # add general info + df_answers["type"] = "answer" + df_answers["node"] = node_name + df_answers["multilabel_id"] = query_labels.id + df_answers["query"] = query + df_answers["filters"] = json.dumps(query_labels.filters, sort_keys=True).encode() + df_answers["eval_mode"] = "isolated" if "isolated" in field_name else "integrated" + partial_dfs.append(df_answers) + + # if node returned documents, include document specific info: + # - the document_id + # - the content of the document + # - the gold document ids + # - the gold document contents + # - the gold_id_match metric depicting whether one of the gold document ids matches the document + # - the answer_match metric depicting whether the document contains the answer + # - the gold_id_or_answer_match metric depicting whether one of the former two conditions are met + for field_name in ["documents", "documents_isolated"]: + df_docs = pd.DataFrame() + documents = node_output.get(field_name, None) + if documents is not None: + if isinstance(documents[i], list): + documents = documents[i] + if len(documents) == 0: + # add dummy document if there was no document retrieved, so query does not get lost in dataframe + documents = [Document(content="", id="")] + document_cols_to_keep = 
["content", "id"] + df_docs = pd.DataFrame(documents, columns=document_cols_to_keep) + df_docs.map_rows = partial(df_docs.apply, axis=1) + df_docs.rename(columns={"id": "document_id", "content": "context"}, inplace=True) + df_docs["gold_document_ids"] = [gold_document_ids] * len(df_docs) + df_docs["gold_contexts"] = [gold_contexts] * len(df_docs) + df_docs["gold_contexts_similarity"] = df_docs.map_rows( lambda row: [ - 1.0 if row["custom_document_id"] == gold_custom_id else 0.0 - for gold_custom_id in gold_custom_document_ids + calculate_context_similarity( + str(gold_context) if isinstance(gold_context, pd.DataFrame) else gold_context, + str(row["context"]) + if isinstance(row["context"], pd.DataFrame) + else row["context"] or "", + min_length=context_matching_min_length, + boost_split_overlaps=context_matching_boost_split_overlaps, + ) + for gold_context in gold_contexts ] ) - - # answer_scope: any - df_answers["exact_match"] = df_answers.map_rows( - lambda row: max(row["gold_answers_exact_match"] + [0.0]) - ) - df_answers["f1"] = df_answers.map_rows(lambda row: max(row["gold_answers_f1"] + [0.0])) - - # answer_scope: context - df_answers["exact_match_context_scope"] = df_answers.map_rows( - lambda row: max( - em - for em, sim in zip( - row["gold_answers_exact_match"] + [0.0], row["gold_contexts_similarity"] + [100] - ) - if sim > context_matching_threshold - ) - ) - df_answers["f1_context_scope"] = df_answers.map_rows( - lambda row: max( - f1 - for f1, sim in zip(row["gold_answers_f1"] + [0.0], row["gold_contexts_similarity"] + [100]) - if sim > context_matching_threshold - ) - ) - - # answer_scope: document_id - df_answers["exact_match_document_id_scope"] = df_answers.map_rows( - lambda row: max( - em - for em, doc_match in zip( - row["gold_answers_exact_match"] + [0.0], row["gold_documents_id_match"] + [1.0] - ) - if doc_match == 1.0 - ) - ) - df_answers["f1_document_id_scope"] = df_answers.map_rows( - lambda row: max( - f1 - for f1, doc_match in zip(row["gold_answers_f1"] + [0.0], row["gold_documents_id_match"] + [1.0]) - if doc_match == 1.0 - ) - ) - - # answer_scope: document_id_and_context - df_answers["exact_match_document_id_and_context_scope"] = df_answers.map_rows( - lambda row: max( - f1 - for f1, sim, doc_match in zip( - row["gold_answers_exact_match"] + [0.0], - row["gold_contexts_similarity"] + [100], - row["gold_documents_id_match"] + [1.0], - ) - if sim > context_matching_threshold and doc_match == 1.0 - ) - ) - df_answers["f1_document_id_and_context_scope"] = df_answers.map_rows( - lambda row: max( - f1 - for f1, sim, doc_match in zip( - row["gold_answers_f1"] + [0.0], - row["gold_contexts_similarity"] + [100], - row["gold_documents_id_match"] + [1.0], - ) - if sim > context_matching_threshold and doc_match == 1.0 - ) - ) - - # add general info - df_answers["type"] = "answer" - df_answers["node"] = node_name - df_answers["multilabel_id"] = query_labels.id - df_answers["query"] = query - df_answers["filters"] = json.dumps(query_labels.filters, sort_keys=True).encode() - df_answers["eval_mode"] = "isolated" if "isolated" in field_name else "integrated" - partial_dfs.append(df_answers) - - # if node returned documents, include document specific info: - # - the document_id - # - the content of the document - # - the gold document ids - # - the gold document contents - # - the gold_id_match metric depicting whether one of the gold document ids matches the document - # - the answer_match metric depicting whether the document contains the answer - # - the 
gold_id_or_answer_match metric depicting whether one of the former two conditions are met - for field_name in ["documents", "documents_isolated"]: - df_docs = pd.DataFrame() - documents = node_output.get(field_name, None) - if documents is not None: - if len(documents) == 0: - # add dummy document if there was no document retrieved, so query does not get lost in dataframe - documents = [Document(content="", id="")] - document_cols_to_keep = ["content", "id"] - df_docs = pd.DataFrame(documents, columns=document_cols_to_keep) - df_docs.map_rows = partial(df_docs.apply, axis=1) - df_docs.rename(columns={"id": "document_id", "content": "context"}, inplace=True) - df_docs["gold_document_ids"] = [gold_document_ids] * len(df_docs) - df_docs["gold_contexts"] = [gold_contexts] * len(df_docs) - df_docs["gold_contexts_similarity"] = df_docs.map_rows( - lambda row: [ - calculate_context_similarity( - str(gold_context) if isinstance(gold_context, pd.DataFrame) else gold_context, - str(row["context"]) if isinstance(row["context"], pd.DataFrame) else row["context"] or "", - min_length=context_matching_min_length, - boost_split_overlaps=context_matching_boost_split_overlaps, - ) - for gold_context in gold_contexts - ] - ) - df_docs["gold_documents_id_match"] = df_docs.map_rows( - lambda row: [1.0 if row["document_id"] == gold_id else 0.0 for gold_id in gold_document_ids] - ) - - if custom_document_id_field is not None: - df_docs["gold_custom_document_ids"] = [gold_custom_document_ids] * len(df_docs) - df_docs["custom_document_id"] = [ - document.meta.get(custom_document_id_field, "") for document in documents - ] df_docs["gold_documents_id_match"] = df_docs.map_rows( + lambda row: [1.0 if row["document_id"] == gold_id else 0.0 for gold_id in gold_document_ids] + ) + + if custom_document_id_field is not None: + df_docs["gold_custom_document_ids"] = [gold_custom_document_ids] * len(df_docs) + df_docs["custom_document_id"] = [ + document.meta.get(custom_document_id_field, "") for document in documents + ] + df_docs["gold_documents_id_match"] = df_docs.map_rows( + lambda row: [ + 1.0 if row["custom_document_id"] == gold_custom_id else 0.0 + for gold_custom_id in gold_custom_document_ids + ] + ) + + df_docs["gold_answers_match"] = df_docs.map_rows( lambda row: [ - 1.0 if row["custom_document_id"] == gold_custom_id else 0.0 - for gold_custom_id in gold_custom_document_ids + 1.0 if gold_answer != "" and gold_answer in row["context"] else 0.0 + for gold_answer in gold_answers ] ) - df_docs["gold_answers_match"] = df_docs.map_rows( - lambda row: [ - 1.0 if gold_answer != "" and gold_answer in row["context"] else 0.0 - for gold_answer in gold_answers - ] - ) + # document_relevance_criterion: "document_id" + df_docs["gold_id_match"] = df_docs.map_rows(lambda row: max(row["gold_documents_id_match"] + [0.0])) - # document_relevance_criterion: "document_id" - df_docs["gold_id_match"] = df_docs.map_rows(lambda row: max(row["gold_documents_id_match"] + [0.0])) + # document_relevance_criterion: "answer", + df_docs["answer_match"] = df_docs.map_rows(lambda row: max(row["gold_answers_match"] + [0.0])) - # document_relevance_criterion: "answer", - df_docs["answer_match"] = df_docs.map_rows(lambda row: max(row["gold_answers_match"] + [0.0])) + # document_relevance_criterion: "document_id_or_answer", + df_docs["gold_id_or_answer_match"] = df_docs.map_rows( + lambda row: max(row["gold_id_match"], row["answer_match"]) + ) - # document_relevance_criterion: "document_id_or_answer", - df_docs["gold_id_or_answer_match"] = 
df_docs.map_rows( - lambda row: max(row["gold_id_match"], row["answer_match"]) - ) + # document_relevance_criterion: "document_id_and_answer", + df_docs["gold_id_and_answer_match"] = df_docs.map_rows( + lambda row: min(row["gold_id_match"], row["answer_match"]) + ) - # document_relevance_criterion: "document_id_and_answer", - df_docs["gold_id_and_answer_match"] = df_docs.map_rows( - lambda row: min(row["gold_id_match"], row["answer_match"]) - ) + # document_relevance_criterion: "context", + df_docs["context_match"] = df_docs.map_rows( + lambda row: 1.0 + if any(sim for sim in row["gold_contexts_similarity"] if sim > context_matching_threshold) + else 0.0 + ) - # document_relevance_criterion: "context", - df_docs["context_match"] = df_docs.map_rows( - lambda row: 1.0 - if any(sim for sim in row["gold_contexts_similarity"] if sim > context_matching_threshold) - else 0.0 - ) + # document_relevance_criterion: "document_id_or_context", + df_docs["gold_id_or_context_match"] = df_docs.map_rows( + lambda row: max(row["gold_id_match"], row["context_match"]) + ) - # document_relevance_criterion: "document_id_or_context", - df_docs["gold_id_or_context_match"] = df_docs.map_rows( - lambda row: max(row["gold_id_match"], row["context_match"]) - ) + # document_relevance_criterion: "document_id_and_context", + df_docs["gold_id_and_context_match"] = df_docs.map_rows( + lambda row: min(row["gold_id_match"], row["context_match"]) + ) - # document_relevance_criterion: "document_id_and_context", - df_docs["gold_id_and_context_match"] = df_docs.map_rows( - lambda row: min(row["gold_id_match"], row["context_match"]) - ) + # document_relevance_criterion: "document_id_and_context_and_answer", + df_docs["gold_id_and_context_and_answer_match"] = df_docs.map_rows( + lambda row: min(row["gold_id_match"], row["context_match"], row["answer_match"]) + ) - # document_relevance_criterion: "document_id_and_context_and_answer", - df_docs["gold_id_and_context_and_answer_match"] = df_docs.map_rows( - lambda row: min(row["gold_id_match"], row["context_match"], row["answer_match"]) - ) + # document_relevance_criterion: "context_and_answer", + df_docs["context_and_answer_match"] = df_docs.map_rows( + lambda row: min(row["context_match"], row["answer_match"]) + ) - # document_relevance_criterion: "context_and_answer", - df_docs["context_and_answer_match"] = df_docs.map_rows( - lambda row: min(row["context_match"], row["answer_match"]) - ) + df_docs["rank"] = np.arange(1, len(df_docs) + 1) - df_docs["rank"] = np.arange(1, len(df_docs) + 1) + # add general info + df_docs["type"] = "document" + df_docs["node"] = node_name + df_docs["multilabel_id"] = query_labels.id + df_docs["query"] = query + df_docs["filters"] = json.dumps(query_labels.filters, sort_keys=True).encode() + df_docs["eval_mode"] = "isolated" if "isolated" in field_name else "integrated" + partial_dfs.append(df_docs) - # add general info - df_docs["type"] = "document" - df_docs["node"] = node_name - df_docs["multilabel_id"] = query_labels.id - df_docs["query"] = query - df_docs["filters"] = json.dumps(query_labels.filters, sort_keys=True).encode() - df_docs["eval_mode"] = "isolated" if "isolated" in field_name else "integrated" - partial_dfs.append(df_docs) + if len(partial_dfs) == 0: + return pd.DataFrame() return pd.concat(partial_dfs, ignore_index=True).reset_index() diff --git a/haystack/pipelines/standard_pipelines.py b/haystack/pipelines/standard_pipelines.py index 4c177a7dd..e96181e69 100644 --- a/haystack/pipelines/standard_pipelines.py +++ 
b/haystack/pipelines/standard_pipelines.py @@ -217,6 +217,65 @@ class BaseStandardPipeline(ABC): ) return output + def eval_batch( + self, + labels: List[MultiLabel], + params: Optional[dict] = None, + sas_model_name_or_path: Optional[str] = None, + sas_batch_size: int = 32, + sas_use_gpu: bool = True, + add_isolated_node_eval: bool = False, + custom_document_id_field: Optional[str] = None, + context_matching_min_length: int = 100, + context_matching_boost_split_overlaps: bool = True, + context_matching_threshold: float = 65.0, + ) -> EvaluationResult: + + """ + Evaluates the pipeline by running the queries through it in batches in the debug mode + and putting together all the data needed for evaluation, for example, calculating metrics. + + To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_or_path`. + + You can control the scope within which an Answer or a Document is considered correct afterwards (see `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`). + For some of these scopes, you need to add the following information during `eval_batch()`: + - `custom_document_id_field` parameter to select a custom document ID from the document's metadata for ID matching (only affects 'document_id' scopes). + - `context_matching_...` parameters to fine-tune the fuzzy matching mechanism that determines whether text contexts match each other (only affects 'context' scopes, default values should work most of the time). + + :param labels: The labels to evaluate on. + :param params: Parameters for the `retriever` and `reader`. For instance, + params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}. + :param sas_model_name_or_path: Sentence transformers semantic textual similarity model you want to use for the SAS value calculation. + It should be a path or a string pointing to downloadable models. + :param sas_batch_size: Number of prediction label pairs to encode at once by cross encoder or sentence transformer while calculating SAS. + :param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity. + Falls back to CPU if no GPU is available. + :param add_isolated_node_eval: Whether to additionally evaluate the reader based on labels as input, instead of the output of the previous node in the pipeline. + :param custom_document_id_field: Custom field name within `Document`'s `meta` which identifies the document and is used as a criterion for matching documents to labels during evaluation. + This is especially useful if you want to match documents on other criteria (for example, file names) than the default document IDs, as these could be heavily influenced by preprocessing. + If not set, the default `Document`'s `id` is used as the criterion for matching documents to labels. + :param context_matching_min_length: The minimum string length context and candidate need to have to be scored. + Returns 0.0 otherwise. + :param context_matching_boost_split_overlaps: Whether to boost split overlaps (for example, [AB] <-> [BC]) that result from different preprocessing parameters. + If we detect that the score is near a half match and the matching part of the candidate is at its boundaries, + we cut the context on the same side, recalculate the score, and take the mean of both. + Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. + :param context_matching_threshold: Score threshold that candidates must surpass to be included into the result list. 
Range: [0,100] + """ + output = self.pipeline.eval_batch( + labels=labels, + params=params, + sas_model_name_or_path=sas_model_name_or_path, + sas_batch_size=sas_batch_size, + sas_use_gpu=sas_use_gpu, + add_isolated_node_eval=add_isolated_node_eval, + custom_document_id_field=custom_document_id_field, + context_matching_boost_split_overlaps=context_matching_boost_split_overlaps, + context_matching_min_length=context_matching_min_length, + context_matching_threshold=context_matching_threshold, + ) + return output + def print_eval_report( self, eval_result: EvaluationResult, diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py index c3c44bd49..4df42e74d 100644 --- a/test/pipelines/test_eval.py +++ b/test/pipelines/test_eval.py @@ -54,7 +54,7 @@ def test_generativeqa_calculate_metrics( @pytest.mark.parametrize("retriever_with_docs", ["embedding"], indirect=True) def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDocumentStore, retriever_with_docs): document_store_with_docs.update_embeddings(retriever=retriever_with_docs) - summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distill-pegasus-xsum-16-4", use_gpu=-1) + summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distill-pegasus-xsum-16-4", use_gpu=False) pipeline = SearchSummarizationPipeline( retriever=retriever_with_docs, summarizer=summarizer, return_in_answer_format=True ) @@ -1277,7 +1277,6 @@ def test_multi_retriever_pipeline_with_asymmetric_qa_eval(document_store_with_do assert "ESRetriever" in eval_result assert "DPRRetriever" in eval_result - assert "DPRRetriever" in eval_result assert "QAReader" in eval_result assert len(eval_result) == 3 diff --git a/test/pipelines/test_eval_batch.py b/test/pipelines/test_eval_batch.py new file mode 100644 index 000000000..f5fe5ea38 --- /dev/null +++ b/test/pipelines/test_eval_batch.py @@ -0,0 +1,1043 @@ +import logging +import pytest +import sys +from haystack.document_stores.memory import InMemoryDocumentStore +from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore +from haystack.nodes.preprocessor import PreProcessor +from haystack.nodes.evaluator import EvalAnswers, EvalDocuments +from haystack.nodes.query_classifier.transformers import TransformersQueryClassifier +from haystack.nodes.retriever.dense import DensePassageRetriever +from haystack.nodes.retriever.sparse import BM25Retriever +from haystack.nodes.summarizer.transformers import TransformersSummarizer +from haystack.pipelines.base import Pipeline +from haystack.pipelines import ExtractiveQAPipeline, GenerativeQAPipeline, SearchSummarizationPipeline +from haystack.pipelines.standard_pipelines import ( + DocumentSearchPipeline, + FAQPipeline, + RetrieverQuestionGenerationPipeline, + TranslationWrapperPipeline, +) +from haystack.nodes.translator.transformers import TransformersTranslator +from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span + +from ..conftest import SAMPLES_PATH + + +@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Causes OOM on windows github runner") +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("retriever_with_docs", ["embedding"], indirect=True) +def test_generativeqa_calculate_metrics( + document_store_with_docs: InMemoryDocumentStore, rag_generator, retriever_with_docs +): + document_store_with_docs.update_embeddings(retriever=retriever_with_docs) + pipeline = GenerativeQAPipeline(generator=rag_generator, 
retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + assert "Retriever" in eval_result + assert "Generator" in eval_result + assert len(eval_result) == 2 + + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + assert metrics["Generator"]["exact_match"] == 0.0 + assert metrics["Generator"]["f1"] == 1.0 / 3 + + +@pytest.mark.skipif(sys.platform in ["win32", "cygwin"], reason="Causes OOM on windows github runner") +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("retriever_with_docs", ["embedding"], indirect=True) +def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDocumentStore, retriever_with_docs): + document_store_with_docs.update_embeddings(retriever=retriever_with_docs) + summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distill-pegasus-xsum-16-4", use_gpu=False) + pipeline = SearchSummarizationPipeline( + retriever=retriever_with_docs, summarizer=summarizer, return_in_answer_format=True + ) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}, context_matching_min_length=10 + ) + + metrics = eval_result.calculate_metrics(document_scope="context") + + assert "Retriever" in eval_result + assert "Summarizer" in eval_result + assert len(eval_result) == 2 + + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4) + assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4) + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 1.0 + assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4) + assert metrics["Summarizer"]["mrr"] == 1.0 + assert metrics["Summarizer"]["map"] == 0.735 + assert metrics["Summarizer"]["recall_multi_hit"] == 0.8 + assert metrics["Summarizer"]["recall_single_hit"] == 1.0 + assert metrics["Summarizer"]["precision"] == 0.8 + assert metrics["Summarizer"]["ndcg"] == pytest.approx(0.8422, 1e-4) + + +EVAL_LABELS = [ + MultiLabel( + labels=[ + Label( + query="Who lives in Berlin?", + answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]), + document=Document( + id="a0747b83aea0b60c4b114b15476dd32d", + content_type="text", + content="My name is Carla and I live in Berlin", + ), + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + ) + ] + ), + MultiLabel( + labels=[ + Label( + query="Who lives in Munich?", + answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]), + document=Document( + id="something_else", content_type="text", content="My name is Carla and I live in Munich" + ), + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + ) + ] + ), +] + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path): + labels = EVAL_LABELS[:1] + + pipeline = ExtractiveQAPipeline(reader=reader, 
retriever=retriever_with_docs) + eval_result = pipeline.eval_batch(labels=labels, params={"Retriever": {"top_k": 5}}) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + reader_result = eval_result["Reader"] + retriever_result = eval_result["Retriever"] + + assert ( + reader_result[reader_result["rank"] == 1]["answer"].iloc[0] + in reader_result[reader_result["rank"] == 1]["gold_answers"].iloc[0] + ) + assert ( + retriever_result[retriever_result["rank"] == 1]["document_id"].iloc[0] + in retriever_result[retriever_result["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert metrics["Reader"]["exact_match"] == 1.0 + assert metrics["Reader"]["f1"] == 1.0 + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["recall_multi_hit"] == 1.0 + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 0.2 + assert metrics["Retriever"]["map"] == 1.0 + assert metrics["Retriever"]["ndcg"] == 1.0 + + eval_result.save(tmp_path) + saved_eval_result = EvaluationResult.load(tmp_path) + metrics = saved_eval_result.calculate_metrics(document_scope="document_id") + + assert ( + reader_result[reader_result["rank"] == 1]["answer"].iloc[0] + in reader_result[reader_result["rank"] == 1]["gold_answers"].iloc[0] + ) + assert ( + retriever_result[retriever_result["rank"] == 1]["document_id"].iloc[0] + in retriever_result[retriever_result["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert metrics["Reader"]["exact_match"] == 1.0 + assert metrics["Reader"]["f1"] == 1.0 + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["recall_multi_hit"] == 1.0 + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 0.2 + assert metrics["Retriever"]["map"] == 1.0 + assert metrics["Retriever"]["ndcg"] == 1.0 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_path): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + reader_result = eval_result["Reader"] + retriever_result = eval_result["Retriever"] + + reader_berlin = reader_result[reader_result["query"] == "Who lives in Berlin?"] + reader_munich = reader_result[reader_result["query"] == "Who lives in Munich?"] + + retriever_berlin = retriever_result[retriever_result["query"] == "Who lives in Berlin?"] + retriever_munich = retriever_result[retriever_result["query"] == "Who lives in Munich?"] + + assert ( + reader_berlin[reader_berlin["rank"] == 1]["answer"].iloc[0] + in reader_berlin[reader_berlin["rank"] == 1]["gold_answers"].iloc[0] + ) + assert ( + retriever_berlin[retriever_berlin["rank"] == 1]["document_id"].iloc[0] + in retriever_berlin[retriever_berlin["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert ( + reader_munich[reader_munich["rank"] == 1]["answer"].iloc[0] + not in reader_munich[reader_munich["rank"] == 1]["gold_answers"].iloc[0] + ) + assert ( + retriever_munich[retriever_munich["rank"] == 1]["document_id"].iloc[0] + not in retriever_munich[retriever_munich["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert metrics["Reader"]["exact_match"] 
== 1.0 + assert metrics["Reader"]["f1"] == 1.0 + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + + eval_result.save(tmp_path) + saved_eval_result = EvaluationResult.load(tmp_path) + metrics = saved_eval_result.calculate_metrics(document_scope="document_id") + + assert ( + reader_berlin[reader_berlin["rank"] == 1]["answer"].iloc[0] + in reader_berlin[reader_berlin["rank"] == 1]["gold_answers"].iloc[0] + ) + assert ( + retriever_berlin[retriever_berlin["rank"] == 1]["document_id"].iloc[0] + in retriever_berlin[retriever_berlin["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert ( + reader_munich[reader_munich["rank"] == 1]["answer"].iloc[0] + not in reader_munich[reader_munich["rank"] == 1]["gold_answers"].iloc[0] + ) + assert ( + retriever_munich[retriever_munich["rank"] == 1]["document_id"].iloc[0] + not in retriever_munich[retriever_munich["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert metrics["Reader"]["exact_match"] == 1.0 + assert metrics["Reader"]["f1"] == 1.0 + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_sas(reader, retriever_with_docs): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, + params={"Retriever": {"top_k": 5}}, + sas_model_name_or_path="sentence-transformers/paraphrase-MiniLM-L3-v2", + ) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + assert metrics["Reader"]["exact_match"] == 1.0 + assert metrics["Reader"]["f1"] == 1.0 + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + assert "sas" in metrics["Reader"] + assert metrics["Reader"]["sas"] == pytest.approx(1.0) + + +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_reader_eval_in_pipeline(reader): + pipeline = Pipeline() + pipeline.add_node(component=reader, name="Reader", inputs=["Query"]) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, + documents=[[label.document for label in multilabel.labels] for multilabel in EVAL_LABELS], + params={}, + ) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + assert metrics["Reader"]["exact_match"] == 1.0 + assert metrics["Reader"]["f1"] == 1.0 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_document_scope(reader, retriever_with_docs): + pipeline = ExtractiveQAPipeline(reader=reader, 
retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, + params={"Retriever": {"top_k": 5}}, + context_matching_min_length=20, # artificially set down min_length to see if context matching is working properly + ) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + + metrics = eval_result.calculate_metrics(document_scope="context") + + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4) + assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4) + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 1.0 + assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4) + + metrics = eval_result.calculate_metrics(document_scope="document_id_and_context") + + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + + metrics = eval_result.calculate_metrics(document_scope="document_id_or_context") + + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4) + assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4) + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 1.0 + assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4) + + metrics = eval_result.calculate_metrics(document_scope="answer") + + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["map"] == 0.75 + assert metrics["Retriever"]["recall_multi_hit"] == 0.75 + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 0.2 + assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4) + + metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer") + + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["map"] == 0.75 + assert metrics["Retriever"]["recall_multi_hit"] == 0.75 + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 0.2 + assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4) + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_answer_scope(reader, retriever_with_docs): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, + params={"Retriever": {"top_k": 5}}, + sas_model_name_or_path="sentence-transformers/paraphrase-MiniLM-L3-v2", + context_matching_min_length=20, # artificially set down min_length to see if context matching is working properly + ) + + metrics = eval_result.calculate_metrics(answer_scope="any") + + assert metrics["Retriever"]["mrr"] == 1.0 + assert 
metrics["Retriever"]["map"] == 0.75 + assert metrics["Retriever"]["recall_multi_hit"] == 0.75 + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 0.2 + assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4) + assert metrics["Reader"]["exact_match"] == 1.0 + assert metrics["Reader"]["f1"] == 1.0 + assert metrics["Reader"]["sas"] == pytest.approx(1.0) + + metrics = eval_result.calculate_metrics(answer_scope="context") + + assert metrics["Retriever"]["mrr"] == 1.0 + assert metrics["Retriever"]["map"] == 0.75 + assert metrics["Retriever"]["recall_multi_hit"] == 0.75 + assert metrics["Retriever"]["recall_single_hit"] == 1.0 + assert metrics["Retriever"]["precision"] == 0.2 + assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4) + assert metrics["Reader"]["exact_match"] == 1.0 + assert metrics["Reader"]["f1"] == 1.0 + assert metrics["Reader"]["sas"] == pytest.approx(1.0) + + metrics = eval_result.calculate_metrics(answer_scope="document_id") + + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + assert metrics["Reader"]["exact_match"] == 0.5 + assert metrics["Reader"]["f1"] == 0.5 + assert metrics["Reader"]["sas"] == pytest.approx(0.5) + + metrics = eval_result.calculate_metrics(answer_scope="document_id_and_context") + + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + assert metrics["Reader"]["exact_match"] == 0.5 + assert metrics["Reader"]["f1"] == 0.5 + assert metrics["Reader"]["sas"] == pytest.approx(0.5) + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_answer_document_scope_combinations(reader, retriever_with_docs, caplog): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, + params={"Retriever": {"top_k": 5}}, + sas_model_name_or_path="sentence-transformers/paraphrase-MiniLM-L3-v2", + context_matching_min_length=20, # artificially set down min_length to see if context matching is working properly + ) + + # valid values for non default answer_scopes + with caplog.at_level(logging.WARNING): + metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer", answer_scope="context") + metrics = eval_result.calculate_metrics(document_scope="answer", answer_scope="context") + assert "You specified a non-answer document_scope together with a non-default answer_scope" not in caplog.text + + with caplog.at_level(logging.WARNING): + metrics = eval_result.calculate_metrics(document_scope="document_id", answer_scope="context") + assert "You specified a non-answer document_scope together with a non-default answer_scope" in caplog.text + + with caplog.at_level(logging.WARNING): + metrics = eval_result.calculate_metrics(document_scope="context", answer_scope="context") + assert "You specified a non-answer document_scope 
together with a non-default answer_scope" in caplog.text + + with caplog.at_level(logging.WARNING): + metrics = eval_result.calculate_metrics(document_scope="document_id_and_context", answer_scope="context") + assert "You specified a non-answer document_scope together with a non-default answer_scope" in caplog.text + + with caplog.at_level(logging.WARNING): + metrics = eval_result.calculate_metrics(document_scope="document_id_or_context", answer_scope="context") + assert "You specified a non-answer document_scope together with a non-default answer_scope" in caplog.text + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, + params={"Retriever": {"top_k": 5}}, + sas_model_name_or_path="sentence-transformers/paraphrase-MiniLM-L3-v2", + ) + + metrics_top_1 = eval_result.calculate_metrics(simulated_top_k_reader=1, document_scope="document_id") + + assert metrics_top_1["Reader"]["exact_match"] == 0.5 + assert metrics_top_1["Reader"]["f1"] == 0.5 + assert metrics_top_1["Reader"]["sas"] == pytest.approx(0.5833, abs=1e-4) + assert metrics_top_1["Retriever"]["mrr"] == 0.5 + assert metrics_top_1["Retriever"]["map"] == 0.5 + assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_1["Retriever"]["precision"] == 0.1 + assert metrics_top_1["Retriever"]["ndcg"] == 0.5 + + metrics_top_2 = eval_result.calculate_metrics(simulated_top_k_reader=2, document_scope="document_id") + + assert metrics_top_2["Reader"]["exact_match"] == 0.5 + assert metrics_top_2["Reader"]["f1"] == 0.5 + assert metrics_top_2["Reader"]["sas"] == pytest.approx(0.5833, abs=1e-4) + assert metrics_top_2["Retriever"]["mrr"] == 0.5 + assert metrics_top_2["Retriever"]["map"] == 0.5 + assert metrics_top_2["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_2["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_2["Retriever"]["precision"] == 0.1 + assert metrics_top_2["Retriever"]["ndcg"] == 0.5 + + metrics_top_3 = eval_result.calculate_metrics(simulated_top_k_reader=3, document_scope="document_id") + + assert metrics_top_3["Reader"]["exact_match"] == 1.0 + assert metrics_top_3["Reader"]["f1"] == 1.0 + assert metrics_top_3["Reader"]["sas"] == pytest.approx(1.0, abs=1e-4) + assert metrics_top_3["Retriever"]["mrr"] == 0.5 + assert metrics_top_3["Retriever"]["map"] == 0.5 + assert metrics_top_3["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_3["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_3["Retriever"]["precision"] == 0.1 + assert metrics_top_3["Retriever"]["ndcg"] == 0.5 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_docs): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}) + + metrics_top_10 = 
eval_result.calculate_metrics(document_scope="document_id") + + assert metrics_top_10["Reader"]["exact_match"] == 1.0 + assert metrics_top_10["Reader"]["f1"] == 1.0 + assert metrics_top_10["Retriever"]["mrr"] == 0.5 + assert metrics_top_10["Retriever"]["map"] == 0.5 + assert metrics_top_10["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_10["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_10["Retriever"]["precision"] == 0.1 + assert metrics_top_10["Retriever"]["ndcg"] == 0.5 + + metrics_top_1 = eval_result.calculate_metrics(simulated_top_k_retriever=1, document_scope="document_id") + + assert metrics_top_1["Reader"]["exact_match"] == 1.0 + assert metrics_top_1["Reader"]["f1"] == 1.0 + assert metrics_top_1["Retriever"]["mrr"] == 0.5 + assert metrics_top_1["Retriever"]["map"] == 0.5 + assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_1["Retriever"]["precision"] == 0.5 + assert metrics_top_1["Retriever"]["ndcg"] == 0.5 + + metrics_top_2 = eval_result.calculate_metrics(simulated_top_k_retriever=2, document_scope="document_id") + + assert metrics_top_2["Reader"]["exact_match"] == 1.0 + assert metrics_top_2["Reader"]["f1"] == 1.0 + assert metrics_top_2["Retriever"]["mrr"] == 0.5 + assert metrics_top_2["Retriever"]["map"] == 0.5 + assert metrics_top_2["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_2["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_2["Retriever"]["precision"] == 0.25 + assert metrics_top_2["Retriever"]["ndcg"] == 0.5 + + metrics_top_3 = eval_result.calculate_metrics(simulated_top_k_retriever=3, document_scope="document_id") + + assert metrics_top_3["Reader"]["exact_match"] == 1.0 + assert metrics_top_3["Reader"]["f1"] == 1.0 + assert metrics_top_3["Retriever"]["mrr"] == 0.5 + assert metrics_top_3["Retriever"]["map"] == 0.5 + assert metrics_top_3["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_3["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_3["Retriever"]["precision"] == 1.0 / 6 + assert metrics_top_3["Retriever"]["ndcg"] == 0.5 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriever_with_docs): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 10}}) + + metrics_top_10 = eval_result.calculate_metrics(simulated_top_k_reader=1, document_scope="document_id") + + assert metrics_top_10["Reader"]["exact_match"] == 0.5 + assert metrics_top_10["Reader"]["f1"] == 0.5 + assert metrics_top_10["Retriever"]["mrr"] == 0.5 + assert metrics_top_10["Retriever"]["map"] == 0.5 + assert metrics_top_10["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_10["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_10["Retriever"]["precision"] == 0.1 + assert metrics_top_10["Retriever"]["ndcg"] == 0.5 + + metrics_top_1 = eval_result.calculate_metrics( + simulated_top_k_reader=1, simulated_top_k_retriever=1, document_scope="document_id" + ) + + assert metrics_top_1["Reader"]["exact_match"] == 1.0 + assert metrics_top_1["Reader"]["f1"] == 1.0 + + assert metrics_top_1["Retriever"]["mrr"] == 0.5 + assert metrics_top_1["Retriever"]["map"] == 0.5 
+ assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_1["Retriever"]["precision"] == 0.5 + assert metrics_top_1["Retriever"]["ndcg"] == 0.5 + + metrics_top_2 = eval_result.calculate_metrics( + simulated_top_k_reader=1, simulated_top_k_retriever=2, document_scope="document_id" + ) + + assert metrics_top_2["Reader"]["exact_match"] == 0.5 + assert metrics_top_2["Reader"]["f1"] == 0.5 + assert metrics_top_2["Retriever"]["mrr"] == 0.5 + assert metrics_top_2["Retriever"]["map"] == 0.5 + assert metrics_top_2["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_2["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_2["Retriever"]["precision"] == 0.25 + assert metrics_top_2["Retriever"]["ndcg"] == 0.5 + + metrics_top_3 = eval_result.calculate_metrics( + simulated_top_k_reader=1, simulated_top_k_retriever=3, document_scope="document_id" + ) + + assert metrics_top_3["Reader"]["exact_match"] == 0.5 + assert metrics_top_3["Reader"]["f1"] == 0.5 + assert metrics_top_3["Retriever"]["mrr"] == 0.5 + assert metrics_top_3["Retriever"]["map"] == 0.5 + assert metrics_top_3["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_3["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_3["Retriever"]["precision"] == 1.0 / 6 + assert metrics_top_3["Retriever"]["ndcg"] == 0.5 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_isolated(reader, retriever_with_docs): + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch( + labels=EVAL_LABELS, + sas_model_name_or_path="sentence-transformers/paraphrase-MiniLM-L3-v2", + add_isolated_node_eval=True, + ) + + metrics_top_1 = eval_result.calculate_metrics(simulated_top_k_reader=1, document_scope="document_id") + + assert metrics_top_1["Reader"]["exact_match"] == 0.5 + assert metrics_top_1["Reader"]["f1"] == 0.5 + assert metrics_top_1["Reader"]["sas"] == pytest.approx(0.5833, abs=1e-4) + assert metrics_top_1["Retriever"]["mrr"] == 0.5 + assert metrics_top_1["Retriever"]["map"] == 0.5 + assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5 + assert metrics_top_1["Retriever"]["precision"] == 1.0 / 10 + assert metrics_top_1["Retriever"]["ndcg"] == 0.5 + + metrics_top_1 = eval_result.calculate_metrics(simulated_top_k_reader=1, eval_mode="isolated") + + assert metrics_top_1["Reader"]["exact_match"] == 1.0 + assert metrics_top_1["Reader"]["f1"] == 1.0 + assert metrics_top_1["Reader"]["sas"] == pytest.approx(1.0, abs=1e-4) + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs): + + labels = [ + MultiLabel( + labels=[ + Label( + query="Who lives in Berlin?", + answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]), + document=Document( + id="a0747b83aea0b60c4b114b15476dd32d", + content_type="text", + content="My name is Carla and I live in Berlin", + ), + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + ) + ] + ), + MultiLabel( + labels=[ + Label( 
+ query="Who lives in Munich?", + answer=Answer(answer="Pete", offsets_in_context=[Span(11, 16)]), + document=Document( + id="something_else", content_type="text", content="My name is Pete and I live in Munich" + ), + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + ) + ] + ), + ] + + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=labels, params={"Retriever": {"top_k": 5}}) + + wrongs_retriever = eval_result.wrong_examples(node="Retriever", n=1) + wrongs_reader = eval_result.wrong_examples(node="Reader", n=1) + + assert len(wrongs_retriever) == 1 + assert len(wrongs_reader) == 1 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_extractive_qa_print_eval_report(reader, retriever_with_docs): + + labels = [ + MultiLabel( + labels=[ + Label( + query="Who lives in Berlin?", + answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]), + document=Document( + id="a0747b83aea0b60c4b114b15476dd32d", + content_type="text", + content="My name is Carla and I live in Berlin", + ), + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + ) + ] + ), + MultiLabel( + labels=[ + Label( + query="Who lives in Munich?", + answer=Answer(answer="Pete", offsets_in_context=[Span(11, 16)]), + document=Document( + id="something_else", content_type="text", content="My name is Pete and I live in Munich" + ), + is_correct_answer=True, + is_correct_document=True, + origin="gold-label", + ) + ] + ), + ] + + pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=labels, params={"Retriever": {"top_k": 5}}) + pipeline.print_eval_report(eval_result) + + # in addition with labels as input to reader node rather than output of retriever node + eval_result: EvaluationResult = pipeline.eval_batch( + labels=labels, params={"Retriever": {"top_k": 5}}, add_isolated_node_eval=True + ) + pipeline.print_eval_report(eval_result) + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +def test_document_search_calculate_metrics(retriever_with_docs): + pipeline = DocumentSearchPipeline(retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + assert "Retriever" in eval_result + assert len(eval_result) == 1 + retriever_result = eval_result["Retriever"] + retriever_berlin = retriever_result[retriever_result["query"] == "Who lives in Berlin?"] + retriever_munich = retriever_result[retriever_result["query"] == "Who lives in Munich?"] + + assert ( + retriever_berlin[retriever_berlin["rank"] == 1]["document_id"].iloc[0] + in retriever_berlin[retriever_berlin["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert ( + retriever_munich[retriever_munich["rank"] == 1]["document_id"].iloc[0] + not in retriever_munich[retriever_munich["rank"] == 1]["gold_document_ids"].iloc[0] + ) + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + 
assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +def test_faq_calculate_metrics(retriever_with_docs): + pipeline = FAQPipeline(retriever=retriever_with_docs) + eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + assert "Retriever" in eval_result + assert "Docs2Answers" in eval_result + assert len(eval_result) == 2 + + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + assert metrics["Docs2Answers"]["exact_match"] == 0.0 + assert metrics["Docs2Answers"]["f1"] == 0.0 + + +# Commented out because of the following issue https://github.com/deepset-ai/haystack/issues/2964 +# @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +# @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +# @pytest.mark.parametrize("reader", ["farm"], indirect=True) +# def test_extractive_qa_eval_translation(reader, retriever_with_docs): +# +# # FIXME it makes no sense to have DE->EN input and DE->EN output, right? +# # Yet switching direction breaks the test. TO BE FIXED. +# input_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en") +# output_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en") +# +# pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) +# pipeline = TranslationWrapperPipeline( +# input_translator=input_translator, output_translator=output_translator, pipeline=pipeline +# ) +# eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}) +# +# metrics = eval_result.calculate_metrics(document_scope="document_id") +# +# assert "Retriever" in eval_result +# assert "Reader" in eval_result +# assert "OutputTranslator" in eval_result +# assert len(eval_result) == 3 +# +# assert metrics["Reader"]["exact_match"] == 1.0 +# assert metrics["Reader"]["f1"] == 1.0 +# assert metrics["Retriever"]["mrr"] == 0.5 +# assert metrics["Retriever"]["map"] == 0.5 +# assert metrics["Retriever"]["recall_multi_hit"] == 0.5 +# assert metrics["Retriever"]["recall_single_hit"] == 0.5 +# assert metrics["Retriever"]["precision"] == 0.1 +# assert metrics["Retriever"]["ndcg"] == 0.5 +# +# assert metrics["OutputTranslator"]["exact_match"] == 1.0 +# assert metrics["OutputTranslator"]["f1"] == 1.0 +# assert metrics["OutputTranslator"]["mrr"] == 0.5 +# assert metrics["OutputTranslator"]["map"] == 0.5 +# assert metrics["OutputTranslator"]["recall_multi_hit"] == 0.5 +# assert metrics["OutputTranslator"]["recall_single_hit"] == 0.5 +# assert metrics["OutputTranslator"]["precision"] == 0.1 +# assert metrics["OutputTranslator"]["ndcg"] == 0.5 + + +@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) +def test_question_generation_eval(retriever_with_docs, question_generator): + pipeline = RetrieverQuestionGenerationPipeline(retriever=retriever_with_docs, 
question_generator=question_generator) + + eval_result: EvaluationResult = pipeline.eval_batch(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}}) + + metrics = eval_result.calculate_metrics(document_scope="document_id") + + assert "Retriever" in eval_result + assert "Question Generator" in eval_result + assert len(eval_result) == 2 + + assert metrics["Retriever"]["mrr"] == 0.5 + assert metrics["Retriever"]["map"] == 0.5 + assert metrics["Retriever"]["recall_multi_hit"] == 0.5 + assert metrics["Retriever"]["recall_single_hit"] == 0.5 + assert metrics["Retriever"]["precision"] == 0.1 + assert metrics["Retriever"]["ndcg"] == 0.5 + + assert metrics["Question Generator"]["mrr"] == 0.5 + assert metrics["Question Generator"]["map"] == 0.5 + assert metrics["Question Generator"]["recall_multi_hit"] == 0.5 + assert metrics["Question Generator"]["recall_single_hit"] == 0.5 + assert metrics["Question Generator"]["precision"] == 0.1 + assert metrics["Question Generator"]["ndcg"] == 0.5 + + +# Commented out because of the following issue https://github.com/deepset-ai/haystack/issues/2962 +# @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +# @pytest.mark.parametrize("reader", ["farm"], indirect=True) +# def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader): +# es_retriever = BM25Retriever(document_store=document_store_with_docs) +# dpr_retriever = DensePassageRetriever(document_store_with_docs) +# document_store_with_docs.update_embeddings(retriever=dpr_retriever) +# +# # QA Pipeline with two retrievers, we always want QA output +# pipeline = Pipeline() +# pipeline.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) +# pipeline.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) +# pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) +# pipeline.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"]) +# +# # EVAL_QUERIES: 2 go dpr way +# # in Berlin goes es way +# labels = EVAL_LABELS + [ +# MultiLabel( +# labels=[ +# Label( +# query="in Berlin", +# answer=Answer(answer="Carla", offsets_in_context=[Span(11, 16)]), +# document=Document( +# id="a0747b83aea0b60c4b114b15476dd32d", +# content_type="text", +# content="My name is Carla and I live in Berlin", +# ), +# is_correct_answer=True, +# is_correct_document=True, +# origin="gold-label", +# ) +# ] +# ) +# ] +# +# eval_result: EvaluationResult = pipeline.eval_batch( +# labels=labels, params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}} +# ) +# +# metrics = eval_result.calculate_metrics(document_scope="document_id") +# +# assert "ESRetriever" in eval_result +# assert "DPRRetriever" in eval_result +# assert "QAReader" in eval_result +# assert len(eval_result) == 3 +# +# assert metrics["DPRRetriever"]["mrr"] == 0.5 +# assert metrics["DPRRetriever"]["map"] == 0.5 +# assert metrics["DPRRetriever"]["recall_multi_hit"] == 0.5 +# assert metrics["DPRRetriever"]["recall_single_hit"] == 0.5 +# assert metrics["DPRRetriever"]["precision"] == 0.1 +# assert metrics["DPRRetriever"]["ndcg"] == 0.5 +# +# assert metrics["ESRetriever"]["mrr"] == 1.0 +# assert metrics["ESRetriever"]["map"] == 1.0 +# assert metrics["ESRetriever"]["recall_multi_hit"] == 1.0 +# assert metrics["ESRetriever"]["recall_single_hit"] == 1.0 +# assert metrics["ESRetriever"]["precision"] == 0.2 +# assert metrics["ESRetriever"]["ndcg"] == 1.0 +# +# assert 
metrics["QAReader"]["exact_match"] == 1.0 +# assert metrics["QAReader"]["f1"] == 1.0 + + +# Commented out because of the following issue https://github.com/deepset-ai/haystack/issues/2962 +# @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +# def test_multi_retriever_pipeline_eval(document_store_with_docs): +# es_retriever = BM25Retriever(document_store=document_store_with_docs) +# dpr_retriever = DensePassageRetriever(document_store_with_docs) +# document_store_with_docs.update_embeddings(retriever=dpr_retriever) +# +# # QA Pipeline with two retrievers, no QA output +# pipeline = Pipeline() +# pipeline.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) +# pipeline.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) +# pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) +# +# # EVAL_QUERIES: 2 go dpr way +# # in Berlin goes es way +# labels = EVAL_LABELS + [ +# MultiLabel( +# labels=[ +# Label( +# query="in Berlin", +# answer=None, +# document=Document( +# id="a0747b83aea0b60c4b114b15476dd32d", +# content_type="text", +# content="My name is Carla and I live in Berlin", +# ), +# is_correct_answer=True, +# is_correct_document=True, +# origin="gold-label", +# ) +# ] +# ) +# ] +# +# eval_result: EvaluationResult = pipeline.eval_batch( +# labels=labels, params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}} +# ) +# +# metrics = eval_result.calculate_metrics(document_scope="document_id") +# +# assert "ESRetriever" in eval_result +# assert "DPRRetriever" in eval_result +# assert len(eval_result) == 2 +# +# assert metrics["DPRRetriever"]["mrr"] == 0.5 +# assert metrics["DPRRetriever"]["map"] == 0.5 +# assert metrics["DPRRetriever"]["recall_multi_hit"] == 0.5 +# assert metrics["DPRRetriever"]["recall_single_hit"] == 0.5 +# assert metrics["DPRRetriever"]["precision"] == 0.1 +# assert metrics["DPRRetriever"]["ndcg"] == 0.5 +# +# assert metrics["ESRetriever"]["mrr"] == 1.0 +# assert metrics["ESRetriever"]["map"] == 1.0 +# assert metrics["ESRetriever"]["recall_multi_hit"] == 1.0 +# assert metrics["ESRetriever"]["recall_single_hit"] == 1.0 +# assert metrics["ESRetriever"]["precision"] == 0.2 +# assert metrics["ESRetriever"]["ndcg"] == 1.0 + + +# Commented out because of the following issue https://github.com/deepset-ai/haystack/issues/2962 +# @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +# @pytest.mark.parametrize("reader", ["farm"], indirect=True) +# def test_multi_retriever_pipeline_with_asymmetric_qa_eval(document_store_with_docs, reader): +# es_retriever = BM25Retriever(document_store=document_store_with_docs) +# dpr_retriever = DensePassageRetriever(document_store_with_docs) +# document_store_with_docs.update_embeddings(retriever=dpr_retriever) +# +# # QA Pipeline with two retrievers, we only get QA output from dpr +# pipeline = Pipeline() +# pipeline.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) +# pipeline.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) +# pipeline.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) +# pipeline.add_node(component=reader, name="QAReader", inputs=["DPRRetriever"]) +# +# # EVAL_QUERIES: 2 go dpr way +# # in Berlin goes es way +# labels = EVAL_LABELS + [ +# MultiLabel( +# labels=[ +# Label( +# query="in Berlin", +# 
answer=None, +# document=Document( +# id="a0747b83aea0b60c4b114b15476dd32d", +# content_type="text", +# content="My name is Carla and I live in Berlin", +# ), +# is_correct_answer=True, +# is_correct_document=True, +# origin="gold-label", +# ) +# ] +# ) +# ] +# +# eval_result: EvaluationResult = pipeline.eval_batch( +# labels=labels, params={"ESRetriever": {"top_k": 5}, "DPRRetriever": {"top_k": 5}} +# ) +# +# metrics = eval_result.calculate_metrics(document_scope="document_id") +# +# assert "ESRetriever" in eval_result +# assert "DPRRetriever" in eval_result +# assert "QAReader" in eval_result +# assert len(eval_result) == 3 +# +# assert metrics["DPRRetriever"]["mrr"] == 0.5 +# assert metrics["DPRRetriever"]["map"] == 0.5 +# assert metrics["DPRRetriever"]["recall_multi_hit"] == 0.5 +# assert metrics["DPRRetriever"]["recall_single_hit"] == 0.5 +# assert metrics["DPRRetriever"]["precision"] == 0.1 +# assert metrics["DPRRetriever"]["ndcg"] == 0.5 +# +# assert metrics["ESRetriever"]["mrr"] == 1.0 +# assert metrics["ESRetriever"]["map"] == 1.0 +# assert metrics["ESRetriever"]["recall_multi_hit"] == 1.0 +# assert metrics["ESRetriever"]["recall_single_hit"] == 1.0 +# assert metrics["ESRetriever"]["precision"] == 0.2 +# assert metrics["ESRetriever"]["ndcg"] == 1.0 +# +# assert metrics["QAReader"]["exact_match"] == 1.0 +# assert metrics["QAReader"]["f1"] == 1.0
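
The diff above adds a batch evaluation entry point (`Pipeline.eval_batch` and `BaseStandardPipeline.eval_batch`) together with the new `test/pipelines/test_eval_batch.py` suite exercising it. Below is a minimal usage sketch of that API, assuming a Haystack v1.x setup in which `reader`, `retriever`, and `labels` are supplied by the caller and the document store is already populated; the `evaluate_in_batches` helper name is illustrative, not part of the patch.

```python
# Minimal sketch of the batch evaluation API added in this diff.
# Assumes a Haystack v1.x environment; `reader`, `retriever`, and `labels`
# are placeholders provided by the caller.
from typing import List

from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import EvaluationResult, MultiLabel


def evaluate_in_batches(reader, retriever, labels: List[MultiLabel]) -> EvaluationResult:
    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

    # Run all label queries through the pipeline in batches (debug mode) and
    # collect everything needed for metric calculation.
    eval_result: EvaluationResult = pipeline.eval_batch(
        labels=labels,
        params={"Retriever": {"top_k": 5}},
        add_isolated_node_eval=True,  # additionally evaluate the reader on gold documents
    )

    # Scopes are applied after the fact, so a single eval run can be sliced in several ways.
    metrics = eval_result.calculate_metrics(
        document_scope="document_id_or_answer", answer_scope="any"
    )
    print(metrics["Retriever"]["recall_single_hit"], metrics["Reader"]["f1"])

    pipeline.print_eval_report(eval_result)
    return eval_result
```

As the tests above show, the returned `EvaluationResult` can be re-scored with other `document_scope`/`answer_scope` combinations, or with `simulated_top_k_reader`/`simulated_top_k_retriever`, without re-running the pipeline.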