Fix corrupted csv from EvaluationResult.save() (#2854)

* fix corrupted csv if text contains \r chars; make csv serialization configurable

* Update Documentation & Code Style

* incorporate feedback

* Update Documentation & Code Style

* adjust columns to be converted during loading

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
tstadel 2022-07-21 16:31:07 +02:00 committed by GitHub
parent e350781825
commit 11c46006df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 8 deletions

View File

@ -523,7 +523,7 @@ In Question Answering, to enforce that the retrieved document is considered corr
 #### EvaluationResult.save

 ```python
-def save(out_dir: Union[str, Path])
+def save(out_dir: Union[str, Path], **to_csv_kwargs)
 ```

 Saves the evaluation result.
@ -533,6 +533,9 @@ The result of each node is saved in a separate csv with file name {node_name}.cs
 **Arguments**:

 - `out_dir`: Path to the target folder the csvs will be saved.
+- `to_csv_kwargs`: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
+    This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
+    index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
 <a id="schema.EvaluationResult.load"></a>
@ -540,7 +543,7 @@ The result of each node is saved in a separate csv with file name {node_name}.cs
 ```python
 @classmethod
-def load(cls, load_dir: Union[str, Path])
+def load(cls, load_dir: Union[str, Path], **read_csv_kwargs)
 ```

 Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
@ -548,4 +551,8 @@ Loads the evaluation result from disk. Expects one csv file per node. See save()
 **Arguments**:

 - `load_dir`: The directory containing the csv files.
+- `read_csv_kwargs`: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
+    This method uses different default values than pd.read_csv() for the following parameters:
+    header=0, converters=CONVERTERS
+    where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.

View File

@ -1,4 +1,5 @@
 from __future__ import annotations

+import csv
 import typing
 from typing import Any, Optional, Dict, List, Union
@ -1346,12 +1347,15 @@ class EvaluationResult:
         metrics_df = pd.DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
         return metrics_df

-    def save(self, out_dir: Union[str, Path]):
+    def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
         """
         Saves the evaluation result.
         The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.

         :param out_dir: Path to the target folder the csvs will be saved.
+        :param to_csv_kwargs: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
+            This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
+            index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
         """
         out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
         logger.info(f"Saving evaluation results to {out_dir}")
@ -1359,29 +1363,45 @@ class EvaluationResult:
             out_dir.mkdir(parents=True)
         for node_name, df in self.node_results.items():
             target_path = out_dir / f"{node_name}.csv"
-            df.to_csv(target_path, index=False, header=True)
+            default_to_csv_kwargs = {
+                "index": False,
+                "quoting": csv.QUOTE_NONNUMERIC,  # avoids problems with \r chars in texts by enclosing all string values in quotes
+            }
+            to_csv_kwargs = {**default_to_csv_kwargs, **to_csv_kwargs}
+            df.to_csv(target_path, **to_csv_kwargs)
     @classmethod
-    def load(cls, load_dir: Union[str, Path]):
+    def load(cls, load_dir: Union[str, Path], **read_csv_kwargs):
         """
         Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.

         :param load_dir: The directory containing the csv files.
+        :param read_csv_kwargs: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
+            This method uses different default values than pd.read_csv() for the following parameters:
+            header=0, converters=CONVERTERS
+            where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
         """
         load_dir = load_dir if isinstance(load_dir, Path) else Path(load_dir)
         csv_files = [file for file in load_dir.iterdir() if file.is_file() and file.suffix == ".csv"]
         cols_to_convert = [
+            "filters",
             "gold_document_ids",
+            "gold_custom_document_ids",
             "gold_contexts",
             "gold_answers",
+            "gold_documents_id_match",
             "gold_offsets_in_documents",
             "gold_answers_exact_match",
             "gold_answers_f1",
-            "gold_answers_document_id_match",
-            "gold_context_similarity",
+            "gold_answers_sas",
+            "gold_answers_match",
+            "gold_contexts_similarity",
+            "offsets_in_document",
         ]
         converters = dict.fromkeys(cols_to_convert, ast.literal_eval)
-        node_results = {file.stem: pd.read_csv(file, header=0, converters=converters) for file in csv_files}
+        default_read_csv_kwargs = {"converters": converters, "header": 0}
+        read_csv_kwargs = {**default_read_csv_kwargs, **read_csv_kwargs}
+        node_results = {file.stem: pd.read_csv(file, **read_csv_kwargs) for file in csv_files}

         # backward compatibility mappings
         for df in node_results.values():
             df.rename(columns={"gold_document_contents": "gold_contexts", "content": "context"}, inplace=True)