Fix corrupted csv from EvaluationResult.save() (#2854)

* fix corrupted csv if text contains \r chars; make csv serialization configurable

* Update Documentation & Code Style

* incorporate feedback

* Update Documentation & Code Style

* adjust columns to be converted during loading

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
tstadel 2022-07-21 16:31:07 +02:00 committed by GitHub
parent e350781825
commit 11c46006df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 8 deletions

View File

@ -523,7 +523,7 @@ In Question Answering, to enforce that the retrieved document is considered corr
#### EvaluationResult.save
```python
def save(out_dir: Union[str, Path])
def save(out_dir: Union[str, Path], **to_csv_kwargs)
```
Saves the evaluation result.
@ -533,6 +533,9 @@ The result of each node is saved in a separate csv with file name {node_name}.cs
**Arguments**:
- `out_dir`: Path to the target folder where the csvs will be saved.
- `to_csv_kwargs`: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
<a id="schema.EvaluationResult.load"></a>
@ -540,7 +543,7 @@ The result of each node is saved in a separate csv with file name {node_name}.cs
```python
@classmethod
def load(cls, load_dir: Union[str, Path])
def load(cls, load_dir: Union[str, Path], **read_csv_kwargs)
```
Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
@ -548,4 +551,8 @@ Loads the evaluation result from disk. Expects one csv file per node. See save()
**Arguments**:
- `load_dir`: The directory containing the csv files.
- `read_csv_kwargs`: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
This method uses different default values than pd.read_csv() for the following parameters:
header=0, converters=CONVERTERS
where CONVERTERS is a dictionary mapping all array-typed columns to ast.literal_eval.

View File

@ -1,4 +1,5 @@
from __future__ import annotations
import csv
import typing
from typing import Any, Optional, Dict, List, Union
@ -1346,12 +1347,15 @@ class EvaluationResult:
metrics_df = pd.DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
return metrics_df
def save(self, out_dir: Union[str, Path]):
def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
"""
Saves the evaluation result.
The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.
:param out_dir: Path to the target folder the csvs will be saved.
:param to_csv_kwargs: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
"""
out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
logger.info(f"Saving evaluation results to {out_dir}")
@ -1359,29 +1363,45 @@ class EvaluationResult:
out_dir.mkdir(parents=True)
for node_name, df in self.node_results.items():
target_path = out_dir / f"{node_name}.csv"
df.to_csv(target_path, index=False, header=True)
default_to_csv_kwargs = {
"index": False,
"quoting": csv.QUOTE_NONNUMERIC, # avoids problems with \r chars in texts by enclosing all string values in quotes
}
to_csv_kwargs = {**default_to_csv_kwargs, **to_csv_kwargs}
df.to_csv(target_path, **to_csv_kwargs)
@classmethod
def load(cls, load_dir: Union[str, Path]):
def load(cls, load_dir: Union[str, Path], **read_csv_kwargs):
"""
Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
:param load_dir: The directory containing the csv files.
:param read_csv_kwargs: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
This method uses different default values than pd.read_csv() for the following parameters:
header=0, converters=CONVERTERS
where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
"""
load_dir = load_dir if isinstance(load_dir, Path) else Path(load_dir)
csv_files = [file for file in load_dir.iterdir() if file.is_file() and file.suffix == ".csv"]
cols_to_convert = [
"filters",
"gold_document_ids",
"gold_custom_document_ids",
"gold_contexts",
"gold_answers",
"gold_documents_id_match",
"gold_offsets_in_documents",
"gold_answers_exact_match",
"gold_answers_f1",
"gold_answers_document_id_match",
"gold_context_similarity",
"gold_answers_sas",
"gold_answers_match",
"gold_contexts_similarity",
"offsets_in_document",
]
converters = dict.fromkeys(cols_to_convert, ast.literal_eval)
node_results = {file.stem: pd.read_csv(file, header=0, converters=converters) for file in csv_files}
default_read_csv_kwargs = {"converters": converters, "header": 0}
read_csv_kwargs = {**default_read_csv_kwargs, **read_csv_kwargs}
node_results = {file.stem: pd.read_csv(file, **read_csv_kwargs) for file in csv_files}
# backward compatibility mappings
for df in node_results.values():
df.rename(columns={"gold_document_contents": "gold_contexts", "content": "context"}, inplace=True)