mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-16 18:39:03 +00:00
Fix corrupted csv from EvaluationResult.save()
(#2854)
* fix corrupted csv if text contains \r chars; make csv serialization configurable * Update Documentation & Code Style * incorporate feedback * Update Documentation & Code Style * adjust columns to be converted during loading Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
e350781825
commit
11c46006df
@ -523,7 +523,7 @@ In Question Answering, to enforce that the retrieved document is considered corr
|
||||
#### EvaluationResult.save
|
||||
|
||||
```python
|
||||
def save(out_dir: Union[str, Path])
|
||||
def save(out_dir: Union[str, Path], **to_csv_kwargs)
|
||||
```
|
||||
|
||||
Saves the evaluation result.
|
||||
@ -533,6 +533,9 @@ The result of each node is saved in a separate csv with file name {node_name}.cs
|
||||
**Arguments**:
|
||||
|
||||
- `out_dir`: Path to the target folder the csvs will be saved.
|
||||
- `to_csv_kwargs`: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
|
||||
This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
|
||||
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
|
||||
|
||||
<a id="schema.EvaluationResult.load"></a>
|
||||
|
||||
@ -540,7 +543,7 @@ The result of each node is saved in a separate csv with file name {node_name}.cs
|
||||
|
||||
```python
|
||||
@classmethod
|
||||
def load(cls, load_dir: Union[str, Path])
|
||||
def load(cls, load_dir: Union[str, Path], **read_csv_kwargs)
|
||||
```
|
||||
|
||||
Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
|
||||
@ -548,4 +551,8 @@ Loads the evaluation result from disk. Expects one csv file per node. See save()
|
||||
**Arguments**:
|
||||
|
||||
- `load_dir`: The directory containing the csv files.
|
||||
- `read_csv_kwargs`: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
|
||||
This method uses different default values than pd.read_csv() for the following parameters:
|
||||
header=0, converters=CONVERTERS
|
||||
where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
from __future__ import annotations
|
||||
import csv
|
||||
|
||||
import typing
|
||||
from typing import Any, Optional, Dict, List, Union
|
||||
@ -1346,12 +1347,15 @@ class EvaluationResult:
|
||||
metrics_df = pd.DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
|
||||
return metrics_df
|
||||
|
||||
def save(self, out_dir: Union[str, Path]):
|
||||
def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
|
||||
"""
|
||||
Saves the evaluation result.
|
||||
The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.
|
||||
|
||||
:param out_dir: Path to the target folder the csvs will be saved.
|
||||
:param to_csv_kwargs: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
|
||||
This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
|
||||
index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
|
||||
"""
|
||||
out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
|
||||
logger.info(f"Saving evaluation results to {out_dir}")
|
||||
@ -1359,29 +1363,45 @@ class EvaluationResult:
|
||||
out_dir.mkdir(parents=True)
|
||||
for node_name, df in self.node_results.items():
|
||||
target_path = out_dir / f"{node_name}.csv"
|
||||
df.to_csv(target_path, index=False, header=True)
|
||||
default_to_csv_kwargs = {
|
||||
"index": False,
|
||||
"quoting": csv.QUOTE_NONNUMERIC, # avoids problems with \r chars in texts by enclosing all string values in quotes
|
||||
}
|
||||
to_csv_kwargs = {**default_to_csv_kwargs, **to_csv_kwargs}
|
||||
df.to_csv(target_path, **to_csv_kwargs)
|
||||
|
||||
@classmethod
|
||||
def load(cls, load_dir: Union[str, Path]):
|
||||
def load(cls, load_dir: Union[str, Path], **read_csv_kwargs):
|
||||
"""
|
||||
Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
|
||||
|
||||
:param load_dir: The directory containing the csv files.
|
||||
:param read_csv_kwargs: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
|
||||
This method uses different default values than pd.read_csv() for the following parameters:
|
||||
header=0, converters=CONVERTERS
|
||||
where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
|
||||
"""
|
||||
load_dir = load_dir if isinstance(load_dir, Path) else Path(load_dir)
|
||||
csv_files = [file for file in load_dir.iterdir() if file.is_file() and file.suffix == ".csv"]
|
||||
cols_to_convert = [
|
||||
"filters",
|
||||
"gold_document_ids",
|
||||
"gold_custom_document_ids",
|
||||
"gold_contexts",
|
||||
"gold_answers",
|
||||
"gold_documents_id_match",
|
||||
"gold_offsets_in_documents",
|
||||
"gold_answers_exact_match",
|
||||
"gold_answers_f1",
|
||||
"gold_answers_document_id_match",
|
||||
"gold_context_similarity",
|
||||
"gold_answers_sas",
|
||||
"gold_answers_match",
|
||||
"gold_contexts_similarity",
|
||||
"offsets_in_document",
|
||||
]
|
||||
converters = dict.fromkeys(cols_to_convert, ast.literal_eval)
|
||||
node_results = {file.stem: pd.read_csv(file, header=0, converters=converters) for file in csv_files}
|
||||
default_read_csv_kwargs = {"converters": converters, "header": 0}
|
||||
read_csv_kwargs = {**default_read_csv_kwargs, **read_csv_kwargs}
|
||||
node_results = {file.stem: pd.read_csv(file, **read_csv_kwargs) for file in csv_files}
|
||||
# backward compatibility mappings
|
||||
for df in node_results.values():
|
||||
df.rename(columns={"gold_document_contents": "gold_contexts", "content": "context"}, inplace=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user