mirror of https://github.com/deepset-ai/haystack.git (synced 2025-10-17 02:48:30 +00:00)

Fix corrupted csv from EvaluationResult.save() (#2854)

* fix corrupted csv if text contains \r chars; make csv serialization configurable
* Update Documentation & Code Style
* incorporate feedback
* Update Documentation & Code Style
* adjust columns to be converted during loading

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

This commit is contained in:
parent e350781825
commit 11c46006df
@@ -523,7 +523,7 @@ In Question Answering, to enforce that the retrieved document is considered corr
 #### EvaluationResult.save

 ```python
-def save(out_dir: Union[str, Path])
+def save(out_dir: Union[str, Path], **to_csv_kwargs)
 ```

 Saves the evaluation result.
@@ -533,6 +533,9 @@ The result of each node is saved in a separate csv with file name {node_name}.cs
 **Arguments**:

 - `out_dir`: Path to the target folder the csvs will be saved.
+- `to_csv_kwargs`: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
+This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
+index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)

 <a id="schema.EvaluationResult.load"></a>

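As a usage sketch (not part of this diff; the `eval_result` variable and the output folder are assumptions for illustration), any keyword accepted by pd.DataFrame.to_csv() can now be forwarded through save(), and the defaults listed above apply unless the caller overrides them:

```python
# Hypothetical usage sketch: `eval_result` is assumed to be an EvaluationResult,
# e.g. as returned by a pipeline evaluation; the output folder is arbitrary.
eval_result.save("eval_results/")           # uses the defaults index=False, quoting=csv.QUOTE_NONNUMERIC
eval_result.save("eval_results/", sep=";")  # forward any pd.DataFrame.to_csv() keyword, e.g. a different delimiter
```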
@@ -540,7 +543,7 @@ The result of each node is saved in a separate csv with file name {node_name}.cs

 ```python
 @classmethod
-def load(cls, load_dir: Union[str, Path])
+def load(cls, load_dir: Union[str, Path], **read_csv_kwargs)
 ```

 Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.
@@ -548,4 +551,8 @@ Loads the evaluation result from disk. Expects one csv file per node. See save()
 **Arguments**:

 - `load_dir`: The directory containing the csv files.
+- `read_csv_kwargs`: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
+This method uses different default values than pd.read_csv() for the following parameters:
+header=0, converters=CONVERTERS
+where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.

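A matching usage sketch for the loading side (again an assumption for illustration, not taken from the diff): keywords passed to load() are forwarded to pd.read_csv() on top of the header and converter defaults described above.

```python
# Hypothetical usage sketch: read the per-node csvs back into an EvaluationResult.
eval_result = EvaluationResult.load("eval_results/")           # header=0 plus the array-column converters by default
eval_result = EvaluationResult.load("eval_results/", sep=";")  # needed if the files were saved with sep=";"
```

The remaining hunks below apply the same change to the implementation itself, i.e. the EvaluationResult class in the schema module that the anchor ids above point to.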
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import csv

 import typing
 from typing import Any, Optional, Dict, List, Union
@@ -1346,12 +1347,15 @@ class EvaluationResult:
         metrics_df = pd.DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
         return metrics_df

-    def save(self, out_dir: Union[str, Path]):
+    def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
         """
         Saves the evaluation result.
         The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.

         :param out_dir: Path to the target folder the csvs will be saved.
+        :param to_csv_kwargs: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
+            This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
+            index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
         """
         out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
         logger.info(f"Saving evaluation results to {out_dir}")
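To illustrate why the quoting default matters, here is a minimal, self-contained sketch of the failure mode the commit message describes (this is not code from the commit): a bare \r inside a text field may be written unquoted and then treated as a row break on read, while csv.QUOTE_NONNUMERIC wraps every string value in quotes so the character survives the round trip.

```python
import csv
import io

import pandas as pd

# Minimal sketch (not from the commit): a text field containing "\r" round-trips
# safely when every string value is quoted, which is what quoting=csv.QUOTE_NONNUMERIC
# enforces; with minimal quoting the bare "\r" can be mistaken for a row break on read.
df = pd.DataFrame({"context": ["line one\rline two"], "score": [0.5]})

buffer = io.StringIO()
df.to_csv(buffer, index=False, quoting=csv.QUOTE_NONNUMERIC)  # the defaults save() now uses
buffer.seek(0)

restored = pd.read_csv(buffer, header=0)
assert restored["context"][0] == "line one\rline two"  # the carriage return is preserved
```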
@@ -1359,29 +1363,45 @@ class EvaluationResult:
             out_dir.mkdir(parents=True)
         for node_name, df in self.node_results.items():
             target_path = out_dir / f"{node_name}.csv"
-            df.to_csv(target_path, index=False, header=True)
+            default_to_csv_kwargs = {
+                "index": False,
+                "quoting": csv.QUOTE_NONNUMERIC,  # avoids problems with \r chars in texts by enclosing all string values in quotes
+            }
+            to_csv_kwargs = {**default_to_csv_kwargs, **to_csv_kwargs}
+            df.to_csv(target_path, **to_csv_kwargs)

     @classmethod
-    def load(cls, load_dir: Union[str, Path]):
+    def load(cls, load_dir: Union[str, Path], **read_csv_kwargs):
         """
         Loads the evaluation result from disk. Expects one csv file per node. See save() for further information.

         :param load_dir: The directory containing the csv files.
+        :param read_csv_kwargs: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
+            This method uses different default values than pd.read_csv() for the following parameters:
+            header=0, converters=CONVERTERS
+            where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval.
         """
         load_dir = load_dir if isinstance(load_dir, Path) else Path(load_dir)
         csv_files = [file for file in load_dir.iterdir() if file.is_file() and file.suffix == ".csv"]
         cols_to_convert = [
+            "filters",
             "gold_document_ids",
+            "gold_custom_document_ids",
             "gold_contexts",
             "gold_answers",
+            "gold_documents_id_match",
             "gold_offsets_in_documents",
             "gold_answers_exact_match",
             "gold_answers_f1",
-            "gold_answers_document_id_match",
-            "gold_context_similarity",
+            "gold_answers_sas",
+            "gold_answers_match",
+            "gold_contexts_similarity",
+            "offsets_in_document",
         ]
         converters = dict.fromkeys(cols_to_convert, ast.literal_eval)
-        node_results = {file.stem: pd.read_csv(file, header=0, converters=converters) for file in csv_files}
+        default_read_csv_kwargs = {"converters": converters, "header": 0}
+        read_csv_kwargs = {**default_read_csv_kwargs, **read_csv_kwargs}
+        node_results = {file.stem: pd.read_csv(file, **read_csv_kwargs) for file in csv_files}
         # backward compatibility mappings
         for df in node_results.values():
             df.rename(columns={"gold_document_contents": "gold_contexts", "content": "context"}, inplace=True)
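Two details of the hunk above are worth spelling out. First, merging `{**default_read_csv_kwargs, **read_csv_kwargs}` (and the analogous merge in save()) lets caller-supplied keywords override the new defaults, since later entries in a dict literal win. Second, the converters exist because list-valued columns such as `gold_document_ids` are serialized to csv as their string repr; ast.literal_eval turns them back into Python lists on load. A minimal round-trip sketch (not code from the commit, using a made-up one-row frame):

```python
import ast
import io

import pandas as pd

# Minimal sketch (not from the commit): a list-typed column is written to csv as the
# string "['id1', 'id2']"; an ast.literal_eval converter restores the Python list on
# read, mirroring what the CONVERTERS default in load() does for all array-typed columns.
df = pd.DataFrame({"gold_document_ids": [["id1", "id2"]], "rank": [1]})

buffer = io.StringIO()
df.to_csv(buffer, index=False)
buffer.seek(0)

restored = pd.read_csv(buffer, header=0, converters={"gold_document_ids": ast.literal_eval})
assert restored["gold_document_ids"][0] == ["id1", "id2"]  # a real list again, not its string repr
```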