mirror of https://github.com/deepset-ai/haystack.git (synced 2026-01-06 03:57:19 +00:00)

Using PreProcessor functions on eval data (#751)

* Add eval data splitting
* Adjust for split by passage, add test and test data, adjust docstrings, add max_docs to higher-level fct

parent aa8a3666c3
commit 4803da009a
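A minimal usage sketch of the new `preprocessor` and `max_docs` parameters, adapted from the test added in this commit; the Elasticsearch-backed store and the index names are illustrative assumptions, not part of the change:

# Minimal usage sketch (assumes a locally running Elasticsearch instance).
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.preprocessor.preprocessor import PreProcessor

document_store = ElasticsearchDocumentStore()

# add_eval_data only accepts PreProcessors with split_by "word" or "passage",
# split_overlap=0, and all cleaning flags disabled (see the asserts below).
preprocessor = PreProcessor(
    clean_empty_lines=False,
    clean_whitespace=False,
    clean_header_footer=False,
    split_by="word",
    split_length=4,
    split_overlap=0,
    split_respect_sentence_boundary=False,
)

document_store.add_eval_data(
    filename="test/samples/squad/tiny.json",
    doc_index="eval_document",
    label_index="label",
    preprocessor=preprocessor,
    max_docs=None,  # None (default): load all eval documents
)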
haystack/document_store/base.py

@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import Any, Optional, Dict, List, Union
 from haystack import Document, Label, MultiLabel
 from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
+from haystack.preprocessor.preprocessor import PreProcessor
 
 
 logger = logging.getLogger(__name__)

@@ -140,28 +141,43 @@ class BaseDocumentStore(ABC):
         pass
 
     def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
-                      batch_size: Optional[int] = None):
+                      batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
+                      max_docs: Union[int, bool] = None):
         """
         Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
         If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
         from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.
 
         :param filename: Name of the file containing evaluation data (json or jsonl)
         :type filename: str
         :param doc_index: Elasticsearch index where evaluation documents should be stored
         :type doc_index: str
         :param label_index: Elasticsearch index where labeled questions should be stored
         :type label_index: str
-        :param batch_size: Number of documents that are loaded and processed at a time.
-                           Only works with jsonl formatted files. Setting batch_size and
-                           using a json formatted file will convert the json to jsonl prior
-                           to adding eval data.
-        :type batch_size: int
+        :param batch_size: Optional number of documents that are loaded and processed at a time.
+                           When set to None (default) all documents are processed at once.
+        :param preprocessor: Optional PreProcessor to preprocess evaluation documents.
+                             It can be used for splitting documents into passages (and assigning labels to corresponding passages).
+                             Currently the PreProcessor does not support split_by sentence, cleaning nor split_overlap != 0.
+                             When set to None (default) preprocessing is disabled.
+        :param max_docs: Optional number of documents that will be loaded.
+                         When set to None (default) all available eval documents are used.
         """
+        # TODO improve support for PreProcessor when adding eval data
+        if preprocessor is not None:
+            assert preprocessor.split_by != "sentence", f"Split by sentence not supported.\n" \
+                f"Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
+            assert preprocessor.split_overlap == 0, f"Overlapping documents are currently not supported when adding eval data.\n" \
+                f"Please set 'split_overlap=0' in the supplied PreProcessor."
+            assert preprocessor.clean_empty_lines == False, f"clean_empty_lines currently not supported when adding eval data.\n" \
+                f"Please set 'clean_empty_lines=False' in the supplied PreProcessor."
+            assert preprocessor.clean_whitespace == False, f"clean_whitespace is currently not supported when adding eval data.\n" \
+                f"Please set 'clean_whitespace=False' in the supplied PreProcessor."
+            assert preprocessor.clean_header_footer == False, f"clean_header_footer is currently not supported when adding eval data.\n" \
+                f"Please set 'clean_header_footer=False' in the supplied PreProcessor."
 
         file_path = Path(filename)
         if file_path.suffix == ".json":
             if batch_size is None:
-                docs, labels = eval_data_from_json(filename)
+                docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
                 self.write_documents(docs, index=doc_index)
                 self.write_labels(labels, index=label_index)
             else:

@@ -172,7 +188,7 @@ class BaseDocumentStore(ABC):
             self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size)
 
         elif file_path.suffix == ".jsonl":
-            for docs, labels in eval_data_from_jsonl(filename, batch_size):
+            for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
                 if docs:
                     self.write_documents(docs, index=doc_index)
                 if labels:
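For .jsonl files the new arguments are simply forwarded to the batchwise loader. A sketch of consuming eval_data_from_jsonl directly, mirroring the loop above (the file name and batch size are hypothetical; document_store as in the earlier sketch):

from haystack.preprocessor.utils import eval_data_from_jsonl

# Load and index batchwise to keep memory bounded for large eval files.
for docs, labels in eval_data_from_jsonl("eval.jsonl", batch_size=1000,
                                         max_docs=None, preprocessor=None):
    if docs:
        document_store.write_documents(docs, index="eval_document")
    if labels:
        document_store.write_labels(labels, index="label")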
haystack/preprocessor/utils.py

@@ -16,12 +16,13 @@ from haystack.file_converter.pdf import PDFToTextConverter
 from haystack.file_converter.tika import TikaConverter
 from haystack import Document, Label
 from haystack.file_converter.txt import TextConverter
+from haystack.preprocessor.preprocessor import PreProcessor
 
 logger = logging.getLogger(__name__)
 
 
-def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]:
+def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Tuple[List[Document], List[Label]]:
     """
     Read Documents + Labels from a SQuAD-style file.
     Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

@@ -44,7 +45,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
             if len(docs) > max_docs:
                 break
             # Extracting paragraphs and their labels from a SQuAD document dict
-            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document)
+            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document, preprocessor)
             docs.extend(cur_docs)
             labels.extend(cur_labels)
 
@@ -52,7 +53,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
 
 
 def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
-                         max_docs: Union[int, bool] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
+                         max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
     """
     Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line.
     Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

@@ -76,7 +77,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
                 break
             # Extracting paragraphs and their labels from a SQuAD document dict
             document_dict = json.loads(document)
-            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict)
+            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict, preprocessor)
             docs.extend(cur_docs)
             labels.extend(cur_labels)
 
@@ -89,50 +90,96 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
     yield docs, labels
 
 
-def _extract_docs_and_labels_from_dict(document_dict: Dict):
+def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PreProcessor = None):
     docs = []
     labels = []
 
     # get all extra fields from document level (e.g. title)
     meta_doc = {k: v for k, v in document_dict.items() if k not in ("paragraphs", "title")}
     for paragraph in document_dict["paragraphs"]:
         ## Create Metadata
         cur_meta = {"name": document_dict.get("title", None)}
         # all other fields from paragraph level
         meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
         cur_meta.update(meta_paragraph)
         # meta from parent document
         cur_meta.update(meta_doc)
-        # Create Document
-        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
-        docs.append(cur_doc)
 
-        # Get Labels
+        ## Create Document
+        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
+        if preprocessor is not None:
+            splits_dicts = preprocessor.process(cur_doc.to_dict())
+            # we need to pull in _split_id into the document id for unique reference in labels
+            # todo: PreProcessor should work on Documents instead of dicts
+            splits = []
+            offset = 0
+            for d in splits_dicts:
+                id = f"{d['id']}-{d['meta']['_split_id']}"
+                d["meta"]["_split_offset"] = offset
+                offset += len(d["text"])
+                # offset correction based on splitting method
+                if preprocessor.split_by == "word":
+                    offset += 1
+                elif preprocessor.split_by == "passage":
+                    offset += 2
+                else:
+                    raise NotImplementedError
+                mydoc = Document(text=d["text"],
+                                 id=id,
+                                 meta=d["meta"])
+                splits.append(mydoc)
+        else:
+            splits = [cur_doc]
+        docs.extend(splits)
 
+        ## Assign Labels to corresponding documents
         for qa in paragraph["qas"]:
-            if len(qa["answers"]) > 0:
+            if not qa["is_impossible"]:
                 for answer in qa["answers"]:
+                    ans = answer["text"]
+                    ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
+                    if ans != ans_position:
+                        logger.warning(f"Answer Text and Answer position mismatch. Skipping Answer")
+                        break
+                    # find corresponding document or split
+                    if len(splits) == 1:
+                        cur_id = splits[0].id
+                        cur_ans_start = answer["answer_start"]
+                    else:
+                        for s in splits:
+                            # If answer start offset is contained in passage we assign the label to that passage
+                            if (answer["answer_start"] >= s.meta["_split_offset"]) and (answer["answer_start"] < (s.meta["_split_offset"] + len(s.text))):
+                                cur_id = s.id
+                                cur_ans_start = answer["answer_start"] - s.meta["_split_offset"]
+                                # If a document is splitting an answer we add the whole answer text to the document
+                                if s.text[cur_ans_start:cur_ans_start+len(ans)] != ans:
+                                    s.text = s.text[:cur_ans_start] + ans
+                                break
                     label = Label(
                         question=qa["question"],
-                        answer=answer["text"],
+                        answer=ans,
                         is_correct_answer=True,
                         is_correct_document=True,
-                        document_id=cur_doc.id,
-                        offset_start_in_doc=answer["answer_start"],
+                        document_id=cur_id,
+                        offset_start_in_doc=cur_ans_start,
                         no_answer=qa["is_impossible"],
                         origin="gold_label",
                     )
                     labels.append(label)
             else:
-                label = Label(
-                    question=qa["question"],
-                    answer="",
-                    is_correct_answer=True,
-                    is_correct_document=True,
-                    document_id=cur_doc.id,
-                    offset_start_in_doc=0,
-                    no_answer=qa["is_impossible"],
-                    origin="gold_label",
-                )
-                labels.append(label)
+                # for no_answer we need to assign each split as not fitting to the question
+                for s in splits:
+                    label = Label(
+                        question=qa["question"],
+                        answer="",
+                        is_correct_answer=True,
+                        is_correct_document=True,
+                        document_id=s.id,
+                        offset_start_in_doc=0,
+                        no_answer=qa["is_impossible"],
+                        origin="gold_label",
+                    )
+                    labels.append(label)
 
     return docs, labels
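The _split_offset bookkeeping above maps a SQuAD answer_start, a character offset into the full context, onto the split that contains it; the +1 (word) and +2 (passage) corrections account for the single space and the "\n\n" separator consumed at each split boundary. A toy illustration of the same arithmetic, outside the library:

# Toy illustration of the _split_offset arithmetic (not library code).
text = "My name is Carla and I live together with Abdul in Berlin."
words = text.split(" ")
split_length = 4  # as in the word-splitting test below
splits, split_offsets, offset = [], [], 0
for i in range(0, len(words), split_length):
    chunk = " ".join(words[i:i + split_length])
    splits.append(chunk)
    split_offsets.append(offset)
    offset += len(chunk) + 1  # +1: the space consumed at the split boundary

answer_start = text.index("Abdul")  # 42, matching answer_start in tiny.json
for chunk, start in zip(splits, split_offsets):
    if start <= answer_start < start + len(chunk):
        # the label is assigned to this split, with the offset rebased
        print(repr(chunk), "-> offset within split:", answer_start - start)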
test/samples/squad/tiny.json

@@ -15,6 +15,10 @@
                         {
                             "answer_start": 42,
                             "text": "Abdul"
-                        }
+                        },
+                        {
+                            "answer_start": 11,
+                            "text": "Carla and I live together with Abdul"
+                        }
                     ],
                     "id": 7211011040021040393,
test/samples/squad/tiny_passages.json (new file, 33 lines)

@@ -0,0 +1,33 @@
+{
+    "data": [
+        {
+            "title": "test1",
+            "paragraphs": [
+                {
+                    "context": "My name is Carla and I live together with Abdul in Berlin. \n\nThis is a new passage saying Leila lives in Berlin, too.",
+                    "qas": [
+                        {
+                            "answers": [
+                                {
+                                    "answer_start": 11,
+                                    "text": "Carla"
+                                },
+                                {
+                                    "answer_start": 42,
+                                    "text": "Abdul"
+                                },
+                                {
+                                    "answer_start": 89,
+                                    "text": "Leila"
+                                }
+                            ],
+                            "id": 7211011040021040393,
+                            "question": "Who lives in Berlin?",
+                            "is_impossible": false
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
test/test_eval.py

@@ -1,5 +1,7 @@
 import pytest
 from haystack.document_store.base import BaseDocumentStore
 from haystack.document_store.memory import InMemoryDocumentStore
+from haystack.preprocessor.preprocessor import PreProcessor
 from haystack.finder import Finder
 
 
@@ -159,4 +161,51 @@ def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
 
     # clean up
     document_store.delete_all_documents(index="test_eval_document")
     document_store.delete_all_documents(index="test_feedback")
+
+
+@pytest.mark.elasticsearch
+def test_eval_data_splitting(document_store):
+    # splitting by word
+    document_store.delete_all_documents(index="test_eval_document")
+    document_store.delete_all_documents(index="test_feedback")
+
+    preprocessor = PreProcessor(
+        clean_empty_lines=False,
+        clean_whitespace=False,
+        clean_header_footer=False,
+        split_by="word",
+        split_length=4,
+        split_overlap=0,
+        split_respect_sentence_boundary=False
+    )
+
+    document_store.add_eval_data(filename="samples/squad/tiny.json",
+                                 doc_index="test_eval_document",
+                                 label_index="test_feedback",
+                                 preprocessor=preprocessor)
+    labels = document_store.get_all_labels_aggregated(index="test_feedback")
+    docs = document_store.get_all_documents(index="test_eval_document")
+    assert len(docs) == 5
+    assert len(set(labels[0].multiple_document_ids)) == 2
+
+    # splitting by passage
+    document_store.delete_all_documents(index="test_eval_document")
+    document_store.delete_all_documents(index="test_feedback")
+
+    preprocessor = PreProcessor(
+        clean_empty_lines=False,
+        clean_whitespace=False,
+        clean_header_footer=False,
+        split_by="passage",
+        split_length=1,
+        split_overlap=0,
+        split_respect_sentence_boundary=False
+    )
+
+    document_store.add_eval_data(filename="samples/squad/tiny_passages.json",
+                                 doc_index="test_eval_document",
+                                 label_index="test_feedback",
+                                 preprocessor=preprocessor)
+    docs = document_store.get_all_documents(index="test_eval_document")
+    assert len(docs) == 2
+    assert len(docs[1].text) == 56
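Like the other @pytest.mark.elasticsearch tests, the new test needs a live Elasticsearch instance; something along the lines of `pytest -m elasticsearch test/test_eval.py::test_eval_data_splitting` should run it in isolation (the exact invocation depends on the local test setup).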