Using PreProcessor functions on eval data (#751)

* Add eval data splitting

* Adjust for split by passage, add test and test data, adjust docstrings, add max_docs to higher-level function
This commit is contained in:
Timo Moeller 2021-01-20 14:40:10 +01:00 committed by GitHub
parent aa8a3666c3
commit 4803da009a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 185 additions and 36 deletions

View File

@ -4,6 +4,7 @@ from pathlib import Path
from typing import Any, Optional, Dict, List, Union
from haystack import Document, Label, MultiLabel
from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
from haystack.preprocessor.preprocessor import PreProcessor
logger = logging.getLogger(__name__)
@ -140,28 +141,43 @@ class BaseDocumentStore(ABC):
pass
def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
batch_size: Optional[int] = None):
batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
max_docs: Union[int, bool] = None):
"""
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.
:param filename: Name of the file containing evaluation data (json or jsonl)
:type filename: str
:param doc_index: Elasticsearch index where evaluation documents should be stored
:type doc_index: str
:param label_index: Elasticsearch index where labeled questions should be stored
:type label_index: str
:param batch_size: Number of documents that are loaded and processed at a time.
Only works with jsonl formatted files. Setting batch_size and
using a json formatted file will convert the json to jsonl prior
to adding eval data.
:type batch_size: int
:param batch_size: Optional number of documents that are loaded and processed at a time.
When set to None (default) all documents are processed at once.
:param preprocessor: Optional PreProcessor to preprocess evaluation documents.
It can be used for splitting documents into passages (and assigning labels to corresponding passages).
Currently the PreProcessor does not support split_by sentence, cleaning nor split_overlap != 0.
When set to None (default) preprocessing is disabled.
:param max_docs: Optional number of documents that will be loaded.
When set to None (default) all available eval documents are used.
"""
# TODO improve support for PreProcessor when adding eval data
if preprocessor is not None:
assert preprocessor.split_by != "sentence", f"Split by sentence not supported.\n" \
f"Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
assert preprocessor.split_overlap == 0, f"Overlapping documents are currently not supported when adding eval data.\n" \
f"Please set 'split_overlap=0' in the supplied PreProcessor."
assert preprocessor.clean_empty_lines == False, f"clean_empty_lines currently not supported when adding eval data.\n" \
f"Please set 'clean_empty_lines=False' in the supplied PreProcessor."
assert preprocessor.clean_whitespace == False, f"clean_whitespace is currently not supported when adding eval data.\n" \
f"Please set 'clean_whitespace=False' in the supplied PreProcessor."
assert preprocessor.clean_header_footer == False, f"clean_header_footer is currently not supported when adding eval data.\n" \
f"Please set 'clean_header_footer=False' in the supplied PreProcessor."
file_path = Path(filename)
if file_path.suffix == ".json":
if batch_size is None:
docs, labels = eval_data_from_json(filename)
docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
self.write_documents(docs, index=doc_index)
self.write_labels(labels, index=label_index)
else:
@ -172,7 +188,7 @@ class BaseDocumentStore(ABC):
self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size)
elif file_path.suffix == ".jsonl":
for docs, labels in eval_data_from_jsonl(filename, batch_size):
for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
if docs:
self.write_documents(docs, index=doc_index)
if labels:

View File

@ -16,12 +16,13 @@ from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.tika import TikaConverter
from haystack import Document, Label
from haystack.file_converter.txt import TextConverter
from haystack.preprocessor.preprocessor import PreProcessor
logger = logging.getLogger(__name__)
def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]:
def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Tuple[List[Document], List[Label]]:
"""
Read Documents + Labels from a SQuAD-style file.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@ -44,7 +45,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
if len(docs) > max_docs:
break
# Extracting paragraphs and their labels from a SQuAD document dict
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document, preprocessor)
docs.extend(cur_docs)
labels.extend(cur_labels)
@ -52,7 +53,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
max_docs: Union[int, bool] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
"""
Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@ -76,7 +77,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
break
# Extracting paragraphs and their labels from a SQuAD document dict
document_dict = json.loads(document)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict, preprocessor)
docs.extend(cur_docs)
labels.extend(cur_labels)
@ -89,50 +90,96 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
yield docs, labels
def _extract_docs_and_labels_from_dict(document_dict: Dict):
def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: Optional[PreProcessor] = None):
    """
    Extract Documents and Labels from one SQuAD-style document dict.

    Each paragraph becomes one Document (or, when a ``preprocessor`` is given,
    several split Documents), and each question/answer pair becomes a Label
    attached to the document (or split) that contains the answer.

    :param document_dict: One entry of a SQuAD-style "data" list, i.e. a dict
                          with "paragraphs" (each holding "context" and "qas")
                          and optionally "title" plus extra metadata fields.
    :param preprocessor: Optional PreProcessor used to split each paragraph
                         into passages. Only ``split_by`` "word" or "passage"
                         with ``split_overlap=0`` is supported here (the offset
                         bookkeeping below assumes exactly that).
    :return: Tuple of (list of Documents, list of Labels).
    """
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {k: v for k, v in document_dict.items() if k not in ("paragraphs", "title")}
    for paragraph in document_dict["paragraphs"]:
        ## Create Metadata
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)

        ## Create Document (split into passages first if a preprocessor is supplied)
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        if preprocessor is not None:
            splits_dicts = preprocessor.process(cur_doc.to_dict())
            # we need to pull in _split_id into the document id for unique reference in labels
            # todo: PreProcessor should work on Documents instead of dicts
            splits = []
            offset = 0
            for split_dict in splits_dicts:
                split_doc_id = f"{split_dict['id']}-{split_dict['meta']['_split_id']}"
                # remember where this split starts inside the original context,
                # so answer offsets can be mapped onto the right split below
                split_dict["meta"]["_split_offset"] = offset
                offset += len(split_dict["text"])
                # offset correction based on splitting method: the separator
                # (" " for word splits, "\n\n" for passage splits) is consumed
                # by the splitter but still occupies characters in the context
                if preprocessor.split_by == "word":
                    offset += 1
                elif preprocessor.split_by == "passage":
                    offset += 2
                else:
                    raise NotImplementedError
                splits.append(Document(text=split_dict["text"],
                                       id=split_doc_id,
                                       meta=split_dict["meta"]))
        else:
            splits = [cur_doc]
        docs.extend(splits)

        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
            if not qa["is_impossible"]:
                for answer in qa["answers"]:
                    ans = answer["text"]
                    # sanity check: the annotated span must match the context text
                    ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"] + len(ans)]
                    if ans != ans_position:
                        logger.warning("Answer Text and Answer position mismatch. Skipping Answer")
                        break
                    # find corresponding document or split
                    if len(splits) == 1:
                        cur_id = splits[0].id
                        cur_ans_start = answer["answer_start"]
                    else:
                        # NOTE(review): if no split contains answer_start, cur_id /
                        # cur_ans_start keep their value from the previous answer —
                        # relies on every answer_start lying inside some split
                        for split in splits:
                            # If answer start offset is contained in passage we assign the label to that passage
                            if (answer["answer_start"] >= split.meta["_split_offset"]) \
                                    and (answer["answer_start"] < (split.meta["_split_offset"] + len(split.text))):
                                cur_id = split.id
                                cur_ans_start = answer["answer_start"] - split.meta["_split_offset"]
                                # If a document is splitting an answer we add the whole answer text to the document
                                if split.text[cur_ans_start:cur_ans_start + len(ans)] != ans:
                                    split.text = split.text[:cur_ans_start] + ans
                                break
                    label = Label(
                        question=qa["question"],
                        answer=ans,
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_id,
                        offset_start_in_doc=cur_ans_start,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                # for no_answer we need to assign each split as not fitting to the question
                for split in splits:
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=split.id,
                        offset_start_in_doc=0,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)

    return docs, labels

View File

@ -15,6 +15,10 @@
{
"answer_start": 42,
"text": "Abdul"
},
{
"answer_start": 11,
"text": "Carla and I live together with Abdul"
}
],
"id": 7211011040021040393,

View File

@ -0,0 +1,33 @@
{
"data": [
{
"title": "test1",
"paragraphs": [
{
"context": "My name is Carla and I live together with Abdul in Berlin. \n\nThis is a new passage saying Leila lives in Berlin, too.",
"qas": [
{
"answers": [
{
"answer_start": 11,
"text": "Carla"
},
{
"answer_start": 42,
"text": "Abdul"
},
{
"answer_start": 89,
"text": "Leila"
}
],
"id": 7211011040021040393,
"question": "Who lives in Berlin?",
"is_impossible": false
}
]
}
]
}
]
}

View File

@ -1,5 +1,7 @@
import pytest
from haystack.document_store.base import BaseDocumentStore
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.finder import Finder
@ -159,4 +161,51 @@ def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
# clean up
document_store.delete_all_documents(index="test_eval_document")
document_store.delete_all_documents(index="test_feedback")
document_store.delete_all_documents(index="test_feedback")
@pytest.mark.elasticsearch
def test_eval_data_splitting(document_store):
    """
    add_eval_data with a PreProcessor must split eval documents into passages
    and attach each label to the split that actually contains its answer.
    Exercises both split_by="word" and split_by="passage".
    """
    # splitting by word
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
    # cleaning is disabled because add_eval_data does not support it for eval data
    preprocessor = PreProcessor(
        clean_empty_lines=False,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="word",
        split_length=4,
        split_overlap=0,
        split_respect_sentence_boundary=False
    )
    document_store.add_eval_data(filename="samples/squad/tiny.json",
                                 doc_index="test_eval_document",
                                 label_index="test_feedback",
                                 preprocessor=preprocessor)
    labels = document_store.get_all_labels_aggregated(index="test_feedback")
    docs = document_store.get_all_documents(index="test_eval_document")
    # tiny.json's contexts yield 5 four-word splits with this config
    assert len(docs) == 5
    # the answers of the first aggregated label land in 2 different splits
    assert len(set(labels[0].multiple_document_ids)) == 2

    # splitting by passage
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
    preprocessor = PreProcessor(
        clean_empty_lines=False,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="passage",
        split_length=1,
        split_overlap=0,
        split_respect_sentence_boundary=False
    )
    document_store.add_eval_data(filename="samples/squad/tiny_passages.json",
                                 doc_index="test_eval_document",
                                 label_index="test_feedback",
                                 preprocessor=preprocessor)
    docs = document_store.get_all_documents(index="test_eval_document")
    # tiny_passages.json has one context with two "\n\n"-separated passages
    assert len(docs) == 2
    # presumably the second passage's text length; verify against the sample file
    assert len(docs[1].text) == 56