Using PreProcessor functions on eval data (#751)

* Add eval data splitting

* Adjust for split by passage, add test and test data, adjust docstrings, add max_docs to higher-level function
This commit is contained in:
Timo Moeller 2021-01-20 14:40:10 +01:00 committed by GitHub
parent aa8a3666c3
commit 4803da009a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 185 additions and 36 deletions

View File

@ -4,6 +4,7 @@ from pathlib import Path
from typing import Any, Optional, Dict, List, Union
from haystack import Document, Label, MultiLabel
from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
from haystack.preprocessor.preprocessor import PreProcessor
logger = logging.getLogger(__name__)
@ -140,28 +141,43 @@ class BaseDocumentStore(ABC):
pass
def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
batch_size: Optional[int] = None):
batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
max_docs: Union[int, bool] = None):
"""
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.
:param filename: Name of the file containing evaluation data (json or jsonl)
:type filename: str
:param doc_index: Elasticsearch index where evaluation documents should be stored
:type doc_index: str
:param label_index: Elasticsearch index where labeled questions should be stored
:type label_index: str
:param batch_size: Number of documents that are loaded and processed at a time.
Only works with jsonl formatted files. Setting batch_size and
using a json formatted file will convert the json to jsonl prior
to adding eval data.
:type batch_size: int
:param batch_size: Optional number of documents that are loaded and processed at a time.
When set to None (default) all documents are processed at once.
:param preprocessor: Optional PreProcessor to preprocess evaluation documents.
It can be used for splitting documents into passages (and assigning labels to corresponding passages).
Currently the PreProcessor does not support split_by sentence, cleaning nor split_overlap != 0.
When set to None (default) preprocessing is disabled.
:param max_docs: Optional number of documents that will be loaded.
When set to None (default) all available eval documents are used.
"""
# TODO improve support for PreProcessor when adding eval data
if preprocessor is not None:
assert preprocessor.split_by != "sentence", f"Split by sentence not supported.\n" \
f"Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
assert preprocessor.split_overlap == 0, f"Overlapping documents are currently not supported when adding eval data.\n" \
f"Please set 'split_overlap=0' in the supplied PreProcessor."
assert preprocessor.clean_empty_lines == False, f"clean_empty_lines currently not supported when adding eval data.\n" \
f"Please set 'clean_empty_lines=False' in the supplied PreProcessor."
assert preprocessor.clean_whitespace == False, f"clean_whitespace is currently not supported when adding eval data.\n" \
f"Please set 'clean_whitespace=False' in the supplied PreProcessor."
assert preprocessor.clean_header_footer == False, f"clean_header_footer is currently not supported when adding eval data.\n" \
f"Please set 'clean_header_footer=False' in the supplied PreProcessor."
file_path = Path(filename)
if file_path.suffix == ".json":
if batch_size is None:
docs, labels = eval_data_from_json(filename)
docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
self.write_documents(docs, index=doc_index)
self.write_labels(labels, index=label_index)
else:
@ -172,7 +188,7 @@ class BaseDocumentStore(ABC):
self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size)
elif file_path.suffix == ".jsonl":
for docs, labels in eval_data_from_jsonl(filename, batch_size):
for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
if docs:
self.write_documents(docs, index=doc_index)
if labels:

View File

@ -16,12 +16,13 @@ from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.tika import TikaConverter
from haystack import Document, Label
from haystack.file_converter.txt import TextConverter
from haystack.preprocessor.preprocessor import PreProcessor
logger = logging.getLogger(__name__)
def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]:
def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Tuple[List[Document], List[Label]]:
"""
Read Documents + Labels from a SQuAD-style file.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@ -44,7 +45,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
if len(docs) > max_docs:
break
# Extracting paragraphs and their labels from a SQuAD document dict
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document, preprocessor)
docs.extend(cur_docs)
labels.extend(cur_labels)
@ -52,7 +53,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
max_docs: Union[int, bool] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
"""
Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@ -76,7 +77,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
break
# Extracting paragraphs and their labels from a SQuAD document dict
document_dict = json.loads(document)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict, preprocessor)
docs.extend(cur_docs)
labels.extend(cur_labels)
@ -89,50 +90,96 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
yield docs, labels
def _extract_docs_and_labels_from_dict(document_dict: Dict):
def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: Optional[PreProcessor] = None):
    """
    Extract Documents and Labels from one SQuAD-style document dict.

    Each paragraph becomes one Document (or, when a ``preprocessor`` is given,
    several split Documents), and each question/answer pair becomes a Label
    attached to the document (or split) that contains the answer.

    :param document_dict: One entry of a SQuAD-style "data" list, i.e. a dict
                          with "paragraphs" (each holding "context" and "qas")
                          and optionally "title" plus extra metadata fields.
    :param preprocessor: Optional PreProcessor used to split each paragraph
                         into passages. Only ``split_by`` "word" or "passage"
                         with ``split_overlap=0`` is supported here (the offset
                         bookkeeping below assumes exactly that).
    :return: Tuple of (list of Documents, list of Labels).
    """
    docs = []
    labels = []

    # get all extra fields from document level (e.g. title)
    meta_doc = {k: v for k, v in document_dict.items() if k not in ("paragraphs", "title")}
    for paragraph in document_dict["paragraphs"]:
        ## Create Metadata
        cur_meta = {"name": document_dict.get("title", None)}
        # all other fields from paragraph level
        meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
        cur_meta.update(meta_paragraph)
        # meta from parent document
        cur_meta.update(meta_doc)

        ## Create Document (split into passages first if a preprocessor is supplied)
        cur_doc = Document(text=paragraph["context"], meta=cur_meta)
        if preprocessor is not None:
            splits_dicts = preprocessor.process(cur_doc.to_dict())
            # we need to pull in _split_id into the document id for unique reference in labels
            # todo: PreProcessor should work on Documents instead of dicts
            splits = []
            offset = 0
            for split_dict in splits_dicts:
                split_doc_id = f"{split_dict['id']}-{split_dict['meta']['_split_id']}"
                # remember where this split starts inside the original context,
                # so answer offsets can be mapped onto the right split below
                split_dict["meta"]["_split_offset"] = offset
                offset += len(split_dict["text"])
                # offset correction based on splitting method: the separator
                # (" " for word splits, "\n\n" for passage splits) is consumed
                # by the splitter but still occupies characters in the context
                if preprocessor.split_by == "word":
                    offset += 1
                elif preprocessor.split_by == "passage":
                    offset += 2
                else:
                    raise NotImplementedError
                splits.append(Document(text=split_dict["text"],
                                       id=split_doc_id,
                                       meta=split_dict["meta"]))
        else:
            splits = [cur_doc]
        docs.extend(splits)

        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
            if not qa["is_impossible"]:
                for answer in qa["answers"]:
                    ans = answer["text"]
                    # sanity check: the annotated span must match the context text
                    ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"] + len(ans)]
                    if ans != ans_position:
                        logger.warning("Answer Text and Answer position mismatch. Skipping Answer")
                        break
                    # find corresponding document or split
                    if len(splits) == 1:
                        cur_id = splits[0].id
                        cur_ans_start = answer["answer_start"]
                    else:
                        # NOTE(review): if no split contains answer_start, cur_id /
                        # cur_ans_start keep their value from the previous answer —
                        # relies on every answer_start lying inside some split
                        for split in splits:
                            # If answer start offset is contained in passage we assign the label to that passage
                            if (answer["answer_start"] >= split.meta["_split_offset"]) \
                                    and (answer["answer_start"] < (split.meta["_split_offset"] + len(split.text))):
                                cur_id = split.id
                                cur_ans_start = answer["answer_start"] - split.meta["_split_offset"]
                                # If a document is splitting an answer we add the whole answer text to the document
                                if split.text[cur_ans_start:cur_ans_start + len(ans)] != ans:
                                    split.text = split.text[:cur_ans_start] + ans
                                break
                    label = Label(
                        question=qa["question"],
                        answer=ans,
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=cur_id,
                        offset_start_in_doc=cur_ans_start,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)
            else:
                # for no_answer we need to assign each split as not fitting to the question
                for split in splits:
                    label = Label(
                        question=qa["question"],
                        answer="",
                        is_correct_answer=True,
                        is_correct_document=True,
                        document_id=split.id,
                        offset_start_in_doc=0,
                        no_answer=qa["is_impossible"],
                        origin="gold_label",
                    )
                    labels.append(label)

    return docs, labels

View File

@ -15,6 +15,10 @@
{
"answer_start": 42,
"text": "Abdul"
},
{
"answer_start": 11,
"text": "Carla and I live together with Abdul"
}
],
"id": 7211011040021040393,

View File

@ -0,0 +1,33 @@
{
"data": [
{
"title": "test1",
"paragraphs": [
{
"context": "My name is Carla and I live together with Abdul in Berlin. \n\nThis is a new passage saying Leila lives in Berlin, too.",
"qas": [
{
"answers": [
{
"answer_start": 11,
"text": "Carla"
},
{
"answer_start": 42,
"text": "Abdul"
},
{
"answer_start": 89,
"text": "Leila"
}
],
"id": 7211011040021040393,
"question": "Who lives in Berlin?",
"is_impossible": false
}
]
}
]
}
]
}

View File

@ -1,5 +1,7 @@
import pytest
from haystack.document_store.base import BaseDocumentStore
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.finder import Finder
@ -159,4 +161,51 @@ def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
# clean up
document_store.delete_all_documents(index="test_eval_document")
document_store.delete_all_documents(index="test_feedback")
document_store.delete_all_documents(index="test_feedback")
@pytest.mark.elasticsearch
def test_eval_data_splitting(document_store):
    """
    add_eval_data with a PreProcessor must split eval documents into passages
    and attach each label to the split that actually contains its answer.
    Exercises both split_by="word" and split_by="passage".
    """
    # splitting by word
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
    # cleaning is disabled because add_eval_data does not support it for eval data
    preprocessor = PreProcessor(
        clean_empty_lines=False,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="word",
        split_length=4,
        split_overlap=0,
        split_respect_sentence_boundary=False
    )
    document_store.add_eval_data(filename="samples/squad/tiny.json",
                                 doc_index="test_eval_document",
                                 label_index="test_feedback",
                                 preprocessor=preprocessor)
    labels = document_store.get_all_labels_aggregated(index="test_feedback")
    docs = document_store.get_all_documents(index="test_eval_document")
    # tiny.json's contexts yield 5 four-word splits with this config
    assert len(docs) == 5
    # the answers of the first aggregated label land in 2 different splits
    assert len(set(labels[0].multiple_document_ids)) == 2

    # splitting by passage
    document_store.delete_all_documents(index="test_eval_document")
    document_store.delete_all_documents(index="test_feedback")
    preprocessor = PreProcessor(
        clean_empty_lines=False,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="passage",
        split_length=1,
        split_overlap=0,
        split_respect_sentence_boundary=False
    )
    document_store.add_eval_data(filename="samples/squad/tiny_passages.json",
                                 doc_index="test_eval_document",
                                 label_index="test_feedback",
                                 preprocessor=preprocessor)
    docs = document_store.get_all_documents(index="test_eval_document")
    # tiny_passages.json has one context with two "\n\n"-separated passages
    assert len(docs) == 2
    # presumably the second passage's text length; verify against the sample file
    assert len(docs[1].text) == 56