Let SquadData support data from Annotation Tool (#2329)

* Support data from Annotation Tool

* Update Documentation & Code Style

* Incorporate reviewer feedback

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in:
Branden Chan 2022-03-22 10:17:25 +01:00 committed by GitHub
parent 7ffeccece6
commit 6233dfce2f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -5,6 +5,7 @@ import json
import random import random
import pandas as pd import pandas as pd
from tqdm import tqdm from tqdm import tqdm
import mmh3
from haystack.schema import Document, Label from haystack.schema import Document, Label
from haystack.modeling.data_handler.processor import _read_squad_file from haystack.modeling.data_handler.processor import _read_squad_file
@@ -109,9 +110,10 @@ class SquadData:
"""Convert a list of SQuAD document dictionaries into a pandas dataframe (each row is one annotation)""" """Convert a list of SQuAD document dictionaries into a pandas dataframe (each row is one annotation)"""
flat = [] flat = []
for document in data: for document in data:
title = document["title"] title = document.get("title", "")
for paragraph in document["paragraphs"]: for paragraph in document["paragraphs"]:
context = paragraph["context"] context = paragraph["context"]
document_id = paragraph.get("document_id", "{:02x}".format(mmh3.hash128(str(context), signed=False)))
for question in paragraph["qas"]: for question in paragraph["qas"]:
q = question["question"] q = question["question"]
id = question["id"] id = question["id"]
@@ -127,6 +129,7 @@ class SquadData:
"answer_text": "", "answer_text": "",
"answer_start": None, "answer_start": None,
"is_impossible": is_impossible, "is_impossible": is_impossible,
"document_id": document_id,
} }
) )
# For span answer samples # For span answer samples
@@ -143,6 +146,7 @@ class SquadData:
"answer_text": answer_text, "answer_text": answer_text,
"answer_start": answer_start, "answer_start": answer_start,
"is_impossible": is_impossible, "is_impossible": is_impossible,
"document_id": document_id,
} }
) )
df = pd.DataFrame.from_records(flat) df = pd.DataFrame.from_records(flat)