mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-02 18:59:28 +00:00
Let SquadData support data from Annotation Tool (#2329)
* Support data from Annotation Tool
* Update Documentation & Code Style
* Incorporate reviewer feedback
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in:
parent
7ffeccece6
commit
6233dfce2f
@@ -5,6 +5,7 @@ import json
|
||||
import random
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
+import mmh3
|
||||
|
||||
from haystack.schema import Document, Label
|
||||
from haystack.modeling.data_handler.processor import _read_squad_file
|
||||
@@ -109,9 +110,10 @@ class SquadData:
|
||||
"""Convert a list of SQuAD document dictionaries into a pandas dataframe (each row is one annotation)"""
|
||||
flat = []
|
||||
for document in data:
|
||||
-title = document["title"]
|
||||
+title = document.get("title", "")
|
||||
for paragraph in document["paragraphs"]:
|
||||
context = paragraph["context"]
|
||||
+document_id = paragraph.get("document_id", "{:02x}".format(mmh3.hash128(str(context), signed=False)))
|
||||
for question in paragraph["qas"]:
|
||||
q = question["question"]
|
||||
id = question["id"]
|
||||
@@ -127,6 +129,7 @@ class SquadData:
|
||||
"answer_text": "",
|
||||
"answer_start": None,
|
||||
"is_impossible": is_impossible,
|
||||
+"document_id": document_id,
|
||||
}
|
||||
)
|
||||
# For span answer samples
|
||||
@@ -143,6 +146,7 @@ class SquadData:
|
||||
"answer_text": answer_text,
|
||||
"answer_start": answer_start,
|
||||
"is_impossible": is_impossible,
|
||||
+"document_id": document_id,
|
||||
}
|
||||
)
|
||||
df = pd.DataFrame.from_records(flat)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user