mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-14 17:13:03 +00:00
Let SquadData support data from Annotation Tool (#2329)
* Support data from Annotation Tool
* Update Documentation & Code Style
* Incorporate reviewer feedback
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in:
parent
7ffeccece6
commit
6233dfce2f
@ -5,6 +5,7 @@ import json
|
|||||||
import random
|
import random
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
import mmh3
|
||||||
|
|
||||||
from haystack.schema import Document, Label
|
from haystack.schema import Document, Label
|
||||||
from haystack.modeling.data_handler.processor import _read_squad_file
|
from haystack.modeling.data_handler.processor import _read_squad_file
|
||||||
@ -109,9 +110,10 @@ class SquadData:
|
|||||||
"""Convert a list of SQuAD document dictionaries into a pandas dataframe (each row is one annotation)"""
|
"""Convert a list of SQuAD document dictionaries into a pandas dataframe (each row is one annotation)"""
|
||||||
flat = []
|
flat = []
|
||||||
for document in data:
|
for document in data:
|
||||||
title = document["title"]
|
title = document.get("title", "")
|
||||||
for paragraph in document["paragraphs"]:
|
for paragraph in document["paragraphs"]:
|
||||||
context = paragraph["context"]
|
context = paragraph["context"]
|
||||||
|
document_id = paragraph.get("document_id", "{:02x}".format(mmh3.hash128(str(context), signed=False)))
|
||||||
for question in paragraph["qas"]:
|
for question in paragraph["qas"]:
|
||||||
q = question["question"]
|
q = question["question"]
|
||||||
id = question["id"]
|
id = question["id"]
|
||||||
@ -127,6 +129,7 @@ class SquadData:
|
|||||||
"answer_text": "",
|
"answer_text": "",
|
||||||
"answer_start": None,
|
"answer_start": None,
|
||||||
"is_impossible": is_impossible,
|
"is_impossible": is_impossible,
|
||||||
|
"document_id": document_id,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
# For span answer samples
|
# For span answer samples
|
||||||
@ -143,6 +146,7 @@ class SquadData:
|
|||||||
"answer_text": answer_text,
|
"answer_text": answer_text,
|
||||||
"answer_start": answer_start,
|
"answer_start": answer_start,
|
||||||
"is_impossible": is_impossible,
|
"is_impossible": is_impossible,
|
||||||
|
"document_id": document_id,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
df = pd.DataFrame.from_records(flat)
|
df = pd.DataFrame.from_records(flat)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user