mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-18 12:43:28 +00:00
Fix error when the is_impossible key is missing, and fix JSON dump encoding error (#868)
* Fix error when the is_impossible key is missing, and fix JSON dump encoding for multilingual data. Fixes #867. * Fix file encoding: all files are now opened with utf-8.
This commit is contained in:
parent
762f194b27
commit
db75498278
@ -142,7 +142,7 @@ class Crawler(BaseComponent):
|
|||||||
if base_url:
|
if base_url:
|
||||||
data['meta']['base_url'] = base_url
|
data['meta']['base_url'] = base_url
|
||||||
data['text'] = text
|
data['text'] = text
|
||||||
with open(file_path, 'w') as f:
|
with open(file_path, 'w', encoding='utf-8') as f:
|
||||||
f.write(str(data))
|
f.write(str(data))
|
||||||
paths.append(file_path)
|
paths.append(file_path)
|
||||||
|
|
||||||
|
@ -198,7 +198,7 @@ class Pipeline(ABC):
|
|||||||
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
|
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
|
||||||
`_` sign must be used to specify nested hierarchical properties.
|
`_` sign must be used to specify nested hierarchical properties.
|
||||||
"""
|
"""
|
||||||
with open(path, "r") as stream:
|
with open(path, "r", encoding='utf-8') as stream:
|
||||||
data = yaml.safe_load(stream)
|
data = yaml.safe_load(stream)
|
||||||
|
|
||||||
if pipeline_name is None:
|
if pipeline_name is None:
|
||||||
|
@ -36,7 +36,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, prepro
|
|||||||
labels = []
|
labels = []
|
||||||
problematic_ids = []
|
problematic_ids = []
|
||||||
|
|
||||||
with open(filename, "r") as file:
|
with open(filename, "r", encoding='utf-8') as file:
|
||||||
data = json.load(file)
|
data = json.load(file)
|
||||||
if "title" not in data["data"][0]:
|
if "title" not in data["data"][0]:
|
||||||
logger.warning(f"No title information found for documents in QA file: {filename}")
|
logger.warning(f"No title information found for documents in QA file: {filename}")
|
||||||
@ -75,7 +75,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
|
|||||||
labels = []
|
labels = []
|
||||||
problematic_ids = []
|
problematic_ids = []
|
||||||
|
|
||||||
with open(filename, "r") as file:
|
with open(filename, "r", encoding='utf-8') as file:
|
||||||
for document in file:
|
for document in file:
|
||||||
if max_docs:
|
if max_docs:
|
||||||
if len(docs) > max_docs:
|
if len(docs) > max_docs:
|
||||||
@ -145,7 +145,7 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro
|
|||||||
|
|
||||||
## Assign Labels to corresponding documents
|
## Assign Labels to corresponding documents
|
||||||
for qa in paragraph["qas"]:
|
for qa in paragraph["qas"]:
|
||||||
if not qa["is_impossible"]:
|
if not qa.get("is_impossible", False):
|
||||||
for answer in qa["answers"]:
|
for answer in qa["answers"]:
|
||||||
ans = answer["text"]
|
ans = answer["text"]
|
||||||
ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
|
ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
|
||||||
@ -397,7 +397,7 @@ def squad_json_to_jsonl(squad_file: str, output_file: str):
|
|||||||
:param output_file: Name of output file (SQuAD in jsonl format)
|
:param output_file: Name of output file (SQuAD in jsonl format)
|
||||||
:type output_file: str
|
:type output_file: str
|
||||||
"""
|
"""
|
||||||
with open(squad_file) as json_file, open(output_file, "w") as jsonl_file:
|
with open(squad_file, encoding='utf-8') as json_file, open(output_file, "w", encoding='utf-8') as jsonl_file:
|
||||||
squad_json = json.load(json_file)
|
squad_json = json.load(json_file)
|
||||||
|
|
||||||
for doc in squad_json["data"]:
|
for doc in squad_json["data"]:
|
||||||
|
@ -139,8 +139,8 @@ def add_is_impossible(squad_data: dict, json_file_path: Path):
|
|||||||
question["is_impossible"] = False
|
question["is_impossible"] = False
|
||||||
|
|
||||||
squad_data["data"] = squad_articles
|
squad_data["data"] = squad_articles
|
||||||
with open(new_path, "w") as filo:
|
with open(new_path, "w", encoding='utf-8') as filo:
|
||||||
json.dump(squad_data, filo, indent=4)
|
json.dump(squad_data, filo, indent=4, ensure_ascii=False)
|
||||||
|
|
||||||
return new_path, squad_data
|
return new_path, squad_data
|
||||||
|
|
||||||
@ -219,8 +219,8 @@ def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path,
|
|||||||
else:
|
else:
|
||||||
dataset_splits = {dpr_output_filename: iter_dpr}
|
dataset_splits = {dpr_output_filename: iter_dpr}
|
||||||
for path, set_iter in dataset_splits.items():
|
for path, set_iter in dataset_splits.items():
|
||||||
with open(path, "w") as json_ds:
|
with open(path, "w", encoding='utf-8') as json_ds:
|
||||||
json.dump(list(set_iter), json_ds, indent=4)
|
json.dump(list(set_iter), json_ds, indent=4, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def get_hard_negative_contexts(retriever: BaseRetriever, question: str, answers: List[str],
|
def get_hard_negative_contexts(retriever: BaseRetriever, question: str, answers: List[str],
|
||||||
@ -242,7 +242,7 @@ def load_squad_file(squad_file_path: Path):
|
|||||||
if not squad_file_path.exists():
|
if not squad_file_path.exists():
|
||||||
raise FileNotFoundError
|
raise FileNotFoundError
|
||||||
|
|
||||||
with open(squad_file_path) as squad_file:
|
with open(squad_file_path, encoding='utf-8') as squad_file:
|
||||||
squad_data = json.load(squad_file)
|
squad_data = json.load(squad_file)
|
||||||
|
|
||||||
# squad_data["data"] = squad_data["data"][:10] # sample
|
# squad_data["data"] = squad_data["data"][:10] # sample
|
||||||
|
@ -70,7 +70,7 @@ def convert_labels_to_squad(labels_file: str):
|
|||||||
:param labels_file: path for export file from the labeling tool
|
:param labels_file: path for export file from the labeling tool
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
with open(labels_file) as label_file:
|
with open(labels_file, encoding='utf-8') as label_file:
|
||||||
labels = json.load(label_file)
|
labels = json.load(label_file)
|
||||||
|
|
||||||
labels_grouped_by_documents = defaultdict(list)
|
labels_grouped_by_documents = defaultdict(list)
|
||||||
@ -112,7 +112,7 @@ def convert_labels_to_squad(labels_file: str):
|
|||||||
|
|
||||||
labels_in_squad_format["data"].append(squad_format_label)
|
labels_in_squad_format["data"].append(squad_format_label)
|
||||||
|
|
||||||
with open("labels_in_squad_format.json", "w+") as outfile:
|
with open("labels_in_squad_format.json", "w+", encoding='utf-8') as outfile:
|
||||||
json.dump(labels_in_squad_format, outfile)
|
json.dump(labels_in_squad_format, outfile)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user