Fix error when the is_impossible key is missing and json dump encoding error (#868)

* Fix error when the is_impossible key is missing, and fix json dump encoding in multilingual data

Fixing #867

* Fix file encoding: open all files with utf-8
This commit is contained in:
Eric Lam 2021-03-02 20:54:58 +08:00 committed by GitHub
parent 762f194b27
commit db75498278
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 13 additions and 13 deletions

View File

@ -142,7 +142,7 @@ class Crawler(BaseComponent):
if base_url:
data['meta']['base_url'] = base_url
data['text'] = text
with open(file_path, 'w') as f:
with open(file_path, 'w', encoding='utf-8') as f:
f.write(str(data))
paths.append(file_path)

View File

@ -198,7 +198,7 @@ class Pipeline(ABC):
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
`_` sign must be used to specify nested hierarchical properties.
"""
with open(path, "r") as stream:
with open(path, "r", encoding='utf-8') as stream:
data = yaml.safe_load(stream)
if pipeline_name is None:

View File

@ -36,7 +36,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, prepro
labels = []
problematic_ids = []
with open(filename, "r") as file:
with open(filename, "r", encoding='utf-8') as file:
data = json.load(file)
if "title" not in data["data"][0]:
logger.warning(f"No title information found for documents in QA file: {filename}")
@ -75,7 +75,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
labels = []
problematic_ids = []
with open(filename, "r") as file:
with open(filename, "r", encoding='utf-8') as file:
for document in file:
if max_docs:
if len(docs) > max_docs:
@ -145,7 +145,7 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro
## Assign Labels to corresponding documents
for qa in paragraph["qas"]:
if not qa["is_impossible"]:
if not qa.get("is_impossible", False):
for answer in qa["answers"]:
ans = answer["text"]
ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
@ -397,7 +397,7 @@ def squad_json_to_jsonl(squad_file: str, output_file: str):
:param output_file: Name of output file (SQuAD in jsonl format)
:type output_file: str
"""
with open(squad_file) as json_file, open(output_file, "w") as jsonl_file:
with open(squad_file, encoding='utf-8') as json_file, open(output_file, "w", encoding='utf-8') as jsonl_file:
squad_json = json.load(json_file)
for doc in squad_json["data"]:

View File

@ -139,8 +139,8 @@ def add_is_impossible(squad_data: dict, json_file_path: Path):
question["is_impossible"] = False
squad_data["data"] = squad_articles
with open(new_path, "w") as filo:
json.dump(squad_data, filo, indent=4)
with open(new_path, "w", encoding='utf-8') as filo:
json.dump(squad_data, filo, indent=4, ensure_ascii=False)
return new_path, squad_data
@ -219,8 +219,8 @@ def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path,
else:
dataset_splits = {dpr_output_filename: iter_dpr}
for path, set_iter in dataset_splits.items():
with open(path, "w") as json_ds:
json.dump(list(set_iter), json_ds, indent=4)
with open(path, "w", encoding='utf-8') as json_ds:
json.dump(list(set_iter), json_ds, indent=4, ensure_ascii=False)
def get_hard_negative_contexts(retriever: BaseRetriever, question: str, answers: List[str],
@ -242,7 +242,7 @@ def load_squad_file(squad_file_path: Path):
if not squad_file_path.exists():
raise FileNotFoundError
with open(squad_file_path) as squad_file:
with open(squad_file_path, encoding='utf-8') as squad_file:
squad_data = json.load(squad_file)
# squad_data["data"] = squad_data["data"][:10] # sample

View File

@ -70,7 +70,7 @@ def convert_labels_to_squad(labels_file: str):
:param labels_file: path for export file from the labeling tool
:return:
"""
with open(labels_file) as label_file:
with open(labels_file, encoding='utf-8') as label_file:
labels = json.load(label_file)
labels_grouped_by_documents = defaultdict(list)
@ -112,7 +112,7 @@ def convert_labels_to_squad(labels_file: str):
labels_in_squad_format["data"].append(squad_format_label)
with open("labels_in_squad_format.json", "w+") as outfile:
with open("labels_in_squad_format.json", "w+", encoding='utf-8') as outfile:
json.dump(labels_in_squad_format, outfile)