From db754982786ace319ff9aa1dc4f03c07f51a6d94 Mon Sep 17 00:00:00 2001 From: Eric Lam Date: Tue, 2 Mar 2021 20:54:58 +0800 Subject: [PATCH] Fix error when is_impossible is missing and JSON dump encoding error (#868) * Fix error when the is_impossible key is missing and JSON dump encoding in multilingual data. Fixes #867 * Fix file encoding: open all files with utf-8 --- haystack/connector/crawler.py | 2 +- haystack/pipeline.py | 2 +- haystack/preprocessor/utils.py | 8 ++++---- haystack/retriever/squad_to_dpr.py | 10 +++++----- haystack/utils.py | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/haystack/connector/crawler.py b/haystack/connector/crawler.py index bd406fab3..fbc3add9a 100644 --- a/haystack/connector/crawler.py +++ b/haystack/connector/crawler.py @@ -142,7 +142,7 @@ class Crawler(BaseComponent): if base_url: data['meta']['base_url'] = base_url data['text'] = text - with open(file_path, 'w') as f: + with open(file_path, 'w', encoding='utf-8') as f: f.write(str(data)) paths.append(file_path) diff --git a/haystack/pipeline.py b/haystack/pipeline.py index cd22ba2eb..c136dbbaa 100644 --- a/haystack/pipeline.py +++ b/haystack/pipeline.py @@ -198,7 +198,7 @@ class Pipeline(ABC): variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an `_` sign must be used to specify nested hierarchical properties. 
""" - with open(path, "r") as stream: + with open(path, "r", encoding='utf-8') as stream: data = yaml.safe_load(stream) if pipeline_name is None: diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py index a6b6ff1a2..27b864905 100644 --- a/haystack/preprocessor/utils.py +++ b/haystack/preprocessor/utils.py @@ -36,7 +36,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, prepro labels = [] problematic_ids = [] - with open(filename, "r") as file: + with open(filename, "r", encoding='utf-8') as file: data = json.load(file) if "title" not in data["data"][0]: logger.warning(f"No title information found for documents in QA file: {filename}") @@ -75,7 +75,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None, labels = [] problematic_ids = [] - with open(filename, "r") as file: + with open(filename, "r", encoding='utf-8') as file: for document in file: if max_docs: if len(docs) > max_docs: @@ -145,7 +145,7 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro ## Assign Labels to corresponding documents for qa in paragraph["qas"]: - if not qa["is_impossible"]: + if not qa.get("is_impossible", False): for answer in qa["answers"]: ans = answer["text"] ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)] @@ -397,7 +397,7 @@ def squad_json_to_jsonl(squad_file: str, output_file: str): :param output_file: Name of output file (SQuAD in jsonl format) :type output_file: str """ - with open(squad_file) as json_file, open(output_file, "w") as jsonl_file: + with open(squad_file, encoding='utf-8') as json_file, open(output_file, "w", encoding='utf-8') as jsonl_file: squad_json = json.load(json_file) for doc in squad_json["data"]: diff --git a/haystack/retriever/squad_to_dpr.py b/haystack/retriever/squad_to_dpr.py index b298320df..a260eb891 100644 --- a/haystack/retriever/squad_to_dpr.py +++ b/haystack/retriever/squad_to_dpr.py @@ -139,8 +139,8 @@ def 
add_is_impossible(squad_data: dict, json_file_path: Path): question["is_impossible"] = False squad_data["data"] = squad_articles - with open(new_path, "w") as filo: - json.dump(squad_data, filo, indent=4) + with open(new_path, "w", encoding='utf-8') as filo: + json.dump(squad_data, filo, indent=4, ensure_ascii=False) return new_path, squad_data @@ -219,8 +219,8 @@ def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path, else: dataset_splits = {dpr_output_filename: iter_dpr} for path, set_iter in dataset_splits.items(): - with open(path, "w") as json_ds: - json.dump(list(set_iter), json_ds, indent=4) + with open(path, "w", encoding='utf-8') as json_ds: + json.dump(list(set_iter), json_ds, indent=4, ensure_ascii=False) def get_hard_negative_contexts(retriever: BaseRetriever, question: str, answers: List[str], @@ -242,7 +242,7 @@ def load_squad_file(squad_file_path: Path): if not squad_file_path.exists(): raise FileNotFoundError - with open(squad_file_path) as squad_file: + with open(squad_file_path, encoding='utf-8') as squad_file: squad_data = json.load(squad_file) # squad_data["data"] = squad_data["data"][:10] # sample diff --git a/haystack/utils.py b/haystack/utils.py index 285fbdd86..e29bcc093 100644 --- a/haystack/utils.py +++ b/haystack/utils.py @@ -70,7 +70,7 @@ def convert_labels_to_squad(labels_file: str): :param labels_file: path for export file from the labeling tool :return: """ - with open(labels_file) as label_file: + with open(labels_file, encoding='utf-8') as label_file: labels = json.load(label_file) labels_grouped_by_documents = defaultdict(list) @@ -112,7 +112,7 @@ def convert_labels_to_squad(labels_file: str): labels_in_squad_format["data"].append(squad_format_label) - with open("labels_in_squad_format.json", "w+") as outfile: + with open("labels_in_squad_format.json", "w+", encoding='utf-8') as outfile: json.dump(labels_in_squad_format, outfile)