From db754982786ace319ff9aa1dc4f03c07f51a6d94 Mon Sep 17 00:00:00 2001
From: Eric Lam <voidful.stack@gmail.com>
Date: Tue, 2 Mar 2021 20:54:58 +0800
Subject: [PATCH] Fix error when is_impossible is missing and json dump
 encoding error (#868)

* Fix error when the is_impossible key is missing from a question, and fix json dump encoding for multilingual data

Fixing #867

* Fix file encoding: open all files with utf-8
---
 haystack/connector/crawler.py      |  2 +-
 haystack/pipeline.py               |  2 +-
 haystack/preprocessor/utils.py     |  8 ++++----
 haystack/retriever/squad_to_dpr.py | 10 +++++-----
 haystack/utils.py                  |  4 ++--
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/haystack/connector/crawler.py b/haystack/connector/crawler.py
index bd406fab3..fbc3add9a 100644
--- a/haystack/connector/crawler.py
+++ b/haystack/connector/crawler.py
@@ -142,7 +142,7 @@ class Crawler(BaseComponent):
             if base_url:
                 data['meta']['base_url'] = base_url
             data['text'] = text
-            with open(file_path, 'w') as f:
+            with open(file_path, 'w', encoding='utf-8') as f:
                 f.write(str(data))
             paths.append(file_path)
 
diff --git a/haystack/pipeline.py b/haystack/pipeline.py
index cd22ba2eb..c136dbbaa 100644
--- a/haystack/pipeline.py
+++ b/haystack/pipeline.py
@@ -198,7 +198,7 @@ class Pipeline(ABC):
                                              variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
                                              `_` sign must be used to specify nested hierarchical properties.
         """
-        with open(path, "r") as stream:
+        with open(path, "r", encoding='utf-8') as stream:
             data = yaml.safe_load(stream)
 
         if pipeline_name is None:
diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py
index a6b6ff1a2..27b864905 100644
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@@ -36,7 +36,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, prepro
     labels = []
     problematic_ids = []
 
-    with open(filename, "r") as file:
+    with open(filename, "r", encoding='utf-8') as file:
         data = json.load(file)
         if "title" not in data["data"][0]:
             logger.warning(f"No title information found for documents in QA file: {filename}")
@@ -75,7 +75,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
     labels = []
     problematic_ids = []
 
-    with open(filename, "r") as file:
+    with open(filename, "r", encoding='utf-8') as file:
         for document in file:
             if max_docs:
                 if len(docs) > max_docs:
@@ -145,7 +145,7 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro
 
         ## Assign Labels to corresponding documents
         for qa in paragraph["qas"]:
-            if not qa["is_impossible"]:
+            if not qa.get("is_impossible", False):
                 for answer in qa["answers"]:
                     ans = answer["text"]
                     ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
@@ -397,7 +397,7 @@ def squad_json_to_jsonl(squad_file: str, output_file: str):
     :param output_file: Name of output file (SQuAD in jsonl format)
     :type output_file: str
     """
-    with open(squad_file) as json_file, open(output_file, "w") as jsonl_file:
+    with open(squad_file, encoding='utf-8') as json_file, open(output_file, "w", encoding='utf-8') as jsonl_file:
         squad_json = json.load(json_file)
 
         for doc in squad_json["data"]:
diff --git a/haystack/retriever/squad_to_dpr.py b/haystack/retriever/squad_to_dpr.py
index b298320df..a260eb891 100644
--- a/haystack/retriever/squad_to_dpr.py
+++ b/haystack/retriever/squad_to_dpr.py
@@ -139,8 +139,8 @@ def add_is_impossible(squad_data: dict, json_file_path: Path):
                 question["is_impossible"] = False
 
     squad_data["data"] = squad_articles
-    with open(new_path, "w") as filo:
-        json.dump(squad_data, filo, indent=4)
+    with open(new_path, "w", encoding='utf-8') as filo:
+        json.dump(squad_data, filo, indent=4, ensure_ascii=False)
 
     return new_path, squad_data
 
@@ -219,8 +219,8 @@ def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path,
     else:
         dataset_splits = {dpr_output_filename: iter_dpr}
     for path, set_iter in dataset_splits.items():
-        with open(path, "w") as json_ds:
-            json.dump(list(set_iter), json_ds, indent=4)
+        with open(path, "w", encoding='utf-8') as json_ds:
+            json.dump(list(set_iter), json_ds, indent=4, ensure_ascii=False)
 
 
 def get_hard_negative_contexts(retriever: BaseRetriever, question: str, answers: List[str],
@@ -242,7 +242,7 @@ def load_squad_file(squad_file_path: Path):
     if not squad_file_path.exists():
         raise FileNotFoundError
 
-    with open(squad_file_path) as squad_file:
+    with open(squad_file_path, encoding='utf-8') as squad_file:
         squad_data = json.load(squad_file)
 
     # squad_data["data"] = squad_data["data"][:10]  # sample
diff --git a/haystack/utils.py b/haystack/utils.py
index 285fbdd86..e29bcc093 100644
--- a/haystack/utils.py
+++ b/haystack/utils.py
@@ -70,7 +70,7 @@ def convert_labels_to_squad(labels_file: str):
     :param labels_file: path for export file from the labeling tool
     :return:
     """
-    with open(labels_file) as label_file:
+    with open(labels_file, encoding='utf-8') as label_file:
         labels = json.load(label_file)
 
     labels_grouped_by_documents = defaultdict(list)
@@ -112,7 +112,7 @@ def convert_labels_to_squad(labels_file: str):
 
         labels_in_squad_format["data"].append(squad_format_label)
 
-    with open("labels_in_squad_format.json", "w+") as outfile:
+    with open("labels_in_squad_format.json", "w+", encoding='utf-8') as outfile:
         json.dump(labels_in_squad_format, outfile)