Fix error when the is_impossible key is missing and JSON dump encoding error (#868)

* Fix error when the is_impossible key is missing and fix JSON dump encoding for multilingual data. Fixes #867.
* Fix file encoding: all files are now opened with UTF-8.
2025-09-18 04:33:34 +00:00 · 2021-03-02 20:54:58 +08:00 · 2021-03-02 20:54:58 +08:00 · db75498278
commit db75498278
parent 762f194b27
5 changed files with 13 additions and 13 deletions
--- a/haystack/connector/crawler.py
+++ b/haystack/connector/crawler.py
@ -142,7 +142,7 @@ class Crawler(BaseComponent):
            if base_url:
                data['meta']['base_url'] = base_url
            data['text'] = text
-            with open(file_path, 'w') as f:
+            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(str(data))
            paths.append(file_path)

--- a/haystack/pipeline.py
+++ b/haystack/pipeline.py
@ -198,7 +198,7 @@ class Pipeline(ABC):
                                             variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
                                             `_` sign must be used to specify nested hierarchical properties.
        """
-        with open(path, "r") as stream:
+        with open(path, "r", encoding='utf-8') as stream:
            data = yaml.safe_load(stream)

        if pipeline_name is None:
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@ -36,7 +36,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, prepro
    labels = []
    problematic_ids = []

-    with open(filename, "r") as file:
+    with open(filename, "r", encoding='utf-8') as file:
        data = json.load(file)
        if "title" not in data["data"][0]:
            logger.warning(f"No title information found for documents in QA file: {filename}")
@ -75,7 +75,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
    labels = []
    problematic_ids = []

-    with open(filename, "r") as file:
+    with open(filename, "r", encoding='utf-8') as file:
        for document in file:
            if max_docs:
                if len(docs) > max_docs:
@ -145,7 +145,7 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro

        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
-            if not qa["is_impossible"]:
+            if not qa.get("is_impossible", False):
                for answer in qa["answers"]:
                    ans = answer["text"]
                    ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
@ -397,7 +397,7 @@ def squad_json_to_jsonl(squad_file: str, output_file: str):
    :param output_file: Name of output file (SQuAD in jsonl format)
    :type output_file: str
    """
-    with open(squad_file) as json_file, open(output_file, "w") as jsonl_file:
+    with open(squad_file, encoding='utf-8') as json_file, open(output_file, "w", encoding='utf-8') as jsonl_file:
        squad_json = json.load(json_file)

        for doc in squad_json["data"]:
--- a/haystack/retriever/squad_to_dpr.py
+++ b/haystack/retriever/squad_to_dpr.py
@ -139,8 +139,8 @@ def add_is_impossible(squad_data: dict, json_file_path: Path):
                question["is_impossible"] = False

    squad_data["data"] = squad_articles
-    with open(new_path, "w") as filo:
-        json.dump(squad_data, filo, indent=4)
+    with open(new_path, "w", encoding='utf-8') as filo:
+        json.dump(squad_data, filo, indent=4, ensure_ascii=False)

    return new_path, squad_data

@ -219,8 +219,8 @@ def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path,
    else:
        dataset_splits = {dpr_output_filename: iter_dpr}
    for path, set_iter in dataset_splits.items():
-        with open(path, "w") as json_ds:
-            json.dump(list(set_iter), json_ds, indent=4)
+        with open(path, "w", encoding='utf-8') as json_ds:
+            json.dump(list(set_iter), json_ds, indent=4, ensure_ascii=False)


 def get_hard_negative_contexts(retriever: BaseRetriever, question: str, answers: List[str],
@ -242,7 +242,7 @@ def load_squad_file(squad_file_path: Path):
    if not squad_file_path.exists():
        raise FileNotFoundError

-    with open(squad_file_path) as squad_file:
+    with open(squad_file_path, encoding='utf-8') as squad_file:
        squad_data = json.load(squad_file)

    # squad_data["data"] = squad_data["data"][:10]  # sample
--- a/haystack/utils.py
+++ b/haystack/utils.py
@ -70,7 +70,7 @@ def convert_labels_to_squad(labels_file: str):
    :param labels_file: path for export file from the labeling tool
    :return:
    """
-    with open(labels_file) as label_file:
+    with open(labels_file, encoding='utf-8') as label_file:
        labels = json.load(label_file)

    labels_grouped_by_documents = defaultdict(list)
@ -112,7 +112,7 @@ def convert_labels_to_squad(labels_file: str):

        labels_in_squad_format["data"].append(squad_format_label)

-    with open("labels_in_squad_format.json", "w+") as outfile:
+    with open("labels_in_squad_format.json", "w+", encoding='utf-8') as outfile:
        json.dump(labels_in_squad_format, outfile)