mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-18 04:33:34 +00:00
Fix error when the is_impossible key is missing and JSON dump encoding error (#868)
* Fix error when the is_impossible key is missing from a question, and fix JSON dump encoding for multilingual data. Fixes #867. * Fix file encoding: all files are now opened with UTF-8.
This commit is contained in:
parent
762f194b27
commit
db75498278
@ -142,7 +142,7 @@ class Crawler(BaseComponent):
|
||||
if base_url:
|
||||
data['meta']['base_url'] = base_url
|
||||
data['text'] = text
|
||||
with open(file_path, 'w') as f:
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(str(data))
|
||||
paths.append(file_path)
|
||||
|
||||
|
@ -198,7 +198,7 @@ class Pipeline(ABC):
|
||||
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
|
||||
`_` sign must be used to specify nested hierarchical properties.
|
||||
"""
|
||||
with open(path, "r") as stream:
|
||||
with open(path, "r", encoding='utf-8') as stream:
|
||||
data = yaml.safe_load(stream)
|
||||
|
||||
if pipeline_name is None:
|
||||
|
@ -36,7 +36,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, prepro
|
||||
labels = []
|
||||
problematic_ids = []
|
||||
|
||||
with open(filename, "r") as file:
|
||||
with open(filename, "r", encoding='utf-8') as file:
|
||||
data = json.load(file)
|
||||
if "title" not in data["data"][0]:
|
||||
logger.warning(f"No title information found for documents in QA file: {filename}")
|
||||
@ -75,7 +75,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
|
||||
labels = []
|
||||
problematic_ids = []
|
||||
|
||||
with open(filename, "r") as file:
|
||||
with open(filename, "r", encoding='utf-8') as file:
|
||||
for document in file:
|
||||
if max_docs:
|
||||
if len(docs) > max_docs:
|
||||
@ -145,7 +145,7 @@ def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PrePro
|
||||
|
||||
## Assign Labels to corresponding documents
|
||||
for qa in paragraph["qas"]:
|
||||
if not qa["is_impossible"]:
|
||||
if not qa.get("is_impossible", False):
|
||||
for answer in qa["answers"]:
|
||||
ans = answer["text"]
|
||||
ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
|
||||
@ -397,7 +397,7 @@ def squad_json_to_jsonl(squad_file: str, output_file: str):
|
||||
:param output_file: Name of output file (SQuAD in jsonl format)
|
||||
:type output_file: str
|
||||
"""
|
||||
with open(squad_file) as json_file, open(output_file, "w") as jsonl_file:
|
||||
with open(squad_file, encoding='utf-8') as json_file, open(output_file, "w", encoding='utf-8') as jsonl_file:
|
||||
squad_json = json.load(json_file)
|
||||
|
||||
for doc in squad_json["data"]:
|
||||
|
@ -139,8 +139,8 @@ def add_is_impossible(squad_data: dict, json_file_path: Path):
|
||||
question["is_impossible"] = False
|
||||
|
||||
squad_data["data"] = squad_articles
|
||||
with open(new_path, "w") as filo:
|
||||
json.dump(squad_data, filo, indent=4)
|
||||
with open(new_path, "w", encoding='utf-8') as filo:
|
||||
json.dump(squad_data, filo, indent=4, ensure_ascii=False)
|
||||
|
||||
return new_path, squad_data
|
||||
|
||||
@ -219,8 +219,8 @@ def save_dataset(iter_dpr: Iterator, dpr_output_filename: Path,
|
||||
else:
|
||||
dataset_splits = {dpr_output_filename: iter_dpr}
|
||||
for path, set_iter in dataset_splits.items():
|
||||
with open(path, "w") as json_ds:
|
||||
json.dump(list(set_iter), json_ds, indent=4)
|
||||
with open(path, "w", encoding='utf-8') as json_ds:
|
||||
json.dump(list(set_iter), json_ds, indent=4, ensure_ascii=False)
|
||||
|
||||
|
||||
def get_hard_negative_contexts(retriever: BaseRetriever, question: str, answers: List[str],
|
||||
@ -242,7 +242,7 @@ def load_squad_file(squad_file_path: Path):
|
||||
if not squad_file_path.exists():
|
||||
raise FileNotFoundError
|
||||
|
||||
with open(squad_file_path) as squad_file:
|
||||
with open(squad_file_path, encoding='utf-8') as squad_file:
|
||||
squad_data = json.load(squad_file)
|
||||
|
||||
# squad_data["data"] = squad_data["data"][:10] # sample
|
||||
|
@ -70,7 +70,7 @@ def convert_labels_to_squad(labels_file: str):
|
||||
:param labels_file: path for export file from the labeling tool
|
||||
:return:
|
||||
"""
|
||||
with open(labels_file) as label_file:
|
||||
with open(labels_file, encoding='utf-8') as label_file:
|
||||
labels = json.load(label_file)
|
||||
|
||||
labels_grouped_by_documents = defaultdict(list)
|
||||
@ -112,7 +112,7 @@ def convert_labels_to_squad(labels_file: str):
|
||||
|
||||
labels_in_squad_format["data"].append(squad_format_label)
|
||||
|
||||
with open("labels_in_squad_format.json", "w+") as outfile:
|
||||
with open("labels_in_squad_format.json", "w+", encoding='utf-8') as outfile:
|
||||
json.dump(labels_in_squad_format, outfile)
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user