openspg/python/knext/examples/medical/builder/model/dataset/convert_util.py

# -*- coding: utf-8 -*-
#
#  Copyright 2023 Ant Group CO., Ltd.
#
#  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
#  in compliance with the License. You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software distributed under the License
#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
#  or implied.

import argparse
import json

from knext.core.schema import Schema


def get_schema(spg_type_name):
    schema = Schema()
    spg_type = schema.query_spg_type(spg_type_name)
    return spg_type


def get_re_prompt(template_path, spg_type) -> str:
    """
    生成RE任务的Prompt
    """
    schema_text = ""
    for k, v in spg_type.properties.items():
        if v.name in ["id", "description"]:
            continue
        spo = "\"subject\":\"{}\",\"predicate\":\"{}\",\"object\":\"{}\"".format(spg_type.name_zh, v.name_zh,
                                                                                 v.object_type_name_zh)
        spo = "{" + spo + "}\n"
        schema_text = schema_text + spo

    f = open(template_path, "r")
    prompt_template = json.load(f)
    prompt = prompt_template["re"].replace("${schema}", schema_text)
    return prompt


def get_ner_prompt(template_path, spg_type) -> str:
    """
    生成NER任务的Prompt
    """
    f = open(template_path, "r")
    prompt_template = json.load(f)
    prompt = prompt_template["ner"].replace("${schema}", f'[{spg_type.name}:{spg_type.name_zh}]')
    return prompt


def process(src_path: str, tgt_path: str, entity_type: str, template_path: str, task_type: str):
    spg_type = get_schema(entity_type)

    writer = open(tgt_path, "w", encoding="utf-8")
    with open(src_path, "r", encoding="utf-8") as reader:
        for line in reader:
            print(line)
            record = json.loads(line)
            if task_type == 'RE':
                prompt_template = get_re_prompt(template_path, spg_type)
                instruct = prompt_template.replace("${input}", record["input"])
            elif task_type == 'NER':
                prompt_template = get_ner_prompt(template_path, spg_type)
                instruct = prompt_template.replace("${input}", record["input"])
            else:
                raise KeyError

            record = {'content': instruct, 'summary': record['output']}
            writer.write(json.dumps(record, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    '''

    python convert_util.py \
        --entity_type Medical.Disease \
        --task_type RE \
        --src_path RE/sample.json \
        --tgt_path RE/processed.json \
        --template_path ../../../schema/prompt.json

    '''

    parse = argparse.ArgumentParser()
    parse.add_argument("--entity_type", type=str)
    parse.add_argument("--task_type", type=str, choices=['RE', 'NER'])
    parse.add_argument("--src_path", type=str, default="NER/sample.json")
    parse.add_argument("--tgt_path", type=str, default="NER/processed.json")
    parse.add_argument("--template_path", type=str, default="../../../schema/prompt.json")

    options = parse.parse_args()
    options = vars(options)
    process(**options)
init commit 2023-10-26 10:34:08 +08:00			`# -- coding: utf-8 --`
			`#`
			`# Copyright 2023 Ant Group CO., Ltd.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except`
			`# in compliance with the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software distributed under the License`
			`# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express`
			`# or implied.`

			`import argparse`
			`import json`

			`from knext.core.schema import Schema`


			`def get_schema(spg_type_name):`
			`schema = Schema()`
			`spg_type = schema.query_spg_type(spg_type_name)`
			`return spg_type`


			`def get_re_prompt(template_path, spg_type) -> str:`
			`"""`
			`生成RE任务的Prompt`
			`"""`
			`schema_text = ""`
			`for k, v in spg_type.properties.items():`
			`if v.name in ["id", "description"]:`
			`continue`
			`spo = "\"subject\":\"{}\",\"predicate\":\"{}\",\"object\":\"{}\"".format(spg_type.name_zh, v.name_zh,`
			`v.object_type_name_zh)`
			`spo = "{" + spo + "}\n"`
			`schema_text = schema_text + spo`

			`f = open(template_path, "r")`
			`prompt_template = json.load(f)`
			`prompt = prompt_template["re"].replace("${schema}", schema_text)`
			`return prompt`


			`def get_ner_prompt(template_path, spg_type) -> str:`
			`"""`
			`生成NER任务的Prompt`
			`"""`
			`f = open(template_path, "r")`
			`prompt_template = json.load(f)`
			`prompt = prompt_template["ner"].replace("${schema}", f'[{spg_type.name}:{spg_type.name_zh}]')`
			`return prompt`


			`def process(src_path: str, tgt_path: str, entity_type: str, template_path: str, task_type: str):`
			`spg_type = get_schema(entity_type)`

			`writer = open(tgt_path, "w", encoding="utf-8")`
			`with open(src_path, "r", encoding="utf-8") as reader:`
			`for line in reader:`
			`print(line)`
			`record = json.loads(line)`
			`if task_type == 'RE':`
			`prompt_template = get_re_prompt(template_path, spg_type)`
			`instruct = prompt_template.replace("${input}", record["input"])`
			`elif task_type == 'NER':`
			`prompt_template = get_ner_prompt(template_path, spg_type)`
			`instruct = prompt_template.replace("${input}", record["input"])`
			`else:`
			`raise KeyError`

			`record = {'content': instruct, 'summary': record['output']}`
			`writer.write(json.dumps(record, ensure_ascii=False) + "\n")`


			`if __name__ == "__main__":`
			`'''`

			`python convert_util.py \`
			`--entity_type Medical.Disease \`
			`--task_type RE \`
			`--src_path RE/sample.json \`
			`--tgt_path RE/processed.json \`
			`--template_path ../../../schema/prompt.json`

			`'''`

			`parse = argparse.ArgumentParser()`
			`parse.add_argument("--entity_type", type=str)`
			`parse.add_argument("--task_type", type=str, choices=['RE', 'NER'])`
			`parse.add_argument("--src_path", type=str, default="NER/sample.json")`
			`parse.add_argument("--tgt_path", type=str, default="NER/processed.json")`
			`parse.add_argument("--template_path", type=str, default="../../../schema/prompt.json")`

			`options = parse.parse_args()`
			`options = vars(options)`
			`process(**options)`