mirror of
https://github.com/OpenSPG/KAG.git
synced 2025-11-22 13:20:28 +00:00
* add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit 9705951d066b282ac49f0e1972559b646e7f906d. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
256 lines
8.0 KiB
Python
256 lines
8.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Copyright 2023 OpenSPG Authors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
|
|
# in compliance with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
# or implied.
|
|
from collections import OrderedDict
|
|
import re
|
|
import json
|
|
import os
|
|
import sys
|
|
from configparser import ConfigParser
|
|
from pathlib import Path
|
|
from ruamel.yaml import YAML
|
|
from typing import Optional
|
|
|
|
import click
|
|
|
|
from knext.common.utils import copytree, copyfile
|
|
from knext.project.client import ProjectClient
|
|
|
|
from knext.common.env import env, DEFAULT_HOST_ADDR
|
|
|
|
from kag.common.llm.llm_config_checker import LLMConfigChecker
|
|
from kag.common.vectorize_model.vectorize_model_config_checker import (
|
|
VectorizeModelConfigChecker,
|
|
)
|
|
from shutil import copy2
|
|
|
|
yaml = YAML()
|
|
|
|
|
|
def _render_template(namespace: str, tmpl: str, **kwargs):
    """
    Render the project skeleton and schema template into a new project directory.

    The directory is named after ``namespace`` and is created relative to the
    current working directory. The user supplied config file is then loaded,
    stamped with the project id and written to ``kag_config.yaml`` inside it.

    Args:
        namespace: Project namespace; also used as the project directory name.
        tmpl: Template name; selects the schema file ``{{<tmpl>}}.schema.tmpl``.
        **kwargs: Forwarded unchanged to ``copytree`` (presumably as template
            variables -- confirm against ``knext.common.utils``). Two keys are
            also read here: ``config_path`` (path of the YAML config to copy)
            and ``id`` (project id written to ``config["project"]["id"]``).

    Returns:
        ``Path`` of the project directory (relative, as built from ``namespace``).

    Raises:
        SystemExit: With status 1 when the schema template for ``tmpl`` is missing.
    """
    config_path = kwargs.get("config_path", None)
    project_dir = Path(namespace)
    # exist_ok avoids failing when the directory appears between check and create.
    project_dir.mkdir(parents=True, exist_ok=True)
    project_root = project_dir.resolve()

    import kag.templates.project

    src = Path(kag.templates.project.__path__[0])
    copytree(
        src,
        project_root,
        namespace=namespace,
        root=namespace,
        tmpl=tmpl,
        **kwargs,
    )

    import kag.templates.schema

    # The file name literally contains double braces, e.g. "{{default}}.schema.tmpl".
    schema_file_name = f"{{{{{tmpl}}}}}.schema.tmpl"
    src = Path(kag.templates.schema.__path__[0]) / schema_file_name
    if not src.exists():
        click.secho(
            f"ERROR: No such schema template: {tmpl}.schema.tmpl",
            fg="bright_red",
        )
        # Stop here: copying a template that does not exist cannot succeed.
        sys.exit(1)
    dst = project_root / "schema" / schema_file_name
    copyfile(src, dst, namespace=namespace, **{tmpl: namespace})

    # An empty config file is treated as an empty mapping.
    config = yaml.load(Path(config_path).read_text() or "{}")
    project_id = kwargs.get("id", None)
    config["project"]["id"] = project_id
    config_file_path = project_root / "kag_config.yaml"
    with open(config_file_path, "w") as config_file:
        yaml.dump(config, config_file)
    return project_dir
|
|
|
|
|
|
def _recover_project(prj_path: str):
    """
    Recover project by a project dir path.

    Looks the project up by the namespace found in ``env.project_config`` and
    creates it when the lookup returns nothing. The resulting project id is
    stored in ``env.config["project"]["id"]`` and persisted via ``env.dump()``.

    Args:
        prj_path: Path of the project directory; it must exist. It is otherwise
            only used in the messages printed here.

    Raises:
        SystemExit: With status 1 when ``prj_path`` does not exist or no
            namespace is configured.
    """
    if not Path(prj_path).exists():
        click.secho(f"ERROR: No such directory: {prj_path}", fg="bright_red")
        sys.exit(1)

    namespace = env.project_config.get("namespace", None)
    # This module uses the namespace as the project name as well.
    project_name = namespace
    desc = env.project_config.get("description", None)
    if not namespace:
        click.secho(
            f"ERROR: No project namespace found in {env.config_path}.",
            fg="bright_red",
        )
        sys.exit(1)

    client = ProjectClient()
    # Reuse the existing project when there is one, otherwise create it.
    project = client.get(namespace=namespace) or client.create(
        name=project_name, desc=desc, namespace=namespace
    )

    env.config["project"]["id"] = project.id
    env.dump()

    click.secho(
        f"Project [{project_name}] with namespace [{namespace}] was successfully recovered from [{prj_path}].",
        fg="bright_green",
    )
|
|
|
|
|
|
@click.option("--config_path", help="Path of config.", required=True)
@click.option(
    "--tmpl",
    help="Template of project, use default if not specified.",
    default="default",
    type=click.Choice(["default", "medical"], case_sensitive=False),
)
@click.option(
    "--delete_cfg",
    help="whether delete your defined .yaml file.",
    default=True,
    hidden=True,
)
def create_project(
    config_path: str, tmpl: Optional[str] = None, delete_cfg: bool = False
):
    """
    Create new project with a demo case.
    """
    # NOTE: the docstring above is left untouched on purpose; click shows a
    # command's docstring as its help text, so details are documented here.
    #
    # Steps:
    #   1. read ``project.namespace`` and ``project.host_addr`` from the YAML
    #      file at ``config_path``;
    #   2. create the project through ``ProjectClient``;
    #   3. render the project directory with ``_render_template``;
    #   4. send the rendered ``kag_config.yaml`` back via ``client.update``;
    #   5. remove ``config_path`` when ``delete_cfg`` is truthy.
    #
    # Exits with status 1 when the namespace or host address is missing and
    # raises ``click.BadParameter`` when the namespace is malformed.

    # A whitespace- or comment-only file parses to None; treat it as empty.
    config = yaml.load(Path(config_path).read_text() or "{}") or {}
    # ``project:`` with no value also yields None, hence ``or {}``.
    project_config = config.get("project") or {}
    namespace = project_config.get("namespace", None)
    # The namespace doubles as the project name.
    name = namespace
    host_addr = project_config.get("host_addr", None)

    if not namespace:
        click.secho("ERROR: namespace is required.", fg="bright_red")
        sys.exit(1)

    if not re.match(r"^[A-Z][A-Za-z0-9]{0,15}$", namespace):
        raise click.BadParameter(
            f"Invalid namespace: {namespace}."
            f" Must start with an uppercase letter, only contain letters and numbers, and have a maximum length of 16."
        )

    if not tmpl:
        tmpl = "default"

    if not host_addr:
        click.secho("ERROR: host_addr is required.", fg="bright_red")
        sys.exit(1)

    client = ProjectClient(host_addr=host_addr)
    project = client.create(name=name, namespace=namespace)
    # The id stays None when the server returned no usable project.
    project_id = project.id if project and project.id else None

    project_dir = _render_template(
        namespace=namespace,
        tmpl=tmpl,
        id=project_id,
        with_server=(host_addr is not None),
        host_addr=host_addr,
        name=name,
        config_path=config_path,
        delete_cfg=delete_cfg,
    )

    # Re-read the rendered config so the server receives it with the id set.
    config = yaml.load((Path(project_dir) / "kag_config.yaml").read_text() or "{}")
    client.update(id=project_id, config=json.dumps(config))

    if delete_cfg and os.path.exists(config_path):
        os.remove(config_path)

    click.secho(
        f"Project with namespace [{namespace}] was successfully created in {project_dir.resolve()} \n"
        + "You can checkout your project with: \n"
        + f" cd {project_dir}",
        fg="bright_green",
    )
|
|
|
|
|
|
@click.option("--host_addr", help="Address of spg server.", default=None)
@click.option("--proj_path", help="Path of project.", default=None)
def restore_project(host_addr, proj_path):
    """Restore the current project on the server and refresh its config."""
    # Parameters:
    #   host_addr: server address; falls back to ``env.host_addr`` when None.
    #   proj_path: project directory; falls back to ``env.project_path`` when None.
    #
    # The project is looked up by ``env.namespace`` and created when missing.
    # Its id and the host address are written to ``env.config["project"]`` and
    # persisted with ``env.dump()``. Exits with status 1 when the project is
    # missing and there is no host address to create it on.
    if host_addr is None:
        host_addr = env.host_addr
    if proj_path is None:
        proj_path = env.project_path
    proj_client = ProjectClient(host_addr=host_addr)

    project_wanted = proj_client.get_by_namespace(namespace=env.namespace)
    if project_wanted:
        project_id = project_wanted.id
    elif host_addr:
        # Same host address as above, so the existing client is reused.
        project = proj_client.create(name=env.name, namespace=env.namespace)
        project_id = project.id
    else:
        # Without this branch ``project_id`` would be unbound below.
        click.secho(
            f"ERROR: No project with namespace [{env.namespace}] was found "
            "and host_addr is not set, so it cannot be created.",
            fg="bright_red",
        )
        sys.exit(1)

    # write project id and host addr to kag_config.yaml
    env.config["project"]["id"] = project_id
    env.config["project"]["host_addr"] = host_addr
    env.dump()
    if proj_path:
        _recover_project(proj_path)
        update_project(proj_path)
|
|
|
|
|
|
@click.option("--proj_path", help="Path of config.", default=None)
def update_project(proj_path):
    """Check the LLM and vectorizer settings, then update the project config."""
    # Parameters:
    #   proj_path: project path; falls back to ``env.project_path`` when falsy.
    #              It is only used in the success message.
    #
    # The ``chat_llm`` and ``vectorizer`` sections of ``env.config`` are passed
    # to their checkers as JSON. The value returned by the vectorizer checker
    # is stored as ``vector_dimensions``. Any failure is reported and the
    # process exits with status 1.
    if not proj_path:
        proj_path = env.project_path
    client = ProjectClient(host_addr=env.host_addr)

    llm_config_checker = LLMConfigChecker()
    vectorize_model_config_checker = VectorizeModelConfigChecker()
    llm_config = env.config.get("chat_llm", {})
    vectorize_model_config = env.config.get("vectorizer", {})
    try:
        llm_config_checker.check(json.dumps(llm_config))
        dim = vectorize_model_config_checker.check(json.dumps(vectorize_model_config))
        env.config["vectorizer"]["vector_dimensions"] = dim
    except Exception as e:
        # Top-level CLI boundary: report whatever went wrong and signal failure.
        click.secho(f"Error: {e}", fg="bright_red")
        sys.exit(1)

    client.update(id=env.id, config=json.dumps(env.config))
    click.secho(
        f"Project [{env.name}] with namespace [{env.namespace}] was successfully updated from [{proj_path}].",
        fg="bright_green",
    )
|
|
|
|
@click.option("--host_addr", help="Address of spg server.", default=DEFAULT_HOST_ADDR)
def list_project(host_addr):
    # Print every project known to the server at ``host_addr`` as a
    # two-column table: a bold header, a dashed rule, then one row per
    # project. ``get_all()`` is iterated with ``.items()`` as
    # (project name, project id) pairs.
    projects = ProjectClient(host_addr=host_addr).get_all()

    name_header, id_header = "Project Name", "Project ID"
    header_line = " | ".join([name_header, id_header])
    # Rule width: both header labels plus the three-character " | " separator.
    rule = "-" * (len(name_header) + len(id_header) + 3)

    click.echo(click.style(header_line, fg="bright_green", bold=True))
    click.echo(click.style(rule, fg="bright_green"))

    for project_name, project_id in projects.items():
        row = f"{project_name:<20} | {project_id:<10}"
        click.echo(click.style(row, fg="bright_green"))
|