KAG/kag/common/utils.py

502 lines
14 KiB
Python
Raw Normal View History

2024-10-24 11:46:15 +08:00
# -*- coding: utf-8 -*-
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
# flake8: noqa
import datetime
import random
2024-10-24 11:46:15 +08:00
import re
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
import string
2024-10-24 11:46:15 +08:00
import sys
import json
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
import hashlib
2024-10-24 11:46:15 +08:00
import os
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
import tempfile
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
import time
import uuid
2025-05-28 16:06:28 +08:00
import subprocess
import shlex
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
import requests
2024-10-24 11:46:15 +08:00
import importlib
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
import numpy as np
2025-05-07 10:15:56 +08:00
from typing import Tuple, TypeVar, Optional
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
from pathlib import Path
2024-10-24 11:46:15 +08:00
from shutil import copystat, copy2
from typing import Any, Union
from jinja2 import Environment, FileSystemLoader, Template
from stat import S_IWUSR as OWNER_WRITE_PERMISSION
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
from tenacity import retry, stop_after_attempt
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
from aiolimiter import AsyncLimiter
2024-10-24 11:46:15 +08:00
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
# ANSI escape sequences (SGR codes) for styling terminal output.
# `reset` (code 0) clears every attribute set by the sequences below.
reset = "\033[0m"
# Text attributes: code 1 = bold, code 4 = underline.
bold = "\033[1m"
underline = "\033[4m"
# Standard foreground colors: codes 31-37.
red = "\033[31m"
green = "\033[32m"
yellow = "\033[33m"
blue = "\033[34m"
magenta = "\033[35m"
cyan = "\033[36m"
white = "\033[37m"
2024-10-24 11:46:15 +08:00
2025-05-28 16:06:28 +08:00
def run_cmd(cmd, catch_stdout=True, catch_stderr=True, shell=False):
    """Run a command line and return the finished process.

    Args:
        cmd: The command line as a single string.
        catch_stdout: When true, capture stdout into ``result.stdout``
            (bytes); otherwise the child inherits the parent's stdout.
        catch_stderr: When true, capture stderr into ``result.stderr``
            (bytes); otherwise the child inherits the parent's stderr.
        shell: When true, run the command through the system shell.

    Returns:
        The ``subprocess.CompletedProcess`` produced by ``subprocess.run``.
        The return code is not checked here; inspect ``result.returncode``.
    """
    # With shell=True the shell must receive the whole command line as one
    # string. Passing a pre-split list would (on POSIX) execute only the first
    # token and hand the remaining tokens to the shell itself as positional
    # parameters, silently dropping the command's arguments.
    args = cmd if shell else shlex.split(cmd)
    stdout = subprocess.PIPE if catch_stdout else None
    stderr = subprocess.PIPE if catch_stderr else None
    return subprocess.run(args, stdout=stdout, stderr=stderr, shell=shell)
2024-10-24 11:46:15 +08:00
def append_python_path(path: str) -> bool:
    """
    Add the resolved, absolute form of `path` to the end of `sys.path`.

    Returns True when the entry was added, False when it was already present.
    """
    resolved = str(Path(path).resolve())
    if resolved in sys.path:
        return False
    sys.path.append(resolved)
    return True
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
2024-10-24 11:46:15 +08:00
def render_template(
    root_dir: Union[str, os.PathLike], file: Union[str, os.PathLike], **kwargs: Any
) -> None:
    """
    Render the template `file` found under `root_dir` and write the result to disk.

    The template is loaded through a filesystem loader rooted at `root_dir` and
    rendered with `kwargs` as its variables. If the file name ends in `.tmpl`,
    the file is first renamed to drop that suffix; the rendered text is then
    written (UTF-8) to the resulting path, overwriting the template source.
    """
    rendered = (
        Environment(loader=FileSystemLoader(root_dir))
        .get_template(str(file))
        .render(kwargs)
    )
    source_path = Path(root_dir) / file
    target_path = source_path
    if source_path.suffix == ".tmpl":
        # Strip the ".tmpl" suffix by renaming the file in place.
        target_path = source_path.with_suffix("")
        source_path.rename(target_path)
    target_path.write_text(rendered, "utf8")
def copytree(src: Path, dst: Path, **kwargs):
    """
    Recursively copy directory `src` into `dst`, templating entry names.

    Each entry name is rendered as a template with `kwargs` to obtain the
    destination name. Sub-directories are handled recursively and files are
    delegated to `copyfile`. Finally the directory's stat info is copied onto
    `dst` and `dst` is passed to `_make_writable`.
    """
    # Snapshot the listing before anything is created under dst.
    entry_names = [entry.name for entry in src.iterdir()]
    if not dst.exists():
        dst.mkdir(parents=True)
    for entry_name in entry_names:
        rendered_name = Template(entry_name).render(**kwargs)
        source_entry = src / entry_name
        target_entry = dst / rendered_name
        copy_entry = copytree if source_entry.is_dir() else copyfile
        copy_entry(source_entry, target_entry, **kwargs)
    copystat(src, dst)
    _make_writable(dst)
def copyfile(src: Path, dst: Path, **kwargs):
if dst.exists():
return
dst = Path(Template(str(dst)).render(**kwargs))
copy2(src, dst)
_make_writable(dst)
if dst.suffix != ".tmpl":
return
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
render_template("/", dst, **kwargs)
2024-10-24 11:46:15 +08:00
def remove_files_except(path, file, new_file):
    """
    Delete every regular file directly inside `path` except `file`, then
    rename the surviving `file` to `new_file` within the same directory.

    Sub-directories and other non-regular entries are left untouched.

    :param path: directory to clean up; a ``str`` or any ``os.PathLike``
    :param file: name (relative to `path`) of the file to keep
    :type file: str
    :param new_file: name (relative to `path`) the kept file is renamed to
    :type new_file: str
    :raises OSError: if listing, removing or renaming fails
    """
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        if os.path.isfile(file_path) and filename != file:
            os.remove(file_path)
    # Build both rename endpoints with os.path.join rather than the `/`
    # operator so that plain string paths work as well as pathlib.Path.
    os.rename(os.path.join(path, file), os.path.join(path, new_file))
def _make_writable(path):
    """
    Add the ``OWNER_WRITE_PERMISSION`` bit(s) to the mode of `path`.

    The existing mode bits are preserved; the module-level
    ``OWNER_WRITE_PERMISSION`` flag is OR-ed on top of them.

    :param path: filesystem path whose permissions are updated
    """
    writable_mode = os.stat(path).st_mode | OWNER_WRITE_PERMISSION
    os.chmod(path, writable_mode)
def escape_single_quotes(s: str):
    """
    Return `s` with every single quote ``'`` prefixed by a backslash.

    :param s: text to escape
    :type s: str
    :return: the escaped text
    """
    escaped_quote = "\\'"
    # Splitting on the quote and re-joining with the escaped form is
    # equivalent to replacing each occurrence.
    return escaped_quote.join(s.split("'"))
def load_json(content):
    """
    Parse `content` as JSON, tolerating trailing garbage after a valid value.

    If the first parse fails, the text is truncated at the position where the
    decoder reported the error and parsed once more. This recovers inputs
    such as a valid JSON document followed by extra text.

    :param content: JSON text
    :return: the decoded Python object
    :raises json.JSONDecodeError: if the truncated prefix is not valid JSON
        either
    """
    try:
        return json.loads(content)
    except json.JSONDecodeError as e:
        # `e.pos` is the absolute offset of the failure. `e.colno` is only
        # the column inside the failing line, so it is wrong for any
        # multi-line content.
        substr = content[: e.pos]
        return json.loads(substr)
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
def flatten_2d_list(nested_list):
    """
    Flatten one level of nesting.

    :param nested_list: iterable whose elements are themselves iterables
    :return: a new list with the items of every inner iterable, in order
    """
    flat = []
    for sublist in nested_list:
        flat.extend(sublist)
    return flat
2024-10-24 11:46:15 +08:00
def split_module_class_name(name: str, text: str) -> Tuple[str, str]:
    """
    Split a dotted `name` at its last dot into ``(module, class)``.

    :param name: fully qualified class name, e.g. ``foo.bar.MyClass``
    :type name: str
    :param text: describes the kind of class; only used in the error message
    :type text: str
    :return: the part before the last dot and the part after it
    :rtype: Tuple[str, str]
    :raises RuntimeError: if `name` contains no dot
    """
    module_name, dot, class_name = name.rpartition(".")
    if not dot:
        # No separator at all: `name` cannot be a qualified class name.
        raise RuntimeError("invalid %s class name: %s" % (text, name))
    return module_name, class_name
def dynamic_import_class(name: str, text: str):
    """
    Import and return the class identified by the dotted path `name`.

    :param name: fully qualified class name, e.g. ``foo.bar.MyClass``
    :type name: str
    :param text: describes the kind of class; used in the error message
        raised for a malformed `name`
    :type text: str
    :return: the class object found in the imported module
    :raises RuntimeError: if `name` is not a fully qualified class name, the
        module has no such attribute (or it is ``None``), or the attribute is
        not a class
    :raises ModuleNotFoundError: if the module part of `name` cannot be found
    """
    module_name, class_name = split_module_class_name(name, text)
    target = getattr(importlib.import_module(module_name), class_name, None)
    if target is None:
        raise RuntimeError(
            "class %r not found in module %r" % (class_name, module_name)
        )
    if isinstance(target, type):
        return target
    raise RuntimeError("%r is not a class" % (name,))
def processing_phrases(phrase):
    """
    Normalize `phrase` to lowercase text made of letters, digits and spaces.

    The input is converted with ``str()`` and lowercased. Every character
    that is not an ASCII letter, a digit, a character in the range
    U+4E00-U+9FA5 or a space is replaced by a single space. Leading and
    trailing whitespace is then stripped; inner runs of spaces are kept.

    :param phrase: any object; its ``str()`` form is normalized
    :return: the normalized string
    """
    lowered = str(phrase).lower()
    cleaned = re.sub("[^A-Za-z0-9\u4e00-\u9fa5 ]", " ", lowered)
    return cleaned.strip()
def to_camel_case(phrase):
    """
    Convert `phrase` to lower camel case.

    The phrase is first normalized with ``processing_phrases`` and its spaces
    are turned into underscores. The underscore-separated words are then
    concatenated: the first word is kept as is, every later word is
    capitalized.

    :param phrase: any object accepted by ``processing_phrases``
    :return: the camel-cased string
    """
    s = processing_phrases(phrase).replace(" ", "_")
    return "".join(
        word.capitalize() if i != 0 else word for i, word in enumerate(s.split("_"))
    )
def to_snake_case(name):
    """
    Convert `name` to snake case.

    Words are found as an ASCII letter followed by any number of lowercase
    letters or digits, so each uppercase letter starts a new word and every
    other character acts as a separator. The words are lowercased and joined
    with underscores.

    :param name: identifier to convert
    :type name: str
    :return: the snake-cased identifier
    """
    pieces = re.findall("[A-Za-z][a-z0-9]*", name)
    return "_".join(piece.lower() for piece in pieces)
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
def get_vector_field_name(property_key: str):
    """
    Build the vector field name for `property_key`.

    The key gets a ``_vector`` suffix, is converted with ``to_snake_case``
    and is prefixed with a single underscore.

    :param property_key: name of the property
    :type property_key: str
    :return: the derived field name
    """
    return "_" + to_snake_case(f"{property_key}_vector")
def get_sparse_vector_field_name(property_key: str):
    """Return the field name used for the sparse vector of ``property_key``.

    The name is built by appending the ``_sparse`` suffix to the property
    key, passing the result through the module-level ``to_snake_case``
    helper, and prefixing a single underscore.

    Args:
        property_key: Name of the property the sparse vector belongs to.

    Returns:
        The underscore-prefixed sparse vector field name.
    """
    return "_" + to_snake_case(f"{property_key}_sparse")
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
def split_list_into_n_parts(lst, n):
    """Split ``lst`` into ``n`` consecutive parts of near-equal size.

    Every part receives ``len(lst) // n`` elements, and the first
    ``len(lst) % n`` parts receive one extra element each, so part sizes
    differ by at most one and the original order is preserved. When ``n``
    exceeds ``len(lst)`` the trailing parts are empty.

    Args:
        lst: The sequence to split; it is sliced, not modified.
        n: Number of parts to produce.

    Returns:
        A list containing ``n`` slices of ``lst``.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    part_size, remainder = divmod(len(lst), n)
    result = []
    start = 0
    for i in range(n):
        # The first `remainder` parts absorb the leftover elements, one each.
        end = start + part_size + (1 if i < remainder else 0)
        result.append(lst[start:end])
        # The next part begins where this one ended.
        start = end
    return result
def generate_hash_id(value):
"""
Generates a hash ID and an abstracted version of the input value.
If the input value is a dictionary, it sorts the dictionary items and abstracts the dictionary.
If the input value is not a dictionary, it abstracts the value directly.
Args:
value: The input value to be hashed and abstracted.
Returns:
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
str: A hash ID generated from the input value.
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
"""
if isinstance(value, dict):
sorted_items = sorted(value.items())
key = str(sorted_items)
else:
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
key = str(value) # Ensure key is a string regardless of input type
# Encode to bytes for hashing
key = key.encode("utf-8")
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
hasher = hashlib.sha256()
hasher.update(key)
return hasher.hexdigest()
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
@retry(stop=stop_after_attempt(3), reraise=True)
refactor(all): kag v0.6 (#174) * add path find * fix find path * spg guided relation extraction * fix dict parse with same key * rename graphalgoclient to graphclient * rename graphalgoclient to graphclient * file reader supports http url * add checkpointer class * parser supports checkpoint * add build * remove incorrect logs * remove logs * update examples * update chain checkpointer * vectorizer batch size set to 32 * add a zodb backended checkpointer * add a zodb backended checkpointer * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. 
* commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * 增加solver * add kag * update outline splitter * add main test * add op * code refactor * add tools * fix outline splitter * fix outline prompt * graph api pass * commit with page rank * add search api and graph api * add markdown report * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * update markdown reader * update pdf reader * raise extractor failure * add default expr * add log * merge jc reader features * rm import * add build * fix zodb based checkpointer * add thread for zodb IO * fix(common): resolve mutlithread conflict in zodb IO * fix(common): load existing zodb checkpoints * update examples * update examples * fix zodb writer * add docstring * fix jieba version mismatch * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be 
resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * commit kag_config-tc.yaml 1、rename type to register_name 2、put a uniqe & specific name to register_name 3、rename reader to scanner 4、rename parser to reader 5、rename num_parallel to num_parallel_file, rename chain_level_num_paralle to num_parallel_chain_of_file 6、rename kag_extractor to schema_free_extractor, schema_base_extractor to schema_constraint_extractor 7、pre-define llm & vectorize_model and refer them in the yaml file Issues to be resolved: 1、examples of event extract & spg extract 2、statistic of indexer, such as nums of nodes & edges extracted, ratio of llm invoke. 3、Exceptions such as Debt, account does not exist should be thrown in llm invoke. 4、conf of solver need to be re-examined. * 1、fix bug in base_table_splitter * 1、fix bug in base_table_splitter * 1、fix bug in default_chain * update outline splitter * add main test * add markdown report * code refactor * fix outline splitter * fix outline prompt * update markdown reader * fix vectorizer num batch compute * add retry for vectorize model call * update markdown reader * raise extractor failure * rm parser * run pipeline * add config option of whether to perform llm config check, default to false * fix * recover pdf reader * several components can be null for default chain * 支持完整qa运行 * add if * remove unused code * 使用chunk兜底 * excluded source relation to choose * add generate * default recall 10 * add local memory * 排除相似边 * 增加保护 * 修复并发问题 * add debug logger * 支持topk参数化 * 支持chunk截断和调整spo select 的prompt * 增加查询请求保护 * 增加force_chunk配置 * fix entity linker algorithm * 增加sub query改写 * fix md reader dup in test * fix * merge knext to kag parallel * fix package * 修复指标下跌问题 * scanner update * scanner update * add doc and update example scripts * fix * add 
bridge to spg server * add format * fix bridge * update conf for baike * disable ckpt for spg server runner * llm invoke error default raise exceptions * chore(version): bump version to X.Y.Z * update default response generation prompt * add method getSummarizationMetrics * fix(common): fix project conf empty error * fix typo * 增加上报信息 * 修改main solver * postprocessor support spg server * 修改solver支持名 * fix language * 修改chunker接口,增加openapi * rename vectorizer to vectorize_model in spg server config * generate_random_string start with gen * add knext llm vector checker * add knext llm vector checker * add knext llm vector checker * solver移除默认值 * udpate yaml and register_name for baike * udpate yaml and register_name for baike * remove config key check * 修复llmmodule * fix knext project * udpate yaml and register_name for examples * udpate yaml and register_name for examples * Revert "udpate yaml and register_name for examples" This reverts commit b3fa5ca9ba749e501133ac67bd8746027ab839d9. * update register name * fix * fix * support multiple resigter names * update component * update reader register names (#183) * fix markdown reader * fix llm client for retry * feat(common): add processed chunk id checkpoint (#185) * update reader register names * add processed chunk id checkpoint * feat(example): add example config (#186) * update reader register names * add processed chunk id checkpoint * add example config file * add max_workers parameter for getSummarizationMetrics to make it faster * add csqa data generation script generate_data.py * commit generated csqa builder and solver data * add csqa basic project files * adjust split_length and num_threads_per_chain to match lightrag settings * ignore ckpt dirs * add csqa evaluation script eval.py * save evaluation scripts summarization_metrics.py and factual_correctness.py * save LightRAG output csqa_lightrag_answers.json * ignore KAG output csqa_kag_answers.json * add README.md for CSQA * fix(solver): fix solver pipeline 
conf (#191) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * update links and file paths * reformat csqa kag_config.yaml * reformat csqa python files * reformat getSummarizationMetrics and compare_summarization_answers * fix(solver): fix solver config (#192) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * add except * fix typo in csqa README.md * feat(conf): support reinitialize config for call from java side (#199) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * revert default response generation prompt * update project list * add README.md for the hotpotqa, 2wiki and musique examples * 增加spo检索 * turn off kag config dump by default * turn off knext schema dump by default * add .gitignore and fix kag_config.yaml * add README.md for the medicine example * add README.md for the supplychain example * bugfix for risk mining * use exact out * refactor(solver): format solver code (#205) * update reader register names * add processed chunk id checkpoint * add example config file * update solver pipeline config * fix project create * fix main solver conf * support reinitialize config for java call * black format --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> Co-authored-by: huaidong.xhd <huaidong.xhd@antgroup.com>
2025-01-03 17:10:51 +08:00
def download_from_http(url: str, dest: str = None) -> str:
    """Download the file at ``url`` and return the local path it was saved to.

    The response body is streamed to disk in 1 MiB chunks so the whole file
    is never held in memory, and the HTTP response is closed as soon as the
    download finishes or fails.

    Args:
        url (str): The HTTP URL of the file to be downloaded.
        dest (str, optional): Path of the file to write. When ``None`` the
            file is written into the system temporary directory and named
            after the last segment of the URL path.

    Returns:
        str: The local path of the downloaded file.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status code.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()  # Check if the request was successful
        if dest is None:
            # Use only the URL path for the file name so that a query string
            # or fragment ("file.pdf?token=...") does not end up in it.
            file_name = os.path.basename(urlparse(url).path)
            dest = os.path.join(tempfile.gettempdir(), file_name)
        with open(dest, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=1024**2):
                out_file.write(chunk)
    return dest
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
class RateLimiterManger:
    """Keeps one ``AsyncLimiter`` per name and hands it out on request."""

    def __init__(self):
        # Maps a limiter name to the limiter created for it.
        self.limiter_map = {}

    def get_rate_limiter(
        self, name: str, max_rate: float = 1000, time_period: float = 1
    ):
        """Return the limiter registered under ``name``, creating it if needed.

        Args:
            name: Key identifying the limiter.
            max_rate: Forwarded to ``AsyncLimiter`` when a limiter is created.
            time_period: Forwarded to ``AsyncLimiter`` when a limiter is created.

        Returns:
            The limiter stored under ``name``. ``max_rate`` and ``time_period``
            are ignored when a limiter with that name already exists.
        """
        try:
            return self.limiter_map[name]
        except KeyError:
            pass
        created = AsyncLimiter(max_rate, time_period)
        self.limiter_map[name] = created
        return created
def get_now(language="zh"):
    """Return today's local date followed by the weekday name in parentheses.

    Args:
        language: ``"zh"`` for Chinese output or ``"en"`` for English output.

    Returns:
        str: The formatted date, a space, and the weekday name in parentheses.

    Raises:
        ValueError: If ``language`` is neither ``"zh"`` nor ``"en"``.
    """
    # (language code, strftime pattern, weekday names indexed by weekday()).
    # NOTE(review): the zh pattern has no trailing "日" after the day number;
    # confirm this is intended.
    supported = (
        (
            "zh",
            "%Y年%m月%d",
            ["周一", "周二", "周三", "周四", "周五", "周六", "周日"],
        ),
        (
            "en",
            "%Y-%m-%d",
            [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday",
                "Saturday",
                "Sunday",
            ],
        ),
    )
    for code, date_format, days_of_week in supported:
        if language == code:
            break
    else:
        raise ValueError(
            "Unsupported language. Please use 'zh' for Chinese or 'en' for English."
        )

    now = datetime.datetime.now()
    weekday_name = days_of_week[now.weekday()]
    return now.strftime(date_format) + " (" + weekday_name + ")"
def generate_random_string(bit=8):
    """Return ``"gen"`` followed by ``bit`` random ASCII letters or digits.

    Args:
        bit: Number of random characters appended after the ``"gen"`` prefix.

    Returns:
        str: A string of length ``3 + bit``.
    """
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(bit):
        picked.append(random.choice(alphabet))
    suffix = "".join(picked)
    return f"gen{suffix}"
def generate_biz_id_with_type(biz_id, type_name):
    """Join ``biz_id`` and ``type_name`` with an underscore.

    Returns:
        str: ``"<biz_id>_<type_name>"``.
    """
    return "{}_{}".format(biz_id, type_name)
def get_p_clean(p):
    """Strip spacing, quote, colon and bracket characters from a Chinese string.

    Args:
        p (str): The text to clean.

    Returns:
        str | None: ``p`` with every space, tab, colon, parenthesis, square
        bracket and straight/curly quote removed, or ``None`` when ``p``
        contains no character in the range U+4E00..U+9FA5.
    """
    # Raw strings keep the regex escapes (\[, \], \(, \)) away from Python's
    # own string-escape processing, which warns about them on newer versions.
    if not re.search(r"[\u4e00-\u9fa5]", p):
        return None
    # NOTE(review): ":" and "()" each appear twice in this class; full-width
    # variants may have been intended - confirm against the upstream file.
    return re.sub(r"[ \t::()“”‘’'\"\[\]\(\)]+?", "", p)
def get_recall_node_label(label_set):
    """Return the first label in ``label_set`` that is not ``"Entity"``.

    Args:
        label_set: Iterable of label names.

    Returns:
        The first label encountered that differs from ``"Entity"``, or
        ``"Entity"`` when there is none (including for an empty iterable).
    """
    specific_labels = (label for label in label_set if label != "Entity")
    return next(specific_labels, "Entity")
feat(kag): update to v0.7 (#456) * add think cost * update csv scanner * add final rerank * add reasoner * add iterative planner * fix dpr search * fix dpr search * add reference data * move odps import * update requirement.txt * update 2wiki * add missing file * fix markdown reader * add iterative planning * update version * update runner * update 2wiki example * update bridge * merge solver and solver_new * add cur day * writer delete * update multi process * add missing files * fix report * add chunk retrieved executor * update try in stream runner result * add path * add math executor * update hotpotqa example * remove log * fix python coder solver * update hotpotqa example * fix python coder solver * update config * fix bad * add log * remove unused code * commit with task thought * move kag model to common * add default chat llm * fix * use static planner * support chunk graph node * add args * support naive rag * llm client support tool calls * add default async * add openai * fix result * fix markdown reader * fix thinker * update asyncio interface * feat(solver): add mcp support (#444) * 上传mcp client相关代码 * 1、完成一套mcp client的调用,从pipeline到planner、executor 2、允许json中传入多个mcp_server,通过大模型进行调用并选择 3、调通baidu_map_mcp的使用 * 1、schema * bugfix:删减冗余代码 --------- Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> * fix affairqa after solver refactor * fix affairqa after solver refactor * fix readme * add params * update version * update mcp executor * update mcp executor * solver add mcp executor * add missing file * add mpc executor * add executor * x * update * fix requirement * fix main llm config * fix solver * bugfix:修复invoke函数调用逻辑 * chg eva * update example * add kag layer * add step task * support dot refresh * support dot refresh * support dot refresh * support dot refresh * add retrieved num * add retrieved num * add pipelineconf * update ppr * update musique prompts * update * add to_dict for BuilderComponentData * async build * add deduce prompt * add 
deduce prompt * add deduce prompt * fix reader * add deduce prompt * add page thinker report * modify prmpt * add step status * add self cognition * add self cognition * add memory graph storage * add now time * update memory config * add now time * chg graph loader * 添加prqa数据集和代码 * bugfix:prqa调用逻辑修复 * optimize:优化代码逻辑,生成答案规范化 * add retry py code * update memory graph * update memory graph * fix * fix ner * add with_out_refer generator prompt * fix * close ckpt * fix query * fix query * update version * add llm checker * add llm checker * 1、上传evalutor.py以及修改gold_answer.json格式 2、优化代码逻辑 3、修改README.md文件 * update exp * update exp * rerank support * add static rewrite query * recall more chunks * fix graph load * add static rewrite query * fix bugs * add finish check * add finish check * add finish check * add finish check * 1、上传evalutor.py的结果 2、优化代码逻辑,优化readme文件 * add lf retry * add memory graph api * fix reader api * add ner * add metrics * fix bug * remove ner * add reraise fo retry * add edge prop to memory graph * add memory graph * 1、评测数据集结果修正 2、优化evaluator.py代码 3、删除结果不存在而gold_answer中有答案的问题 * 删除评测结果文件 * fix knext host addr * async eva * add lf prompt * add lf prompt * add config * add retry * add unknown check * add rc result * add rc result * add rc result * add rc result * 依据kag pipeline格式修改代码逻辑并通过测试 * bugfix:删除冗余代码 * fix report prompt * bugfix:触发重试机制 * bugfix:中文符号错误 * fix rethinker prompt * update version to 0.6.2b78 * update version * 1、修改evaluator.py,通过大模型计算准确率,符合最新调用逻辑 2、修改prompt,让没有回答的结果重复测试 * update affairqa for evaluate * update affairqa for evaluate * bugfix:修正数据集 * bugfix:修正数据集 * bugfix:修正数据集 * fix name conflict * bugfix:删除错误问题 * bugfix:文件名命名错误导致evaluator失败 * update for affairqa eval * bugfix:修改代码保持evaluate逻辑一致 * x * update for affairqa readme * remove temp eval scripts * bugfix for math deduce * merge 0.6.2_dev * merge 0.6.2_dev * fix * update client addr * updated version * update for affairqa eval * evaUtils 支持中文 * fix affairqa eval: * remove unused 
example * update kag config * fix default value * update readme * fix init * 注释信息修改,并添加部分class说明 * update example config * Tc 0.7.0 (#459) * 提交affairQA 代码 * fix affairqa eval --------- Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com> * fix all examples * reformat --------- Co-authored-by: peilong <peilong.zpl@antgroup.com> Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com> Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com> Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
2025-04-17 17:23:52 +08:00
def node_2_doc(node: dict):
    """Render a node's properties as newline-separated ``label:value`` text.

    The ``id`` property is skipped. ``name`` and ``description`` are labelled
    with fixed Chinese captions; every other property is labelled with its key.

    Args:
        node (dict): Property name to property value. List values are rendered
            one item per line; any other value is rendered with ``str``.

    Returns:
        str: One entry per property in dict order, joined by newlines. Empty
        when there is nothing to render.
    """
    lines = []
    for key, value in node.items():
        if key == "id":
            continue
        if isinstance(value, list):
            # Stringify each item so lists of numbers or other non-string
            # values do not make str.join raise TypeError.
            text = "\n".join(str(item) for item in value)
        else:
            text = str(value)
        if key == "name":
            lines.append(f"节点名称:{text}")
        elif key == "description":
            lines.append(f"描述:{text}")
        else:
            lines.append(f"{key}:{text}")
    return "\n".join(lines)
def extract_content_target(input_string):
    """
    Pull the ``content=[...]`` and ``target=...`` parts out of a string.

    Args:
        input_string (str): Text that may contain ``content=[...]`` and
            ``target=...`` fragments.

    Returns:
        tuple: ``(content, target)``. Each element is the stripped text that
        was found, or None when the corresponding fragment is absent.
    """
    # Content may span several lines, hence DOTALL; the non-greedy group
    # stops at the first closing bracket.
    found_content = re.search(r"content=\[(.*?)\]", input_string, re.DOTALL)
    # Target runs up to the next comma or closing bracket.
    found_target = re.search(r"target=([^,\]]+)", input_string)

    content = found_content.group(1).strip() if found_content else None

    target = None
    if found_target:
        # Drop surrounding whitespace first, then any trailing single quotes.
        target = found_target.group(1).strip().rstrip("'")

    return content, target
def generate_unique_message_key(message):
    """Build a key of the form ``KAG_<milliseconds>_<uuid5>`` for ``message``.

    The UUID part is a name-based UUID of ``str(message)`` in the URL
    namespace, so it is the same for equal messages; the millisecond
    timestamp is what varies between calls.

    Args:
        message: Any object; its ``str`` form is hashed into the UUID.

    Returns:
        str: The generated key.
    """
    # Current wall-clock time in whole milliseconds.
    millis = int(time.time() * 1000)
    digest = uuid.uuid5(uuid.NAMESPACE_URL, str(message))
    return "KAG_{}_{}".format(millis, digest)
def rrf_score(length, r: int = 1):
    """Return the reciprocal weights ``1 / (r + i)`` for ``i`` in ``range(length)``.

    Args:
        length: Number of weights to produce.
        r: Offset added to each zero-based position before taking the
            reciprocal.

    Returns:
        numpy.ndarray: Array of ``length`` weights, the first being ``1 / r``.
    """
    weights = []
    for position in range(length):
        weights.append(1 / (r + position))
    return np.array(weights)
2025-05-07 10:15:56 +08:00
T = TypeVar("T")


def resolve_instance(
    instance: Optional[Union[T, dict]],
    default_config: dict,
    from_config_func,
    expected_type=None,
) -> T:
    """Turn a config dict, ``None`` or a ready-made object into an instance.

    Args:
        instance: A config dict, ``None``, or an already built object.
        default_config: Config passed to ``from_config_func`` when
            ``instance`` is ``None``.
        from_config_func: Callable that builds an instance from a config dict.
        expected_type: Optional type (or tuple of types) an already built
            ``instance`` must satisfy; the check is skipped when falsy.

    Returns:
        The result of ``from_config_func`` for dict or ``None`` input,
        otherwise ``instance`` itself.

    Raises:
        TypeError: If ``instance`` is an object that is not an instance of
            ``expected_type``.
    """
    if isinstance(instance, dict):
        return from_config_func(instance)
    if instance is None:
        return from_config_func(default_config)

    type_ok = not expected_type or isinstance(instance, expected_type)
    if type_ok:
        return instance
    raise TypeError(f"Expected {expected_type}, got {type(instance)}")
2025-05-28 09:56:50 +08:00
2025-05-28 09:56:50 +08:00
def extract_tag_content(text):
    """Collect every ``<tag>...</tag>`` pair in ``text``, whatever the tag name.

    Args:
        text (str): Text to scan.

    Returns:
        list[tuple[str, str]]: ``(tag, content)`` pairs in order of
        appearance, with the content stripped of surrounding whitespace.
    """
    # The closing tag must repeat the opening tag's text (backreference \1);
    # DOTALL lets the content span several lines.
    pair_pattern = re.compile(r"<([^>]+)>(.*?)</\1>", flags=re.DOTALL)
    pairs = []
    for found in pair_pattern.finditer(text):
        pairs.append((found.group(1), found.group(2).strip()))
    return pairs
2025-05-28 09:56:50 +08:00
def extract_specific_tag_content(text, tag):
    """Return the stripped contents of every ``<tag ...>...</tag>`` in ``text``.

    The opening tag may carry attributes. Matching is non-greedy, so each
    match ends at the first closing tag that follows its opening tag.

    Args:
        text (str): Text to scan.
        tag (str): Tag name; it is inserted into the regular expression as-is.

    Returns:
        list[str]: Contents in order of appearance; empty when nothing matches.
    """
    tag_pattern = re.compile(rf"<{tag}\b[^>]*>(.*?)</{tag}>", flags=re.DOTALL)
    contents = []
    for found in tag_pattern.finditer(text):
        contents.append(found.group(1).strip())
    return contents
2025-06-20 17:13:20 +08:00
def extract_box_answer(text):
    """Return the content of the first ``\\boxed{...}`` in ``text``.

    Args:
        text (str): Text to scan.

    Returns:
        str: The text between the braces of the first ``\\boxed{...}``
        (up to the first ``}``), or ``""`` when there is none.
    """
    first_box = re.search(r"\\boxed\{([^}]*)\}", text)
    return first_box.group(1) if first_box else ""
def search_plan_extraction(text):
    """Return the content of the first ``<search ...>...</search>`` element.

    Newlines are removed from ``text`` before matching, and the tag name is
    matched case-insensitively.

    Args:
        text (str): Text to scan.

    Returns:
        str: The stripped content of the first search element, or ``""`` when
        the text contains none.
    """
    flattened = text.replace("\n", "")
    first_block = re.search(r'(?i)<search.*?>.*?</search>', flattened)
    if first_block is None:
        return ""
    # Re-scan the matched element to isolate the text between its tags.
    inner = re.search(
        r'<search.*?>(.*?)</search>', first_block.group(0), re.IGNORECASE
    )
    return inner.group(1).strip()