update examples

This commit is contained in:
锦呈 2025-06-26 14:18:56 +08:00
parent 59376537ae
commit 9d8592bd73
229 changed files with 7 additions and 4556 deletions

View File

@ -1065,6 +1065,5 @@ if __name__ == "__main__":
file_path = os.path.join(
dir_path, "../../../../tests/unit/builder/data", "需求内容test.md"
)
file_path = "/Users/zhangxinhong.zxh/workspace/KAG/dep/KAG/kag/examples/AFAC2024/builder/data/BY04.md"
chunks = reader.invoke(file_path, write_ckpt=False)
print(chunks)

View File

@ -1 +0,0 @@
ckpt/

View File

@ -1,14 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
"""
Builder Dir.
"""

View File

@ -1,15 +0,0 @@
from kag.common.checkpointer import CheckPointer, CheckpointerManager

# Inspect the most recent entry of the OutlineExtractor diskcache checkpoint.
_CKPT_CONFIG = {
    "type": "diskcache",
    # "ckpt_dir": "ckpt/SchemaFreeExtractor",
    "ckpt_dir": "ckpt/OutlineExtractor",
}

checkpointer: CheckPointer = CheckpointerManager.get_checkpointer(_CKPT_CONFIG)

if checkpointer.size() > 0:
    # Read back the last key written to the checkpoint store.
    latest_key = checkpointer.keys()[-1]
    print(checkpointer.read_from_ckpt(latest_key))
else:
    print("checkpoint is empty")

View File

@ -1,14 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
"""
Place the files to be used for building the index in this directory.
"""

View File

@ -1,50 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
import asyncio
import logging
import argparse
import os
from kag.common.registry import import_modules_from_path
from kag.builder.runner import BuilderChainRunner
logger = logging.getLogger(__name__)
async def buildKB(file_path):
    """Build the knowledge-base index for *file_path* with the configured builder pipeline."""
    from kag.common.conf import KAG_CONFIG

    pipeline_conf = KAG_CONFIG.all_config["kag_builder_pipeline"]
    runner = BuilderChainRunner.from_config(pipeline_conf)
    await runner.ainvoke(file_path)
    logger.info(f"\n\nbuildKB successfully for {file_path}\n\n")
if __name__ == "__main__":
    # Register project components before the pipeline is constructed.
    import_modules_from_path(".")

    arg_parser = argparse.ArgumentParser(description="args")
    arg_parser.add_argument(
        "--corpus_file",
        type=str,
        help="test file name in /data",
        default="./data/附件1 10kV110kV线路保护及辅助装置标准化设计规范.md",
    )
    cli_args = arg_parser.parse_args()

    # Resolve the corpus file relative to this script's directory.
    resolved_path = os.path.join(os.path.dirname(__file__), cli_args.corpus_file)
    asyncio.run(buildKB(resolved_path))

View File

@ -1,14 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
"""
Place the prompts to be used for building the index in this directory.
"""

View File

@ -1,146 +0,0 @@
#------------project configuration start----------------#
openie_llm: &openie_llm
type: maas
base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
api_key: key
model: qwen2.5-7b-instruct-1m
enable_check: false
max_tokens: 8192
chat_llm: &chat_llm
type: maas
base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
api_key: key
model: qwen2.5-72b-instruct
enable_check: false
max_tokens: 8192
ner_llm:
type: maas
base_url: https://dashscope.aliyuncs.com/compatible-mode/v1
api_key: key
model: qwen2.5-72b-instruct
enable_check: false
max_tokens: 8192
vectorize_model: &vectorize_model
api_key: key
base_url: https://api.siliconflow.cn/v1
model: BAAI/bge-m3
type: openai
vector_dimensions: 1024
vectorizer: *vectorize_model
log:
level: INFO
project:
biz_scene: default
host_addr: http://127.0.0.1:8887
id: '5'
language: en
namespace: EastElectric
#------------project configuration end----------------#
#------------kag-builder configuration start----------------#
e1: &e1
type: schema_free_extractor # kag.builder.component.extractor.schema_free_extractor.SchemaFreeExtractor
llm: *openie_llm
ner_prompt:
type: default_ner # kag.builder.prompt.default.ner.OpenIENERPrompt
std_prompt:
type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt
triple_prompt:
type: default_triple # kag.builder.prompt.default.triple.OpenIETriplePrompt
e2: &e2
type: outline_extractor # kag.builder.component.extractor.outline_extractor.OutlineExtractor
e3: &e3
type: summary_extractor # kag.builder.component.extractor.summary_extractor.SummaryExtractor
e4: &e4
type: chunk_extractor # kag.builder.component.extractor.chunk_extractor.ChunkExtractor
kag_builder_pipeline:
chain:
type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain
extractor:
- *e2
- *e3
- *e4
reader:
type: md_reader # kag.builder.component.reader.dict_reader.DictReader
cut_depth: 6
post_processor:
type: kag_post_processor # kag.builder.component.postprocessor.kag_postprocessor.KAGPostProcessor
splitter:
type: length_splitter # kag.builder.component.splitter.length_splitter.LengthSplitter
split_length: 1000
window_length: 0
vectorizer:
type: batch_vectorizer # kag.builder.component.vectorizer.batch_vectorizer.BatchVectorizer
vectorize_model: *vectorize_model
writer:
type: kg_writer # kag.builder.component.writer.kg_writer.KGWriter
num_threads_per_chain: 1
num_chains: 16
scanner:
type: file_scanner # kag.builder.component.scanner.file_scanner.FileScanner (NOTE: original comment referenced dataset_scanner.HotpotqaCorpusScanner — stale copy-paste; verify the exact class path)
#------------kag-builder configuration end----------------#
#------------kag-solver configuration start----------------#
search_api: &search_api
type: openspg_search_api #kag.solver.tools.search_api.impl.openspg_search_api.OpenSPGSearchAPI
graph_api: &graph_api
type: openspg_graph_api #kag.solver.tools.graph_api.impl.openspg_graph_api.OpenSPGGraphApi
r1: &r1
type: vector_chunk_retriever
vectorize_model: *vectorize_model
search_api: *search_api
top_k: 10
r2: &r2
type: text_chunk_retriever
search_api: *search_api
top_k: 10
r3: &r3
type: outline_chunk_retriever # recall chunk through outline
search_api: *search_api
graph_api: *graph_api
vectorize_model: *vectorize_model
top_k: 5
r4: &r4
type: summary_chunk_retriever # recall chunk through summary
search_api: *search_api
graph_api: *graph_api
vectorize_model: *vectorize_model
top_k: 5
retriever_executor: &retriever_executor
type: kag_hybrid_retrieval_executor
retrievers:
- *r3
merger:
type: kag_merger
kag_solver_pipeline:
type: kag_static_pipeline
planner:
type: kag_static_planner
llm: *chat_llm
plan_prompt:
type: default_retriever_static_planning
rewrite_prompt:
type: default_query_rewrite
executors:
- *retriever_executor
generator:
type: llm_generator_with_thought
llm_client: *chat_llm
#------------kag-solver configuration end----------------#

View File

@ -1,20 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
"""
Place the DSL file for graph reasoning in this directory.
For example:
```company.dsl
MATCH (s:DEFAULT.Company)
RETURN s.id, s.address
```
"""

View File

@ -1,153 +0,0 @@
namespace EastElectric
Chunk(文本块): EntityType
properties:
content(内容): Text
index: TextAndVector
Outline(标题大纲): EntityType
properties:
content(内容): Text
index: TextAndVector
relations:
sourceChunk(关联): Chunk
childOf(子标题): Outline
Diagram(图表): EntityType
properties:
content(内容): Text
index: TextAndVector
relations:
sourceChunk(关联): Chunk
Summary(文本摘要): EntityType
properties:
content(内容): Text
index: TextAndVector
relations:
sourceChunk(关联): Chunk
childOf(子摘要): Summary
ArtificialObject(人造物体): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Astronomy(天文学): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Building(建筑): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Creature(生物): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Concept(概念): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Date(日期): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
GeographicLocation(地理位置): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Keyword(关键词): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Medicine(药物): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
NaturalScience(自然科学): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Organization(组织机构): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Person(人物): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Transport(运输): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Works(作品): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Others(其他): EntityType
properties:
desc(描述): Text
index: TextAndVector
semanticType(语义类型): Text
index: Text
Event(事件): EventType
properties:
subject(主体): Person
participants(参与者): Person
constraint: MultiValue
time(时间): Date
location(地点): GeographicLocation
abstract(摘要): Text
index: TextAndVector
type(事件类型): Text
index: Text
SemanticConcept(语义概念): EntityType
properties:
desc(内容): Text
index: Text

View File

@ -1,18 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
"""
{{namespace}}.schema:
The MarkLang file for the schema of this project.
You can execute `kag schema commit` to commit your schema to SPG server.
"""

View File

@ -1,14 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
"""
Place the files to be used for building the index in this directory.
"""

View File

@ -1,14 +0,0 @@
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
"""
Place the prompts to be used for solving problems in this directory.
"""

View File

@ -1,115 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
# flake8: noqa
import asyncio
import json
from typing import List
from kag.interface import (
ExecutorABC,
LLMClient,
Task,
Context,
RetrieverOutputMerger,
RetrieverABC,
)
@ExecutorABC.register("evidence_based_reasoner")
class EvidenceBasedReasoner(ExecutorABC):
    """Executor that answers one sub-question by retrieving evidence and reasoning with an LLM.

    For each task it fans retrieval out across all configured retrievers
    concurrently, merges the outputs, then prompts the LLM with the merged
    documents plus the already-answered parent sub-questions.
    """

    def __init__(
        self,
        llm: LLMClient,
        retrievers: List[RetrieverABC],
        merger: RetrieverOutputMerger,
    ):
        # llm: client used for the evidence-based reasoning call.
        # retrievers: queried concurrently for every task.
        # merger: combines per-retriever outputs into one ranked result.
        self.llm = llm
        self.retrievers = retrievers
        self.merger = merger

    async def ainvoke(self, query: str, task: Task, context: Context, **kwargs):
        """Retrieve evidence for *task*, call the LLM, and record the answer.

        Side effects: stores the merged retrieval output in task memory under
        ``"retriever"`` and sets ``task.result`` to a JSON string of the form
        ``{"query": ..., "response": ...}``. Returns the raw LLM response.
        """
        # Fan retrieval out concurrently over every configured retriever.
        retrieval_futures = [
            asyncio.create_task(retriever.ainvoke(task, **kwargs))
            for retriever in self.retrievers
        ]
        outputs = await asyncio.gather(*retrieval_futures)
        merged = await self.merger.ainvoke(task, outputs, **kwargs)

        retrieved_docs = "\n\n".join(chunk.content for chunk in merged.chunks)

        # Chinese few-shot prompt. The `{{i.output}}` placeholders refer to the
        # answer of sub-question i, as defined in the prompt's own preamble.
        # (Fixed: the second worked example previously said `{{0.oputput}}`.)
        system_instruction = """
作为解决复杂多跳问题的专家我需要您协助我解答一个多跳问题该问题被拆分成多个简单的单跳查询每个问题可能依赖于前面问题的答案也就是说问题正文可能包含诸如{{i.output}}之类的内容表示第i个子问题的答案我将为您提供一些关于如何解答这些初步问题或答案本身的见解这些见解对于准确解决问题至关重要此外我将提供与当前问题相关的文本摘录建议您仔细阅读并彻底理解您的回复请以思考开头概述逐步得出结论的思考过程最后以答案结尾以便清晰准确地给出答案无需任何额外的注释
召回文档
Sylvester
Sylvester 这个名字源自拉丁语形容词 silvestris意为树木繁茂的荒野的 silvestris 又源自名词 silva意为林地古典拉丁语将其拼写为 i在古典拉丁语中y 代表与 i 不同的独立发音这并非拉丁语的固有发音而是用于转录外来词的发音古典时期之后y 的发音开始为 i Sylv代替 Silv拼写的拼写可以追溯到古典时期之后
伊利诺伊州香槟县斯坦顿镇
斯坦顿镇是美国伊利诺伊州香槟县的一个镇区根据 2010 年人口普查其人口为 505 共有 202 个住房单元
纽约州蒙特贝罗
蒙特贝罗意大利语美丽的山峰是美国纽约州罗克兰县拉马波镇的一个建制村它位于萨弗恩以北希尔伯恩以东韦斯利山以南艾尔蒙特以西2010 年人口普查时人口为 4,526
埃里克·霍特
埃里克·霍特1987 2 16 日出生于纽约州蒙特贝罗是一名美国足球运动员目前是自由球员
问题
0公元 800 谁被加冕为西方皇帝
思考提供的关于查理曼大帝的一段文字表明他于 800 年被加冕为神圣罗马帝国皇帝答案查理曼大帝
1{{0.output}} 后来被称为什么
思考为了确定 {{0.output}}查理曼大帝后来被称为什么我需要复习一下提供的关于查理曼大帝的文章文章表明查理曼大帝也被称为查理大帝答案查理大帝
2 {{0.output}} 时代Sylvester 这个姓氏起源于什么语言
思考这个问题询问的是 {{0.output}} 查理曼大帝统治时期Sylvester 这个姓氏的起源当时正值中世纪早期关于 Sylvester 这个名字的文章指出它源于拉丁语答案拉丁语
"""
        query = f"{task.id}: {task.arguments['query']}"

        # Collect already-answered parent sub-questions so the LLM can resolve
        # `{{i.output}}` references in the current question.
        subqa = []
        for pt in task.parents:
            subq = f"{pt.id}: {pt.arguments['query']}"
            result = json.loads(pt.result)
            suba = str(result["response"])
            subqa.append(f"{subq}\n{suba}")
        subqa = "\n\n".join(subqa)

        request = f"{system_instruction}\nDocs:\n{retrieved_docs}\nQuestions:\n{subqa}\n{query}"
        response = await self.llm.acall(request)

        task.update_memory("retriever", merged)
        task.result = json.dumps(
            {"query": task.arguments["query"], "response": response}, ensure_ascii=False
        )
        return response

    def schema(self, func_name: str = None):
        """Return the tool schema that the planner uses to dispatch to this executor."""
        return {
            "name": "Retriever",
            "description": "Synthesizes precise, evidence-backed answers to user queries by analyzing provided contextual documents. Note: Contextual documents are pre-loaded and processed implicitly; no explicit context parameter is required.",
            "parameters": {
                "query": {
                    "type": "string",
                    "description": "User-provided query.",
                    "optional": False,
                },
            },
        }

View File

@ -1,76 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2023 OpenSPG Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
# flake8: noqa
import json
from kag.common.tools.algorithm_tool.rerank.rerank_by_vector import RerankByVector
from kag.interface import GeneratorABC, LLMClient
from kag.solver.executor.retriever.local_knowledge_base.kag_retriever.kag_hybrid_executor import (
to_reference_list,
)
@GeneratorABC.register("llm_generator_with_thought")
class LLMGeneratorWithThought(GeneratorABC):
    """Generator that composes a final answer from sub-question "thoughts" and reranked evidence.

    It collects every finished task's sub-query/response pair as a thought,
    reranks the chunks retrieved for those tasks against the original query,
    and prompts the LLM for a final answer terminated by an answer marker.
    """

    def __init__(
        self,
        llm_client: LLMClient,
        chunk_reranker: RerankByVector = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.llm_client = llm_client
        # Fall back to a default vector reranker when none is configured.
        self.chunk_reranker = chunk_reranker or RerankByVector.from_config(
            {
                "type": "rerank_by_vector",
            }
        )

    def invoke(self, query, context, **kwargs):
        """Generate the final answer for *query* from the tasks recorded in *context*.

        Raises:
            ValueError: if the LLM response contains no answer marker.
        """
        rerank_queries = []
        chunks = []
        thoughts = []
        for task in context.gen_task(False):
            # Each task result was stored as JSON {"query": ..., "response": ...}.
            task_result = json.loads(task.result)
            subq = task_result["query"]
            suba = task_result["response"]
            thoughts.append(f"Sub-Query: {subq}\n{suba}")
            retrieved_docs = task.memory.get("retriever")
            if retrieved_docs and self.chunk_reranker:
                rerank_queries.append(task.arguments["query"])
                chunks.append(retrieved_docs.chunks)

        # Rerank all retrieved chunks against the top-level query and render
        # them as a reference list for the prompt.
        rerank_chunks = self.chunk_reranker.invoke(query, rerank_queries, chunks)
        refer_data = to_reference_list(prefix_id=0, retrieved_datas=rerank_chunks)
        refer_data = [f"Title:{x['document_name']}\n{x['content']}" for x in refer_data]
        refer_data = "\n\n".join(refer_data)
        thoughts = "\n\n".join(thoughts)

        system_instruction = """
作为一名高级阅读理解助手你的任务是根据我提供的上下文回答复杂的多跳问题我提供的上下文包含两部分一组有助于回答问题的文档以及对问题的逐步分解和分析性思维过程请结合这两部分上下文来回答问题你的回答应从思考之后开始逐步系统地分解推理过程并说明你是如何得出结论的最后以答案结尾给出简洁明确的答案无需额外的阐述\n
注意
1. 我希望你的答案与召回文档完全一致
2. 如果您认为所提供的文件无法回答问题请回答未知
"""
        prompt = (
            f"{system_instruction}\n\n召回文档:\n{refer_data}\n思考:\n{thoughts}问题: {query}"
        )
        response = self.llm_client(prompt)

        # Accept both the half-width ("答案:") and full-width ("答案：") colon:
        # Chinese-language LLM output frequently uses the full-width form, and
        # matching only "答案:" made this method raise on valid answers.
        for marker in ("答案:", "答案："):
            if marker in response:
                return response.split(marker, 1)[1].strip()
        raise ValueError(f"no answer found in response: {response}")

Some files were not shown because too many files have changed in this diff Show More