# KAG/tests/unit/common/kag_utils.py

from kag.common.utils import extract_tag_content


def run_extra_tag():
    """Run the table-driven checks for ``extract_tag_content``.

    Every case pairs an input string with the list of ``(tag, content)``
    tuples that ``extract_tag_content`` is expected to return for it; text
    outside any tag is expected under the empty tag name ``""``.

    Cases run in order. A passing case prints a one-line confirmation; the
    first mismatch raises ``AssertionError`` with the actual and expected
    values in the message.
    """
    test_cases = [
        # Closed tags mixed with untagged text.
        dict(
            input="<tag1>abced</tag1>some word<tag2>other tags</tag2>",
            expected=[("tag1", "abced"), ("", "some word"), ("tag2", "other tags")],
            description="基本闭合标签与无标签文本混合",
        ),
        # A mix of closed and unclosed tags.
        dict(
            input="<p>Hello <b>world</b> this is <i>test</i>",
            expected=[("p", "Hello "), ("b", "world"), ("", " this is "), ("i", "test")],
            description="混合闭合与未闭合标签",
        ),
        # Plain text with no tags at all.
        dict(
            input="plain text without any tags",
            expected=[("", "plain text without any tags")],
            description="纯文本无标签",
        ),
        # Multi-line content; inner markup and whitespace are kept verbatim.
        dict(
            input="<div>\n    Line 1\n    <span>Line 2</span>\n    Line 3\n</div>",
            expected=[("div", "\n    Line 1\n    <span>Line 2</span>\n    Line 3\n")],
            description="多行内容和空白处理",
        ),
        # Several closed tags back to back.
        dict(
            input="<a>A</a><b>B</b><c>C</c>",
            expected=[("a", "A"), ("b", "B"), ("c", "C")],
            description="连续多个闭合标签",
        ),
        # The last tag is never closed and runs to end of input.
        dict(
            input="<title>My Document</title><content>This is the content",
            expected=[("title", "My Document"), ("content", "This is the content")],
            description="未闭合标签（EOF结尾）",
        ),
        # Content containing special characters.
        dict(
            input="<log>Error: &*^%$#@!;</log><note>End of log</note>",
            expected=[("log", "Error: &*^%$#@!;"), ("note", "End of log")],
            description="含特殊字符的内容",
        ),
        # Empty input string.
        dict(
            input="",
            expected=[],
            description="空字符串输入",
        ),
    ]

    # Case numbers in the messages are 1-based.
    for number, case in enumerate(test_cases, start=1):
        expected = case["expected"]
        description = case["description"]
        result = extract_tag_content(case["input"])
        assert (
            result == expected
        ), f"Test {number} failed: {description}\nGot: {result}\nExpected: {expected}"
        print(f"Test {number} passed: {description}")


# Run the checks when this file is executed directly as a script.
if __name__ == "__main__":
    run_extra_tag()
-												feat(solver): support kag thinker (#640)

* feat(kag): update to v0.7 (#456)

* add think cost

* update csv scanner

* add final rerank

* add reasoner

* add iterative planner

* fix dpr search

* fix dpr search

* add reference data

* move odps import

* update requirement.txt

* update 2wiki

* add missing file

* fix markdown reader

* add iterative planning

* update version

* update runner

* update 2wiki example

* update bridge

* merge solver and solver_new

* add cur day

* writer delete

* update multi process

* add missing files

* fix report

* add chunk retrieved executor

* update try in stream runner result

* add path

* add math executor

* update hotpotqa example

* remove log

* fix python coder solver

* update hotpotqa example

* fix python coder solver

* update config

* fix bad

* add log

* remove unused code

* commit with task thought

* move kag model to common

* add default chat llm

* fix

* use static planner

* support chunk graph node

* add args

* support naive rag

* llm client support tool calls

* add default async

* add openai

* fix result

* fix markdown reader

* fix thinker

* update asyncio interface

* feat(solver): add mcp support (#444)

* 上传mcp client相关代码

* 1、完成一套mcp client的调用，从pipeline到planner、executor
2、允许json中传入多个mcp_server，通过大模型进行调用并选择
3、调通baidu_map_mcp的使用

* 1、schema

* bugfix:删减冗余代码

---------

Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com>

* fix affairqa after solver refactor

* fix affairqa after solver refactor

* fix readme

* add params

* update version

* update mcp executor

* update mcp executor

* solver add mcp executor

* add missing file

* add mcp executor

* add executor

* x

* update

* fix requirement

* fix main llm config

* fix solver

* bugfix:修复invoke函数调用逻辑

* chg eva

* update example

* add kag layer

* add step task

* support dot refresh

* support dot refresh

* support dot refresh

* support dot refresh

* add retrieved num

* add retrieved num

* add pipelineconf

* update ppr

* update musique prompts

* update

* add to_dict for BuilderComponentData

* async build

* add deduce prompt

* add deduce prompt

* add deduce prompt

* fix reader

* add deduce prompt

* add page thinker report

* modify prompt

* add step status

* add self cognition

* add self cognition

* add memory graph storage

* add now time

* update memory config

* add now time

* chg graph loader

* 添加prqa数据集和代码

* bugfix:prqa调用逻辑修复

* optimize：优化代码逻辑，生成答案规范化

* add retry py code

* update memory graph

* update memory graph

* fix

* fix ner

* add with_out_refer generator prompt

* fix

* close ckpt

* fix query

* fix query

* update version

* add llm checker

* add llm checker

* 1、上传evalutor.py以及修改gold_answer.json格式
2、优化代码逻辑
3、修改README.md文件

* update exp

* update exp

* rerank support

* add static rewrite query

* recall more chunks

* fix graph load

* add static rewrite query

* fix bugs

* add finish check

* add finish check

* add finish check

* add finish check

* 1、上传evalutor.py的结果
2、优化代码逻辑，优化readme文件

* add lf retry

* add memory graph api

* fix reader api

* add ner

* add metrics

* fix bug

* remove ner

* add reraise for retry

* add edge prop to memory graph

* add memory graph

* 1、评测数据集结果修正
2、优化evaluator.py代码
3、删除结果不存在而gold_answer中有答案的问题

* 删除评测结果文件

* fix knext host addr

* async eva

* add lf prompt

* add lf prompt

* add config

* add retry

* add unknown check

* add rc result

* add rc result

* add rc result

* add rc result

* 依据kag pipeline格式修改代码逻辑并通过测试

* bugfix:删除冗余代码

* fix report prompt

* bugfix:触发重试机制

* bugfix:中文符号错误

* fix rethinker prompt

* update version to 0.6.2b78

* update version

* 1、修改evaluator.py，通过大模型计算准确率，符合最新调用逻辑
2、修改prompt，让没有回答的结果重复测试

* update affairqa for evaluate

* update affairqa for evaluate

* bugfix:修正数据集

* bugfix:修正数据集

* bugfix:修正数据集

* fix name conflict

* bugfix:删除错误问题

* bugfix:文件名命名错误导致evaluator失败

* update for affairqa eval

* bugfix:修改代码保持evaluate逻辑一致

* x

* update for affairqa readme

* remove temp eval scripts

* bugfix for math deduce

* merge 0.6.2_dev

* merge 0.6.2_dev

* fix

* update client addr

* updated version

* update for affairqa eval

* evaUtils 支持中文

* fix affairqa eval:

* remove unused example

* update kag config

* fix default value

* update readme

* fix init

* 注释信息修改，并添加部分class说明

* update example config

* Tc 0.7.0 (#459)

* 提交affairQA 代码

* fix affairqa eval

---------

Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>

* fix all examples

* reformat

---------

Co-authored-by: peilong <peilong.zpl@antgroup.com>
Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com>
Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com>
Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>

* update chunk metadata

* update chunk metadata

* add debug reporter

* update table text

* add server

* fix math executor

* update api-key for openai vec

* update

* fix naive rag bug

* format code

* fix

---------

Co-authored-by: zhuzhongshu123 <152354526+zhuzhongshu123@users.noreply.github.com>
Co-authored-by: 锦呈 <zhangxinhong.zxh@antgroup.com>
Co-authored-by: wanxingyu.wxy <wanxingyu.wxy@antgroup.com>
Co-authored-by: zhengke.gzk <zhengke.gzk@antgroup.com>
											
										
										
											2025-07-08 17:44:32 +08:00
+								from kag.common.utils import extract_tag_content
 								def run_extra_tag():
 								    """Check ``extract_tag_content`` against a table of inputs.
 								
 								    Each case holds an input string, the expected list of
 								    ``(tag, content)`` tuples, and a description used in the messages.
 								    Prints a line per passing case and raises ``AssertionError`` on the
 								    first case whose result differs from the expected list.
 								    """
 								    test_cases = [
 								        {
 								            "input": "<tag1>abced</tag1>some word<tag2>other tags</tag2>",
 								            "expected": [("tag1", "abced"), ("", "some word"), ("tag2", "other tags")],
 								            "description": "基本闭合标签与无标签文本混合",
 								        },
 								        {
 								            "input": "<p>Hello <b>world</b> this is <i>test</i>",
 								            "expected": [
 								                ("p", "Hello "),
 								                ("b", "world"),
 								                ("", " this is "),
 								                ("i", "test"),
 								            ],
 								            "description": "混合闭合与未闭合标签",
 								        },
 								        {
 								            "input": "plain text without any tags",
 								            "expected": [("", "plain text without any tags")],
 								            "description": "纯文本无标签",
 								        },
 								        {
 								            "input": "<div>\n    Line 1\n    <span>Line 2</span>\n    Line 3\n</div>",
 								            "expected": [
 								                ("div", "\n    Line 1\n    <span>Line 2</span>\n    Line 3\n")
 								            ],
 								            "description": "多行内容和空白处理",
 								        },
 								        {
 								            "input": "<a>A</a><b>B</b><c>C</c>",
 								            "expected": [("a", "A"), ("b", "B"), ("c", "C")],
 								            "description": "连续多个闭合标签",
 								        },
 								        {
 								            "input": "<title>My Document</title><content>This is the content",
 								            "expected": [("title", "My Document"), ("content", "This is the content")],
 								            "description": "未闭合标签（EOF结尾）",
 								        },
 								        {
 								            "input": "<log>Error: &*^%$#@!;</log><note>End of log</note>",
 								            "expected": [("log", "Error: &*^%$#@!;"), ("note", "End of log")],
 								            "description": "含特殊字符的内容",
 								        },
 								        {
 								            "input": "",
 								            "expected": [],
 								            "description": "空字符串输入",
 								        },
 								    ]
 								    # Messages use 1-based case numbers, hence ``i+1``.
 								    for i, test in enumerate(test_cases):
 								        result = extract_tag_content(test["input"])
 								        assert (
 								            result == test["expected"]
 								        ), f"Test {i+1} failed: {test['description']}\nGot: {result}\nExpected: {test['expected']}"
 								        print(f"Test {i+1} passed: {test['description']}")
 								# Run the checks when this file is executed directly as a script.
 								if __name__ == "__main__":
 								    run_extra_tag()