From ccc2a200712359869d165ebcae29bd4e6ceef3fe Mon Sep 17 00:00:00 2001
From: yangdx
Date: Tue, 15 Jul 2025 12:26:33 +0800
Subject: [PATCH] feat: remove deprecated MAX_TOKEN_SUMMARY parameter to
 prevent LLM output truncation

- Remove the MAX_TOKEN_SUMMARY parameter and its related configuration
- Eliminate forced token-based truncation of entity/relationship descriptions
- Switch to fragment-count-based summarization driven by FORCE_LLM_SUMMARY_ON_MERGE
- Lower the FORCE_LLM_SUMMARY_ON_MERGE default from 6 to 4 so summaries trigger earlier
- Clean up documentation, environment examples, and API display code
- Preserve backward compatibility through graceful parameter removal

This change resolves issues where entity and relationship descriptions were
forcibly truncated mid-sentence, leaving incomplete and potentially inaccurate
knowledge graph content. The new approach lets the LLM generate complete
descriptions while still summarizing when multiple fragments need to be merged.

Breaking Change: None - the parameter removal is backward compatible

Fixes: entity/relationship description truncation issues
---
 README-zh.md              | 1 -
 README.md                 | 1 -
 env.example               | 2 --
 lightrag/api/utils_api.py | 5 -----
 lightrag/constants.py     | 3 +--
 lightrag/lightrag.py      | 5 -----
 lightrag/operate.py       | 1 -
 7 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/README-zh.md b/README-zh.md
index 8b377e0e..9f7f314e 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -242,7 +242,6 @@ if __name__ == "__main__":
 | **tokenizer** | `Tokenizer` | 用于将文本转换为 tokens(数字)以及使用遵循 TokenizerInterface 协议的 .encode() 和 .decode() 函数将 tokens 转换回文本的函数。 如果您不指定，它将使用默认的 Tiktoken tokenizer。 | `TiktokenTokenizer` |
 | **tiktoken_model_name** | `str` | 如果您使用的是默认的 Tiktoken tokenizer，那么这是要使用的特定 Tiktoken 模型的名称。如果您提供自己的 tokenizer，则忽略此设置。 | `gpt-4o-mini` |
 | **entity_extract_max_gleaning** | `int` | 实体提取过程中的循环次数，附加历史消息 | `1` |
-| **entity_summary_to_max_tokens** | `int` | 每个实体摘要的最大令牌大小 | `500` |
 | **node_embedding_algorithm** | `str` | 节点嵌入算法(当前未使用) | `node2vec` |
 | **node2vec_params** | `dict` | 节点嵌入的参数 | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` |
 | **embedding_func** | `EmbeddingFunc` | 从文本生成嵌入向量的函数 | `openai_embed` |
diff --git a/README.md b/README.md
index 5d8a642f..fa2b5924 100644
--- a/README.md
+++ b/README.md
@@ -249,7 +249,6 @@ A full list of LightRAG init parameters:
 | **tokenizer** | `Tokenizer` | The function used to convert text into tokens (numbers) and back using .encode() and .decode() functions following the `TokenizerInterface` protocol. If you don't specify one, it will use the default Tiktoken tokenizer. | `TiktokenTokenizer` |
 | **tiktoken_model_name** | `str` | If you're using the default Tiktoken tokenizer, this is the name of the specific Tiktoken model to use. This setting is ignored if you provide your own tokenizer. | `gpt-4o-mini` |
 | **entity_extract_max_gleaning** | `int` | Number of loops in the entity extraction process, appending history messages | `1` |
-| **entity_summary_to_max_tokens** | `int` | Maximum token size for each entity summary | `500` |
 | **node_embedding_algorithm** | `str` | Algorithm for node embedding (currently not used) | `node2vec` |
 | **node2vec_params** | `dict` | Parameters for node embedding | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` |
 | **embedding_func** | `EmbeddingFunc` | Function to generate embedding vectors from text | `openai_embed` |
diff --git a/env.example b/env.example
index f8f6d614..828c6d24 100644
--- a/env.example
+++ b/env.example
@@ -72,8 +72,6 @@ OLLAMA_EMULATING_MODEL_TAG=latest
 SUMMARY_LANGUAGE=English
 ### Number of duplicated entities/edges to trigger LLM re-summary on merge (at least 3 is recommended)
 # FORCE_LLM_SUMMARY_ON_MERGE=6
-### Max tokens for entity/relations description after merge
-# MAX_TOKEN_SUMMARY=500
 ### Maximum number of entity extraction attempts for ambiguous content
 # MAX_GLEANING=1

diff --git a/lightrag/api/utils_api.py b/lightrag/api/utils_api.py
index a724069d..b7099bb3 100644
--- a/lightrag/api/utils_api.py
+++ b/lightrag/api/utils_api.py
@@ -10,7 +10,6 @@ from ascii_colors import ASCIIColors
 from lightrag.api import __api_version__ as api_version
 from lightrag import __version__ as core_version
 from lightrag.constants import (
-    DEFAULT_MAX_TOKEN_SUMMARY,
     DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
 )
 from fastapi import HTTPException, Security, Request, status
@@ -280,10 +279,6 @@ def display_splash_screen(args: argparse.Namespace) -> None:
     ASCIIColors.white("    ├─ Top-K: ", end="")
     ASCIIColors.yellow(f"{args.top_k}")
-    ASCIIColors.white("    ├─ Max Token Summary: ", end="")
-    ASCIIColors.yellow(
-        f"{get_env_value('MAX_TOKEN_SUMMARY', DEFAULT_MAX_TOKEN_SUMMARY, int)}"
-    )
     ASCIIColors.white("    └─ Force LLM Summary on Merge: ", end="")
     ASCIIColors.yellow(
         f"{get_env_value('FORCE_LLM_SUMMARY_ON_MERGE', DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int)}"
     )
diff --git a/lightrag/constants.py b/lightrag/constants.py
index 82451a36..c3fd6531 100644
--- a/lightrag/constants.py
+++ b/lightrag/constants.py
@@ -8,8 +8,7 @@ consistency and makes maintenance easier.
 """

 # Default values for environment variables
 DEFAULT_MAX_GLEANING = 1
-DEFAULT_MAX_TOKEN_SUMMARY = 500
-DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 6
+DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 4
 DEFAULT_WOKERS = 2
 DEFAULT_TIMEOUT = 150
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index b6cca32a..6ee61e2d 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -23,7 +23,6 @@ from typing import (
 )
 from lightrag.constants import (
     DEFAULT_MAX_GLEANING,
-    DEFAULT_MAX_TOKEN_SUMMARY,
     DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE,
 )
 from lightrag.utils import get_env_value
@@ -134,10 +133,6 @@ class LightRAG:
     )
     """Maximum number of entity extraction attempts for ambiguous content."""

-    summary_to_max_tokens: int = field(
-        default=get_env_value("MAX_TOKEN_SUMMARY", DEFAULT_MAX_TOKEN_SUMMARY, int)
-    )
-
     force_llm_summary_on_merge: int = field(
         default=get_env_value(
             "FORCE_LLM_SUMMARY_ON_MERGE", DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE, int
diff --git a/lightrag/operate.py b/lightrag/operate.py
index 49de3c71..4bf579d1 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -118,7 +118,6 @@ async def _handle_entity_relation_summary(
     tokenizer: Tokenizer = global_config["tokenizer"]
     llm_max_tokens = global_config["llm_model_max_token_size"]
-    # summary_max_tokens = global_config["summary_to_max_tokens"]
     language = global_config["addon_params"].get(
         "language", PROMPTS["DEFAULT_LANGUAGE"]
     )
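
Reviewer note: the fragment-count trigger that replaces the old token cap can be
pictured with a short, self-contained sketch. This is a minimal illustration of
the technique, not LightRAG's actual implementation: merge_descriptions and
summarize_with_llm are assumed names, and GRAPH_FIELD_SEP is assumed to be the
separator between stored description fragments; only FORCE_LLM_SUMMARY_ON_MERGE
and force_llm_summary_on_merge come from the diff above.

import os

GRAPH_FIELD_SEP = "<SEP>"  # assumed separator between merged description fragments


def merge_descriptions(existing: str, incoming: list[str], summarize_with_llm) -> str:
    """Merge description fragments for one entity/relation (illustrative only)."""
    fragments = existing.split(GRAPH_FIELD_SEP) if existing else []
    fragments.extend(incoming)

    # Old behavior: cut the merged text at MAX_TOKEN_SUMMARY tokens, which could
    # stop a description mid-sentence. New behavior: re-summarize with the LLM
    # only once enough fragments have accumulated.
    threshold = int(os.getenv("FORCE_LLM_SUMMARY_ON_MERGE", "4"))
    if len(fragments) >= threshold:
        return summarize_with_llm(fragments)  # one complete, untruncated summary

    # Below the threshold, keep every fragment verbatim.
    return GRAPH_FIELD_SEP.join(fragments)


if __name__ == "__main__":
    stub_llm = lambda frags: " ".join(frags)  # stand-in for a real LLM call
    # Three existing fragments plus one new one reach the default threshold of 4,
    # so the stub "summarizer" runs instead of plain concatenation.
    print(merge_descriptions("A<SEP>B<SEP>C", ["D"], stub_llm))

The key design point: the decision depends only on how many fragments are being
merged, never on output length, so the LLM's summary is always stored in full.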