From 6cab68bb47fb0d1ff60ca23f788b6efad4ab17ee Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 10:09:44 +0800 Subject: [PATCH] Improve KG chunk selection documentation and configuration clarity --- env.example | 16 ++++++++++++---- lightrag/operate.py | 8 ++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/env.example b/env.example index 2cc97d55..39d6da68 100644 --- a/env.example +++ b/env.example @@ -71,12 +71,20 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=10000 ### control the maximum tokens send to LLM (include entities, raltions and chunks) # MAX_TOTAL_TOKENS=30000 -### chunk selection strategies for KG: WEIGHT or VECTOR -KG_CHUNK_PICK_METHOD=VECTOR -### maximum number of related chunks per source entity or relation (higher values increase re-ranking time) + +### maximum number of related chunks per source entity or relation +### The chunk picker uses this value to determine the total number of chunks selected from the KG (knowledge graph) +### Higher values increase re-ranking time # RELATED_CHUNK_NUMBER=5 -### Reranker configuration (Set ENABLE_RERANK to true in reranking model is configed) +### chunk selection strategies +### VECTOR: Pick KG chunks by vector similarity, delivering chunks to the LLM that align more closely with naive retrieval +### WEIGHT: Pick KG chunks by entity and chunk weight, delivering chunks to the LLM that are more strictly KG-related +### If reranking is enabled, the impact of the chunk selection strategy will be diminished. 
+# KG_CHUNK_PICK_METHOD=VECTOR + +### Reranking configuration +### Set ENABLE_RERANK to true if a reranking model is configured # ENABLE_RERANK=True ### Minimum rerank score for document chunk exclusion (set to 0.0 to keep all chunks, 0.6 or above if LLM is not strong enought) # MIN_RERANK_SCORE=0.0 diff --git a/lightrag/operate.py b/lightrag/operate.py index 1096f28d..c9be89f7 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2779,9 +2779,12 @@ async def _find_related_text_unit_from_entities( entity_info["sorted_chunks"] = sorted_chunks total_entity_chunks += len(sorted_chunks) - # Step 4: Apply the selected chunk selection algorithm selected_chunk_ids = [] # Initialize to avoid UnboundLocalError + # Step 4: Apply the selected chunk selection algorithm + # Pick by vector similarity: + # The ordering of text chunks aligns with that of naive retrieval. + # When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval. if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb: num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2) @@ -2822,7 +2825,8 @@ async def _find_related_text_unit_from_entities( kg_chunk_pick_method = "WEIGHT" if kg_chunk_pick_method == "WEIGHT": - # Apply linear gradient weighted polling algorithm + # Pick by entity and chunk weight: + # When reranking is disabled, delivers chunks to the LLM that are more strictly KG-related selected_chunk_ids = pick_by_weighted_polling( entities_with_chunks, max_related_chunks, min_related_chunks=1 )