From 6cab68bb47fb0d1ff60ca23f788b6efad4ab17ee Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 15 Aug 2025 10:09:44 +0800 Subject: [PATCH] Improve KG chunk selection documentation and configuration clarity --- env.example | 16 ++++++++++++---- lightrag/operate.py | 8 ++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/env.example b/env.example index 2cc97d55..39d6da68 100644 --- a/env.example +++ b/env.example @@ -71,12 +71,20 @@ ENABLE_LLM_CACHE=true # MAX_RELATION_TOKENS=10000 ### control the maximum tokens send to LLM (include entities, raltions and chunks) # MAX_TOTAL_TOKENS=30000 -### chunk selection strategies for KG: WEIGHT or VECTOR -KG_CHUNK_PICK_METHOD=VECTOR -### maximum number of related chunks per source entity or relation (higher values increase re-ranking time) + +### maximum number of related chunks per source entity or relation +### The chunk picker uses this value to determine the total number of chunks selected from the KG (knowledge graph) +### Higher values increase re-ranking time # RELATED_CHUNK_NUMBER=5 -### Reranker configuration (Set ENABLE_RERANK to true in reranking model is configed) +### chunk selection strategies +### VECTOR: Pick KG chunks by vector similarity, delivering chunks to the LLM that align more closely with naive retrieval +### WEIGHT: Pick KG chunks by entity and chunk weight, delivering chunks to the LLM that are more strictly KG-related +### If reranking is enabled, the impact of the chunk selection strategy will be diminished. 
+# KG_CHUNK_PICK_METHOD=VECTOR + +### Reranking configuration +### Set ENABLE_RERANK to true if a reranking model is configured # ENABLE_RERANK=True ### Minimum rerank score for document chunk exclusion (set to 0.0 to keep all chunks, 0.6 or above if LLM is not strong enought) # MIN_RERANK_SCORE=0.0 diff --git a/lightrag/operate.py b/lightrag/operate.py index 1096f28d..c9be89f7 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -2779,9 +2779,12 @@ async def _find_related_text_unit_from_entities( entity_info["sorted_chunks"] = sorted_chunks total_entity_chunks += len(sorted_chunks) - # Step 4: Apply the selected chunk selection algorithm selected_chunk_ids = [] # Initialize to avoid UnboundLocalError + # Step 4: Apply the selected chunk selection algorithm + # Pick by vector similarity: + # The ordering of text chunks aligns with that of naive retrieval. + # When reranking is disabled, the text chunks delivered to the LLM tend to favor naive retrieval. if kg_chunk_pick_method == "VECTOR" and query and chunks_vdb: num_of_chunks = int(max_related_chunks * len(entities_with_chunks) / 2) @@ -2822,7 +2825,8 @@ async def _find_related_text_unit_from_entities( kg_chunk_pick_method = "WEIGHT" if kg_chunk_pick_method == "WEIGHT": - # Apply linear gradient weighted polling algorithm + # Pick by entity and chunk weight: + # When reranking is disabled, delivers chunks to the LLM that are more strictly KG-related selected_chunk_ids = pick_by_weighted_polling( entities_with_chunks, max_related_chunks, min_related_chunks=1 )