mirror of
https://github.com/HKUDS/LightRAG.git
synced 2025-11-09 06:13:47 +00:00
fix: Deduplicate entities and relationships in a single chunk with multiple gleaning results during KG rebuild
This commit is contained in:
parent
70e154b0aa
commit
6b6d14bc3a
@ -284,6 +284,7 @@ async def _rebuild_knowledge_from_chunks(
|
||||
pipeline_status["history_messages"].append(status_message)
|
||||
|
||||
# Get cached extraction results for these chunks using storage
|
||||
# cached_results: chunk_id -> [list of extraction results from the LLM cache, sorted by created_at]
|
||||
cached_results = await _get_cached_extraction_results(
|
||||
llm_response_cache,
|
||||
all_referenced_chunk_ids,
|
||||
@ -309,6 +310,7 @@ async def _rebuild_knowledge_from_chunks(
|
||||
chunk_entities[chunk_id] = defaultdict(list)
|
||||
chunk_relationships[chunk_id] = defaultdict(list)
|
||||
|
||||
# process multiple LLM extraction results for a single chunk_id
|
||||
for extraction_result in extraction_results:
|
||||
entities, relationships = await _parse_extraction_result(
|
||||
text_chunks_storage=text_chunks_storage,
|
||||
@ -317,9 +319,20 @@ async def _rebuild_knowledge_from_chunks(
|
||||
)
|
||||
|
||||
# Merge entities and relationships from this extraction result
|
||||
# Only keep the first occurrence of each entity_name in the same chunk_id
|
||||
for entity_name, entity_list in entities.items():
|
||||
if (
|
||||
entity_name not in chunk_entities[chunk_id]
|
||||
or len(chunk_entities[chunk_id][entity_name]) == 0
|
||||
):
|
||||
chunk_entities[chunk_id][entity_name].extend(entity_list)
|
||||
|
||||
# Only keep the first occurrence of each rel_key in the same chunk_id
|
||||
for rel_key, rel_list in relationships.items():
|
||||
if (
|
||||
rel_key not in chunk_relationships[chunk_id]
|
||||
or len(chunk_relationships[chunk_id][rel_key]) == 0
|
||||
):
|
||||
chunk_relationships[chunk_id][rel_key].extend(rel_list)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user