Perf: skip redundant attribute check in tidy_graph (#8121)

### What problem does this PR solve?
Support skipping the attribute check in `tidy_graph` when the upstream caller
has already ensured that the required attributes are present.

### Type of change
- [X] Performance Improvement
This commit is contained in:
Stephen Hu 2025-06-09 11:44:13 +08:00 committed by GitHub
parent ad1f89fea0
commit 2337bbf6ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 15 additions and 13 deletions

View File

@ -166,7 +166,7 @@ async def generate_subgraph(
)
if ignored_rels:
callback(msg=f"ignored {ignored_rels} relations due to missing entities.")
tidy_graph(subgraph, callback)
tidy_graph(subgraph, callback, check_attribute=False)
subgraph.graph["source_id"] = [doc_id]
chunk = {

View File

@ -157,30 +157,32 @@ def set_tags_to_cache(kb_ids, tags):
k = hasher.hexdigest()
REDIS_CONN.set(k, json.dumps(tags).encode("utf-8"), 600)
def tidy_graph(graph: nx.Graph, callback):
def tidy_graph(graph: nx.Graph, callback, check_attribute: bool = True):
"""
Ensure all nodes and edges in the graph have some essential attribute.
"""
def is_valid_node(node_attrs: dict) -> bool:
def is_valid_item(node_attrs: dict) -> bool:
valid_node = True
for attr in ["description", "source_id"]:
if attr not in node_attrs:
valid_node = False
break
return valid_node
purged_nodes = []
for node, node_attrs in graph.nodes(data=True):
if not is_valid_node(node_attrs):
purged_nodes.append(node)
for node in purged_nodes:
graph.remove_node(node)
if purged_nodes and callback:
callback(msg=f"Purged {len(purged_nodes)} nodes from graph due to missing essential attributes.")
if check_attribute:
purged_nodes = []
for node, node_attrs in graph.nodes(data=True):
if not is_valid_item(node_attrs):
purged_nodes.append(node)
for node in purged_nodes:
graph.remove_node(node)
if purged_nodes and callback:
callback(msg=f"Purged {len(purged_nodes)} nodes from graph due to missing essential attributes.")
purged_edges = []
for source, target, attr in graph.edges(data=True):
if not is_valid_node(attr):
purged_edges.append((source, target))
if check_attribute:
if not is_valid_item(attr):
purged_edges.append((source, target))
if "keywords" not in attr:
attr["keywords"] = []
for source, target in purged_edges: