import logging
from datetime import datetime
from time import time
from typing import Any

from core.llm_client import LLMClient
from core.nodes import EntityNode, EpisodicNode
from core.prompts import prompt_library
# Module-level logger; configuration is inherited from the application root.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
async def extract_new_nodes(
    llm_client: LLMClient,
    episode: EpisodicNode,
    relevant_schema: dict[str, Any],
    previous_episodes: list[EpisodicNode],
) -> list[EntityNode]:
    """Extract entities from an episode that are not already in the schema.

    Sends the episode content (plus prior episodes for conversational
    context and the already-known nodes) to the LLM, then materializes any
    genuinely new entities as EntityNode objects.

    Args:
        llm_client: Client used to call the LLM.
        episode: Episode whose content is mined for entities.
        relevant_schema: Mapping whose 'nodes' entry is a dict of
            node name -> {'label': ..., 'uuid': ...}.
        previous_episodes: Earlier episodes supplied as context.

    Returns:
        EntityNode objects for entities not present in ``relevant_schema``;
        names that already exist are skipped (and logged).
    """
    # Prepare context for LLM
    existing_nodes = [
        {'name': node_name, 'label': node_info['label'], 'uuid': node_info['uuid']}
        for node_name, node_info in relevant_schema['nodes'].items()
    ]

    context = {
        'episode_content': episode.content,
        'episode_timestamp': (episode.valid_at.isoformat() if episode.valid_at else None),
        'existing_nodes': existing_nodes,
        'previous_episodes': [
            {
                'content': ep.content,
                'timestamp': ep.valid_at.isoformat() if ep.valid_at else None,
            }
            for ep in previous_episodes
        ],
    }

    llm_response = await llm_client.generate_response(prompt_library.extract_nodes.v1(context))
    new_nodes_data = llm_response.get('new_nodes', [])

    logger.info(f'Extracted new nodes: {new_nodes_data}')

    # Build the name set once so each membership test below is O(1)
    # instead of scanning the existing-node list per extracted node.
    existing_names = {existing_node['name'] for existing_node in existing_nodes}

    # Convert the extracted data into EntityNode objects
    new_nodes = []
    for node_data in new_nodes_data:
        # Check if the node already exists
        if node_data['name'] not in existing_names:
            new_node = EntityNode(
                name=node_data['name'],
                labels=node_data['labels'],
                summary=node_data['summary'],
                created_at=datetime.now(),
            )
            new_nodes.append(new_node)
            logger.info(f'Created new node: {new_node.name} (UUID: {new_node.uuid})')
        else:
            logger.info(f"Node {node_data['name']} already exists, skipping creation.")

    return new_nodes
|
2024-08-18 13:22:31 -04:00
|
|
|
|
|
|
|
|
|
|
|
async def extract_nodes(
    llm_client: LLMClient,
    episode: EpisodicNode,
    previous_episodes: list[EpisodicNode],
) -> list[EntityNode]:
    """Ask the LLM for every entity mentioned in an episode.

    Unlike ``extract_new_nodes`` no pre-existing schema is consulted:
    each entity the model reports becomes a fresh EntityNode.

    Args:
        llm_client: Client used to call the LLM.
        episode: Episode whose content is mined for entities.
        previous_episodes: Earlier episodes supplied as context.

    Returns:
        One EntityNode per entity reported by the model.
    """
    start = time()

    # Assemble the prompt context from the episode and its predecessors.
    prior_context = [
        {
            'content': prior.content,
            'timestamp': prior.valid_at.isoformat() if prior.valid_at else None,
        }
        for prior in previous_episodes
    ]
    context = {
        'episode_content': episode.content,
        'episode_timestamp': (episode.valid_at.isoformat() if episode.valid_at else None),
        'previous_episodes': prior_context,
    }

    llm_response = await llm_client.generate_response(prompt_library.extract_nodes.v3(context))
    new_nodes_data = llm_response.get('new_nodes', [])

    end = time()
    logger.info(f'Extracted new nodes: {new_nodes_data} in {(end - start) * 1000} ms')

    # Materialize each reported entity as an EntityNode.
    new_nodes = []
    for node_data in new_nodes_data:
        entity = EntityNode(
            name=node_data['name'],
            labels=node_data['labels'],
            summary=node_data['summary'],
            created_at=datetime.now(),
        )
        new_nodes.append(entity)
        logger.info(f'Created new node: {entity.name} (UUID: {entity.uuid})')

    return new_nodes
|
2024-08-18 13:22:31 -04:00
|
|
|
|
|
|
|
|
|
|
|
async def dedupe_extracted_nodes(
    llm_client: LLMClient,
    extracted_nodes: list[EntityNode],
    existing_nodes: list[EntityNode],
) -> tuple[list[EntityNode], dict[str, str]]:
    """Resolve freshly extracted nodes against nodes already in the graph.

    Asks the LLM which extracted nodes duplicate existing ones, then
    replaces each duplicate with its canonical existing node.

    Args:
        llm_client: Client used to call the LLM.
        extracted_nodes: Candidate nodes produced by extraction.
        existing_nodes: Nodes already present in the graph.

    Returns:
        A tuple of (resolved node list, uuid_map) where uuid_map sends a
        duplicate extracted node's uuid to the uuid of the existing node
        it duplicates.
    """
    start = time()

    # Name-keyed lookups for each side. The LLM reports duplicates by
    # name: 'name' refers to an extracted node, 'duplicate_of' to an
    # existing node, so two separate maps are required.
    # BUG FIX: the previous version looked both names up in a single map
    # built only from existing_nodes, and later indexed that name-keyed
    # map with a uuid — any reported duplicate raised KeyError.
    extracted_by_name = {node.name: node for node in extracted_nodes}
    existing_by_name = {node.name: node for node in existing_nodes}
    existing_by_uuid = {node.uuid: node for node in existing_nodes}

    # Prepare context for LLM
    existing_nodes_context = [
        {'name': node.name, 'summary': node.summary} for node in existing_nodes
    ]

    extracted_nodes_context = [
        {'name': node.name, 'summary': node.summary} for node in extracted_nodes
    ]

    context = {
        'existing_nodes': existing_nodes_context,
        'extracted_nodes': extracted_nodes_context,
    }

    llm_response = await llm_client.generate_response(prompt_library.dedupe_nodes.v2(context))

    duplicate_data = llm_response.get('duplicates', [])

    end = time()
    logger.info(f'Deduplicated nodes: {duplicate_data} in {(end - start) * 1000} ms')

    # Map duplicate extracted uuid -> canonical existing uuid.
    uuid_map: dict[str, str] = {}
    for duplicate in duplicate_data:
        extracted = extracted_by_name.get(duplicate['name'])
        canonical = existing_by_name.get(duplicate['duplicate_of'])
        if extracted is None or canonical is None:
            # LLM output is untrusted; skip names we cannot resolve
            # instead of crashing the whole dedupe pass.
            logger.warning(f'Could not resolve duplicate pair: {duplicate}')
            continue
        uuid_map[extracted.uuid] = canonical.uuid

    # Swap each duplicate for its canonical existing node.
    nodes = []
    for node in extracted_nodes:
        if node.uuid in uuid_map:
            nodes.append(existing_by_uuid[uuid_map[node.uuid]])
            continue
        nodes.append(node)

    return nodes, uuid_map
|
2024-08-21 12:03:32 -04:00
|
|
|
|
|
|
|
|
|
|
|
async def dedupe_node_list(
    llm_client: LLMClient,
    nodes: list[EntityNode],
) -> tuple[list[EntityNode], dict[str, str]]:
    """Collapse duplicates within a single list of nodes via the LLM.

    The model groups node names it considers the same entity; the first
    name in each group is kept as canonical and the rest are mapped to it.

    Args:
        llm_client: Client used to call the LLM.
        nodes: Candidate nodes to deduplicate against each other.

    Returns:
        A tuple of (unique node list, uuid_map) where uuid_map sends each
        duplicate node's uuid to its group's canonical uuid.
    """
    start = time()

    # Name-keyed lookup for resolving the LLM's grouped names.
    node_map = {node.name: node for node in nodes}

    # Prepare context for LLM
    context = {
        'nodes': [{'name': node.name, 'summary': node.summary} for node in nodes],
    }

    llm_response = await llm_client.generate_response(
        prompt_library.dedupe_nodes.node_list(context)
    )

    nodes_data = llm_response.get('nodes', [])

    end = time()
    logger.info(f'Deduplicated nodes: {nodes_data} in {(end - start) * 1000} ms')

    # Keep one canonical node per group; map every alias uuid onto it.
    unique_nodes = []
    uuid_map: dict[str, str] = {}
    for group in nodes_data:
        names = group['names']
        canonical = node_map[names[0]]
        unique_nodes.append(canonical)

        for alias in names[1:]:
            uuid_map[node_map[alias].uuid] = canonical.uuid

    return unique_nodes, uuid_map
|