dedupe fixes (#35)

2025-06-27 02:00:02 +00:00 · 2024-08-23 18:06:42 -04:00 · 2024-08-23 18:06:42 -04:00 · 0d2942daea
commit 0d2942daea
parent 57aed456fa
4 changed files with 11 additions and 7 deletions
--- a/core/llm_client/openai_client.py
+++ b/core/llm_client/openai_client.py
@@ -47,7 +47,7 @@ class OpenAIClient(LLMClient):
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=openai_messages,
-                temperature=0.1,
+                temperature=0,
                max_tokens=3000,
                response_format={'type': 'json_object'},
            )
--- a/core/prompts/dedupe_edges.py
+++ b/core/prompts/dedupe_edges.py
@@ -54,8 +54,9 @@ def v1(context: dict[str, Any]) -> list[Message]:
        do not return it in the list of unique facts.

        Guidelines:
-        1. The facts do not have to be completely identical to be duplicates, 
-        they just need to have similar factual content
+        1. identical or near identical facts are duplicates
+        2. Facts are also duplicates if they are represented by similar sentences
+        3. Facts will often discuss the same or similar relation between identical entities

        Respond with a JSON object in the following format:
        {{
@@ -130,8 +131,10 @@ def edge_list(context: dict[str, Any]) -> list[Message]:
        If any facts in Facts is a duplicate of another fact, return a new fact with one of their uuid's.

        Guidelines:
-        1. The facts do not have to be completely identical to be duplicates, they just need to have similar content
-        2. The final list should have only unique facts. If 3 facts are all duplicates of each other, only one of their
+        1. identical or near identical facts are duplicates
+        2. Facts are also duplicates if they are represented by similar sentences
+        3. Facts will often discuss the same or similar relation between identical entities
+        4. The final list should have only unique facts. If 3 facts are all duplicates of each other, only one of their
            facts should be in the response

        Respond with a JSON object in the following format:
--- a/core/prompts/extract_edges.py
+++ b/core/prompts/extract_edges.py
@@ -122,7 +122,7 @@ def v2(context: dict[str, Any]) -> list[Message]:
                    "relation_type": "RELATION_TYPE_IN_CAPS",
                    "source_node_uuid": "uuid of the source entity node",
                    "target_node_uuid": "uuid of the target entity node",
-                    "fact": "Detailed description of the relationship",
+                    "fact": "brief description of the relationship",
                    "valid_at": "YYYY-MM-DDTHH:MM:SSZ or null if not explicitly mentioned",
                    "invalid_at": "YYYY-MM-DDTHH:MM:SSZ or null if ongoing or not explicitly mentioned"
                }}
--- a/core/prompts/extract_nodes.py
+++ b/core/prompts/extract_nodes.py
@@ -125,10 +125,11 @@ def v3(context: dict[str, Any]) -> list[Message]:
    sys_prompt = """You are an AI assistant that extracts entity nodes from conversational text. Your primary task is to identify and extract the speaker and other significant entities mentioned in the conversation."""

    user_prompt = f"""
-Given the following conversation, extract entity nodes that are explicitly or implicitly mentioned:
+Given the following conversation, extract entity nodes from the CURRENT MESSAGE that are explicitly or implicitly mentioned:

 Conversation:
 {json.dumps([ep['content'] for ep in context['previous_episodes']], indent=2)}
+<CURRENT MESSAGE>
 {context["episode_content"]}

 Guidelines: