From bfd8d3bb68e352cd02770050e045f960f000153b Mon Sep 17 00:00:00 2001
From: Preston Rasmussen <109292228+prasmussen15@users.noreply.github.com>
Date: Thu, 19 Sep 2024 16:16:40 -0400
Subject: [PATCH] Add group_id CRUD endpoints and option store content bool
 (#130)

* add group_ids CRUD

* option to not store content

* ellipsis
---
 README.md                 | 89 ++++++++++++++++++++++++---------------
 graphiti_core/edges.py    | 79 +++++++++++++++++++++++++++++++---
 graphiti_core/graphiti.py | 17 ++++++--
 graphiti_core/nodes.py    | 72 ++++++++++++++++++++++++++-----
 4 files changed, 202 insertions(+), 55 deletions(-)
diff --git a/README.md b/README.md
index 02b326d3..766caf8a 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,9 @@
 
 </div>
 
-Graphiti builds dynamic, temporally aware Knowledge Graphs that represent complex, evolving relationships between entities over time. Graphiti ingests both unstructured and structured data, and the resulting graph may be queried using a fusion of time, full-text, semantic, and graph algorithm approaches.
+Graphiti builds dynamic, temporally aware Knowledge Graphs that represent complex, evolving relationships between
+entities over time. Graphiti ingests both unstructured and structured data, and the resulting graph may be queried using
+a fusion of time, full-text, semantic, and graph algorithm approaches.
 
 <br />
 
@@ -26,25 +28,39 @@ Graphiti builds dynamic, temporally aware Knowledge Graphs that represent comple
 
 <br />
 
-Graphiti helps you create and query Knowledge Graphs that evolve over time. A knowledge graph is a network of interconnected facts, such as _“Kendra loves Adidas shoes.”_ Each fact is a “triplet” represented by two entities, or nodes (_”Kendra”_, _“Adidas shoes”_), and their relationship, or edge (_”loves”_). Knowledge Graphs have been explored extensively for information retrieval. What makes Graphiti unique is its ability to autonomously build a knowledge graph while handling changing relationships and maintaining historical context.
+Graphiti helps you create and query Knowledge Graphs that evolve over time. A knowledge graph is a network of
+interconnected facts, such as _“Kendra loves Adidas shoes.”_ Each fact is a “triplet” represented by two entities, or
+nodes (_“Kendra”_, _“Adidas shoes”_), and their relationship, or edge (_“loves”_). Knowledge Graphs have been explored
+extensively for information retrieval. What makes Graphiti unique is its ability to autonomously build a knowledge graph
+while handling changing relationships and maintaining historical context.
 
 With Graphiti, you can build LLM applications such as:
 
-- Assistants that learn from user interactions, fusing personal knowledge with dynamic data from business systems like CRMs and billing platforms.
+- Assistants that learn from user interactions, fusing personal knowledge with dynamic data from business systems like
+  CRMs and billing platforms.
 - Agents that autonomously execute complex tasks, reasoning with state changes from multiple dynamic sources.
 
-Graphiti supports a wide range of applications in sales, customer service, health, finance, and more, enabling long-term recall and state-based reasoning for both assistants and agents.
+Graphiti supports a wide range of applications in sales, customer service, health, finance, and more, enabling long-term
+recall and state-based reasoning for both assistants and agents.
 
 ## Why Graphiti?
 
-We were intrigued by Microsoft’s GraphRAG, which expanded on RAG text chunking by using a graph to better model a document corpus and making this representation available via semantic and graph search techniques. However, GraphRAG did not address our core problem: It's primarily designed for static documents and doesn't inherently handle temporal aspects of data.
+We were intrigued by Microsoft’s GraphRAG, which expanded on RAG text chunking by using a graph to better model a
+document corpus and making this representation available via semantic and graph search techniques. However, GraphRAG did
+not address our core problem: It's primarily designed for static documents and doesn't inherently handle temporal
+aspects of data.
 
-Graphiti is designed from the ground up to handle constantly changing information, hybrid semantic and graph search, and scale:
+Graphiti is designed from the ground up to handle constantly changing information, hybrid semantic and graph search, and
+scale:
 
-- **Temporal Awareness:** Tracks changes in facts and relationships over time, enabling point-in-time queries. Graph edges include temporal metadata to record relationship lifecycles.
-- **Episodic Processing:** Ingests data as discrete episodes, maintaining data provenance and allowing incremental entity and relationship extraction.
-- **Hybrid Search:** Combines semantic and BM25 full-text search, with the ability to rerank results by distance from a central node e.g. “Kendra”.
-- **Scalable:** Designed for processing large datasets, with parallelization of LLM calls for bulk processing while preserving the chronology of events.
+- **Temporal Awareness:** Tracks changes in facts and relationships over time, enabling point-in-time queries. Graph
+  edges include temporal metadata to record relationship lifecycles.
+- **Episodic Processing:** Ingests data as discrete episodes, maintaining data provenance and allowing incremental
+  entity and relationship extraction.
+- **Hybrid Search:** Combines semantic and BM25 full-text search, with the ability to rerank results by distance from a
+  central node e.g. “Kendra”.
+- **Scalable:** Designed for processing large datasets, with parallelization of LLM calls for bulk processing while
+  preserving the chronology of events.
 - **Supports Varied Sources:** Can ingest both unstructured text and structured JSON data.
 
 <p align="center">
@@ -63,6 +79,7 @@ Requirements:
 
 - Python 3.10 or higher
 - Neo4j 5.21 or higher
+- Neo4j Graph Data Science (GDS) plugin (required for community flows)
 - OpenAI API key (for LLM inference and embedding)
 
 Optional:
@@ -70,7 +87,8 @@ Optional:
 - Anthropic or Groq API key (for alternative LLM providers)
 
 > [!TIP]
-> The simplest way to install Neo4j is via [Neo4j Desktop](https://neo4j.com/download/). It provides a user-friendly interface to manage Neo4j instances and databases.
+> The simplest way to install Neo4j is via [Neo4j Desktop](https://neo4j.com/download/). It provides a user-friendly
+> interface to manage Neo4j instances and databases.
 
 ```bash
 pip install graphiti-core
@@ -85,7 +103,8 @@ poetry add graphiti-core
 ## Quick Start
 
 > [!IMPORTANT]
-> Graphiti uses OpenAI for LLM inference and embedding. Ensure that an `OPENAI_API_KEY` is set in your environment. Support for Anthropic and Groq LLM inferences is available, too.
+> Graphiti uses OpenAI for LLM inference and embedding. Ensure that an `OPENAI_API_KEY` is set in your environment.
+> Support for Anthropic and Groq LLM inference is also available.
 
 ```python
 from graphiti_core import Graphiti
@@ -119,25 +138,25 @@ for i, episode in enumerate(episodes):
 results = await graphiti.search('Who was the California Attorney General?')
 [
     EntityEdge(
-    │   uuid='3133258f738e487383f07b04e15d4ac0',
-    │   source_node_uuid='2a85789b318d4e418050506879906e62',
-    │   target_node_uuid='baf7781f445945989d6e4f927f881556',
-    │   created_at=datetime.datetime(2024, 8, 26, 13, 13, 24, 861097),
-    │   name='HELD_POSITION',
-        # the fact reflects the updated state that Harris is
-        # no longer the AG of California
-    │   fact='Kamala Harris was the Attorney General of California',
-    │   fact_embedding=[
-    │   │   -0.009955154731869698,
-    │       ...
-    │   │   0.00784289836883545
-    │   ],
-    │   episodes=['b43e98ad0a904088a76c67985caecc22'],
-    │   expired_at=datetime.datetime(2024, 8, 26, 20, 18, 1, 53812),
-        # These dates represent the date this edge was true.
-    │   valid_at=datetime.datetime(2011, 1, 3, 0, 0, tzinfo=<UTC>),
-    │   invalid_at=datetime.datetime(2017, 1, 3, 0, 0, tzinfo=<UTC>)
-    )
+    │   uuid='3133258f738e487383f07b04e15d4ac0',
+    │   source_node_uuid='2a85789b318d4e418050506879906e62',
+    │   target_node_uuid='baf7781f445945989d6e4f927f881556',
+    │   created_at=datetime.datetime(2024, 8, 26, 13, 13, 24, 861097),
+    │   name='HELD_POSITION',
+        # the fact reflects the updated state that Harris is
+        # no longer the AG of California
+    │   fact='Kamala Harris was the Attorney General of California',
+    │   fact_embedding=[
+    │   │   -0.009955154731869698,
+    │       ...
+    │   │   0.00784289836883545
+    │   ],
+    │   episodes=['b43e98ad0a904088a76c67985caecc22'],
+    │   expired_at=datetime.datetime(2024, 8, 26, 20, 18, 1, 53812),
+        # These dates represent the date this edge was true.
+    │   valid_at=datetime.datetime(2011, 1, 3, 0, 0, tzinfo=<UTC>),
+    │   invalid_at=datetime.datetime(2017, 1, 3, 0, 0, tzinfo=<UTC>)
+    )
 ]
 
 # Rerank search results based on graph distance
@@ -170,14 +189,16 @@ Graphiti is under active development. We aim to maintain API stability while wor
 - [ ] Achieving good performance with different LLM and embedding models
 - [ ] Creating a dedicated embedder interface
 - [ ] Supporting custom graph schemas:
-  - Allow developers to provide their own defined node and edge classes when ingesting episodes
-  - Enable more flexible knowledge representation tailored to specific use cases
+    - Allow developers to provide their own defined node and edge classes when ingesting episodes
+    - Enable more flexible knowledge representation tailored to specific use cases
 - [ ] Enhancing retrieval capabilities with more robust and configurable options
 - [ ] Expanding test coverage to ensure reliability and catch edge cases
 
 ## Contributing
 
-We encourage and appreciate all forms of contributions, whether it's code, documentation, addressing GitHub Issues, or answering questions in the Graphiti Discord channel. For detailed guidelines on code contributions, please refer to [CONTRIBUTING](CONTRIBUTING.md).
+We encourage and appreciate all forms of contributions, whether it's code, documentation, addressing GitHub Issues, or
+answering questions in the Graphiti Discord channel. For detailed guidelines on code contributions, please refer
+to [CONTRIBUTING](CONTRIBUTING.md).
 
 ## Support
 
diff --git a/graphiti_core/edges.py b/graphiti_core/edges.py
index a8d6f8d9..18f2f8a9 100644
--- a/graphiti_core/edges.py
+++ b/graphiti_core/edges.py
@@ -104,7 +104,6 @@ class EpisodicEdge(Edge):
 
         edges = [get_episodic_edge_from_record(record) for record in records]
 
-        logger.info(f'Found Edge: {uuid}')
         if len(edges) == 0:
             raise EdgeNotFoundError(uuid)
         return edges[0]
@@ -127,7 +126,29 @@ class EpisodicEdge(Edge):
 
         edges = [get_episodic_edge_from_record(record) for record in records]
 
-        logger.info(f'Found Edges: {uuids}')
+        if len(edges) == 0:
+            raise EdgeNotFoundError(uuids[0])
+        return edges
+
+    @classmethod
+    async def get_by_group_ids(cls, driver: AsyncDriver, group_ids: list[str | None]):  # fetch Episodic->Entity MENTIONS edges whose group_id is in group_ids
+        records, _, _ = await driver.execute_query(  # only the records are used; the other two result fields are discarded
+            """
+        MATCH (n:Episodic)-[e:MENTIONS]->(m:Entity)
+        WHERE e.group_id IN $group_ids
+        RETURN
+            e.uuid As uuid,
+            e.group_id AS group_id,
+            n.uuid AS source_node_uuid, 
+            m.uuid AS target_node_uuid, 
+            e.created_at AS created_at
+        """,
+            group_ids=group_ids,  # bound to $group_ids in the query
+        )
+
+        edges = [get_episodic_edge_from_record(record) for record in records]
+        uuids = [f'group_ids={group_ids}']  # label for the not-found error: no uuid was requested, and a list built from the (empty) edges made uuids[0] raise IndexError
+        # Raises EdgeNotFoundError when no edge matches; otherwise returns every matching edge.
         if len(edges) == 0:
             raise EdgeNotFoundError(uuids[0])
         return edges
@@ -215,7 +236,6 @@ class EntityEdge(Edge):
 
         edges = [get_entity_edge_from_record(record) for record in records]
 
-        logger.info(f'Found Edge: {uuid}')
         if len(edges) == 0:
             raise EdgeNotFoundError(uuid)
         return edges[0]
@@ -245,7 +265,36 @@ class EntityEdge(Edge):
 
         edges = [get_entity_edge_from_record(record) for record in records]
 
-        logger.info(f'Found Edges: {uuids}')
+        if len(edges) == 0:
+            raise EdgeNotFoundError(uuids[0])
+        return edges
+
+    @classmethod
+    async def get_by_group_ids(cls, driver: AsyncDriver, group_ids: list[str | None]):  # fetch Entity->Entity RELATES_TO edges whose group_id is in group_ids
+        records, _, _ = await driver.execute_query(  # only the records are used; the other two result fields are discarded
+            """
+        MATCH (n:Entity)-[e:RELATES_TO]->(m:Entity)
+        WHERE e.group_id IN $group_ids
+        RETURN
+            e.uuid AS uuid,
+            n.uuid AS source_node_uuid,
+            m.uuid AS target_node_uuid,
+            e.created_at AS created_at,
+            e.name AS name,
+            e.group_id AS group_id,
+            e.fact AS fact,
+            e.fact_embedding AS fact_embedding,
+            e.episodes AS episodes,
+            e.expired_at AS expired_at,
+            e.valid_at AS valid_at,
+            e.invalid_at AS invalid_at
+        """,
+            group_ids=group_ids,  # bound to $group_ids in the query
+        )
+
+        edges = [get_entity_edge_from_record(record) for record in records]
+        uuids = [f'group_ids={group_ids}']  # label for the not-found error: no uuid was requested, and a list built from the (empty) edges made uuids[0] raise IndexError
+        # Raises EdgeNotFoundError when no edge matches; otherwise returns every matching edge.
         if len(edges) == 0:
             raise EdgeNotFoundError(uuids[0])
         return edges
@@ -288,8 +337,6 @@ class CommunityEdge(Edge):
 
         edges = [get_community_edge_from_record(record) for record in records]
 
-        logger.info(f'Found Edge: {uuid}')
-
         return edges[0]
 
     @classmethod
@@ -310,7 +357,25 @@ class CommunityEdge(Edge):
 
         edges = [get_community_edge_from_record(record) for record in records]
 
-        logger.info(f'Found Edges: {uuids}')
+        return edges
+
+    @classmethod
+    async def get_by_group_ids(cls, driver: AsyncDriver, group_ids: list[str | None]):  # fetch Community HAS_MEMBER edges whose group_id is in group_ids
+        records, _, _ = await driver.execute_query(  # only the records are used; the other two result fields are discarded
+            """
+        MATCH (n:Community)-[e:HAS_MEMBER]->(m:Entity | Community)
+        WHERE e.group_id IN $group_ids
+        RETURN
+            e.uuid As uuid,
+            e.group_id AS group_id,
+            n.uuid AS source_node_uuid, 
+            m.uuid AS target_node_uuid, 
+            e.created_at AS created_at
+        """,
+            group_ids=group_ids,  # bound to $group_ids in the query
+        )
+        # NOTE(review): a None entry in group_ids probably matches nothing (Cypher IN against null yields null) - confirm
+        edges = [get_community_edge_from_record(record) for record in records]  # empty list when nothing matches; no not-found error is raised here
 
         return edges
 
diff --git a/graphiti_core/graphiti.py b/graphiti_core/graphiti.py
index 7cf9719a..da66c0c1 100644
--- a/graphiti_core/graphiti.py
+++ b/graphiti_core/graphiti.py
@@ -77,7 +77,14 @@ load_dotenv()
 
 
 class Graphiti:
-    def __init__(self, uri: str, user: str, password: str, llm_client: LLMClient | None = None):
+    def __init__(
+        self,
+        uri: str,
+        user: str,
+        password: str,
+        llm_client: LLMClient | None = None,
+        store_raw_episode_content: bool = True,
+    ):
         """
         Initialize a Graphiti instance.
 
@@ -116,6 +123,7 @@ class Graphiti:
         """
         self.driver = AsyncGraphDatabase.driver(uri, auth=(user, password))
         self.database = 'neo4j'
+        self.store_raw_episode_content = store_raw_episode_content
         if llm_client:
             self.llm_client = llm_client
         else:
@@ -251,6 +259,8 @@ class Graphiti:
             An id for the graph partition the episode is a part of.
         uuid : str | None
             Optional uuid of the episode.
+        update_communities : bool
+            Optional. Whether to update communities with new node information
 
         Returns
         -------
@@ -276,7 +286,6 @@ class Graphiti:
         try:
             start = time()
 
-            nodes: list[EntityNode] = []
             entity_edges: list[EntityEdge] = []
             embedder = self.llm_client.get_embedder()
             now = datetime.now()
@@ -295,6 +304,8 @@ class Graphiti:
                 valid_at=reference_time,
             )
             episode.uuid = uuid if uuid is not None else episode.uuid
+            if not self.store_raw_episode_content:
+                episode.content = ''
 
             # Extract entities as nodes
 
@@ -323,7 +334,7 @@ class Graphiti:
                 ),
             )
             logger.info(f'Adjusted mentioned nodes: {[(n.name, n.uuid) for n in mentioned_nodes]}')
-            nodes.extend(mentioned_nodes)
+            nodes = mentioned_nodes
 
             extracted_edges_with_resolved_pointers = resolve_edge_pointers(
                 extracted_edges, uuid_map
diff --git a/graphiti_core/nodes.py b/graphiti_core/nodes.py
index 769cfe5e..a0431a64 100644
--- a/graphiti_core/nodes.py
+++ b/graphiti_core/nodes.py
@@ -158,8 +158,6 @@ class EpisodicNode(Node):
 
         episodes = [get_episodic_node_from_record(record) for record in records]
 
-        logger.info(f'Found Node: {uuid}')
-
         if len(episodes) == 0:
             raise NodeNotFoundError(uuid)
 
@@ -185,7 +183,27 @@ class EpisodicNode(Node):
 
         episodes = [get_episodic_node_from_record(record) for record in records]
 
-        logger.info(f'Found Nodes: {uuids}')
+        return episodes
+
+    @classmethod
+    async def get_by_group_ids(cls, driver: AsyncDriver, group_ids: list[str | None]):  # fetch Episodic nodes whose group_id is in group_ids
+        records, _, _ = await driver.execute_query(  # only the records are used; the other two result fields are discarded
+            """
+        MATCH (e:Episodic) WHERE e.group_id IN $group_ids
+            RETURN DISTINCT
+            e.content AS content,
+            e.created_at AS created_at,
+            e.valid_at AS valid_at,
+            e.uuid AS uuid,
+            e.name AS name,
+            e.group_id AS group_id,
+            e.source_description AS source_description,
+            e.source AS source
+        """,
+            group_ids=group_ids,  # bound to $group_ids in the query
+        )
+        # NOTE(review): a None entry in group_ids probably matches nothing (Cypher IN against null yields null) - confirm
+        episodes = [get_episodic_node_from_record(record) for record in records]  # empty list when nothing matches; no not-found error is raised here
 
         return episodes
 
@@ -240,8 +258,6 @@ class EntityNode(Node):
 
         nodes = [get_entity_node_from_record(record) for record in records]
 
-        logger.info(f'Found Node: {uuid}')
-
         return nodes[0]
 
     @classmethod
@@ -262,7 +278,25 @@ class EntityNode(Node):
 
         nodes = [get_entity_node_from_record(record) for record in records]
 
-        logger.info(f'Found Nodes: {uuids}')
+        return nodes
+
+    @classmethod
+    async def get_by_group_ids(cls, driver: AsyncDriver, group_ids: list[str | None]):  # fetch Entity nodes whose group_id is in group_ids
+        records, _, _ = await driver.execute_query(  # only the records are used; the other two result fields are discarded
+            """
+        MATCH (n:Entity) WHERE n.group_id IN $group_ids
+        RETURN
+            n.uuid As uuid, 
+            n.name AS name,
+            n.name_embedding AS name_embedding,
+            n.group_id AS group_id,
+            n.created_at AS created_at, 
+            n.summary AS summary
+        """,
+            group_ids=group_ids,  # bound to $group_ids in the query
+        )
+        # NOTE(review): a None entry in group_ids probably matches nothing (Cypher IN against null yields null) - confirm
+        nodes = [get_entity_node_from_record(record) for record in records]  # empty list when nothing matches; no not-found error is raised here
 
         return nodes
 
@@ -317,8 +351,6 @@ class CommunityNode(Node):
 
         nodes = [get_community_node_from_record(record) for record in records]
 
-        logger.info(f'Found Node: {uuid}')
-
         return nodes[0]
 
     @classmethod
@@ -337,11 +369,29 @@ class CommunityNode(Node):
             uuids=uuids,
         )
 
-        nodes = [get_community_node_from_record(record) for record in records]
+        communities = [get_community_node_from_record(record) for record in records]
 
-        logger.info(f'Found Nodes: {uuids}')
+        return communities
 
-        return nodes
+    @classmethod
+    async def get_by_group_ids(cls, driver: AsyncDriver, group_ids: list[str | None]):  # fetch Community nodes whose group_id is in group_ids
+        records, _, _ = await driver.execute_query(  # only the records are used; the other two result fields are discarded
+            """
+        MATCH (n:Community) WHERE n.group_id IN $group_ids
+        RETURN
+            n.uuid As uuid, 
+            n.name AS name,
+            n.name_embedding AS name_embedding,
+            n.group_id AS group_id,
+            n.created_at AS created_at, 
+            n.summary AS summary
+        """,
+            group_ids=group_ids,  # bound to $group_ids in the query
+        )
+        # NOTE(review): a None entry in group_ids probably matches nothing (Cypher IN against null yields null) - confirm
+        communities = [get_community_node_from_record(record) for record in records]  # empty list when nothing matches; no not-found error is raised here
+
+        return communities
 
 
 # Node helpers