Update relationships after inc index (#1236)

2025-12-13 07:51:34 +00:00 · 2024-09-30 18:18:58 -06:00 · 2024-09-30 18:18:58 -06:00 · 336e6f9ca1
commit 336e6f9ca1
parent 4d713f6b23
2 changed files with 66 additions and 4 deletions
--- a/.semversioner/next-release/patch-20240930234415130922.json
+++ b/.semversioner/next-release/patch-20240930234415130922.json
@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Add relationship merge"
+}
--- a/graphrag/index/update/dataframes.py
+++ b/graphrag/index/update/dataframes.py
@ -92,11 +92,26 @@ async def update_dataframe_outputs(

    merged_entities_df, _ = _group_and_resolve_entities(old_entities, delta_entities)
    # Save the updated entities back to storage
-    # TODO: Using _new in the mean time, to compare outputs without overwriting the original
+    # TODO: Using _new in the meantime, to compare outputs without overwriting the original
    await storage.set(
        "create_final_entities_new.parquet", merged_entities_df.to_parquet()
    )

+    # Merge old and delta relationships (the entities id mapping is not applied here)
+    old_relationships = await _load_table_from_storage(
+        "create_final_relationships.parquet", storage
+    )
+    delta_relationships = dataframe_dict["create_final_relationships"]
+    merged_relationships_df = _update_and_merge_relationships(
+        old_relationships,
+        delta_relationships,
+    )
+
+    # TODO: Using _new in the meantime, to compare outputs without overwriting the original
+    await storage.set(
+        "create_final_relationships_new.parquet", merged_relationships_df.to_parquet()
+    )
+

 async def _concat_dataframes(name, dataframe_dict, storage):
    """Concatenate the dataframes.
@ -148,6 +163,9 @@ def _group_and_resolve_entities(
    )
    id_mapping = dict(zip(merged["id_B"], merged["id_A"], strict=True))

+    # Increment human readable id in b by the max of a
+    df_b["human_readable_id"] += df_a["human_readable_id"].max() + 1
+
    # Concat A and B
    combined = pd.concat([df_a, df_b], copy=False)

@ -171,9 +189,6 @@ def _group_and_resolve_entities(
    # Force the result into a DataFrame
    resolved: pd.DataFrame = pd.DataFrame(aggregated)

-    # Recreate humand readable id with an autonumeric
-    resolved["human_readable_id"] = range(len(resolved))
-
    # Modify column order to keep consistency
    resolved = resolved.loc[
        :,
@ -190,3 +205,46 @@ def _group_and_resolve_entities(
    ]

    return resolved, id_mapping
+
+
+def _update_and_merge_relationships(
+    old_relationships: pd.DataFrame, delta_relationships: pd.DataFrame
+) -> pd.DataFrame:
+    """Concatenate old and delta relationships, then recompute degrees and rank.
+
+    Parameters
+    ----------
+    old_relationships : pd.DataFrame
+        The previously stored relationships.
+    delta_relationships : pd.DataFrame
+        The new relationships; its "human_readable_id" column is shifted in place.
+
+    Returns
+    -------
+    pd.DataFrame
+        Old rows followed by delta rows, with refreshed degree and rank columns.
+    """
+    # Offset delta ids by (largest old id + 1); NOTE: this mutates the caller's frame
+    delta_relationships["human_readable_id"] += (
+        old_relationships["human_readable_id"].max() + 1
+    )
+
+    # Stack old rows first, then delta rows (each frame keeps its own row index)
+    final_relationships = pd.concat(
+        [old_relationships, delta_relationships], copy=False
+    )
+
+    # Degrees: non-null targets per "source" group, non-null sources per "target" group
+    final_relationships["source_degree"] = final_relationships.groupby("source")[
+        "target"
+    ].transform("count")
+    final_relationships["target_degree"] = final_relationships.groupby("target")[
+        "source"
+    ].transform("count")
+
+    # Rank of each relationship is its source degree plus its target degree
+    final_relationships["rank"] = (
+        final_relationships["source_degree"] + final_relationships["target_degree"]
+    )
+
+    return final_relationships