mirror of
https://github.com/microsoft/graphrag.git
synced 2025-12-17 10:09:43 +00:00
Update relationships after inc index (#1236)
This commit is contained in:
parent
4d713f6b23
commit
336e6f9ca1
@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"type": "patch",
|
||||||
|
"description": "Add relationship merge"
|
||||||
|
}
|
||||||
@ -92,11 +92,26 @@ async def update_dataframe_outputs(
|
|||||||
|
|
||||||
merged_entities_df, _ = _group_and_resolve_entities(old_entities, delta_entities)
|
merged_entities_df, _ = _group_and_resolve_entities(old_entities, delta_entities)
|
||||||
# Save the updated entities back to storage
|
# Save the updated entities back to storage
|
||||||
# TODO: Using _new in the mean time, to compare outputs without overwriting the original
|
# TODO: Using _new in the meantime, to compare outputs without overwriting the original
|
||||||
await storage.set(
|
await storage.set(
|
||||||
"create_final_entities_new.parquet", merged_entities_df.to_parquet()
|
"create_final_entities_new.parquet", merged_entities_df.to_parquet()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Update relationships with the entities id mapping
|
||||||
|
old_relationships = await _load_table_from_storage(
|
||||||
|
"create_final_relationships.parquet", storage
|
||||||
|
)
|
||||||
|
delta_relationships = dataframe_dict["create_final_relationships"]
|
||||||
|
merged_relationships_df = _update_and_merge_relationships(
|
||||||
|
old_relationships,
|
||||||
|
delta_relationships,
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO: Using _new in the meantime, to compare outputs without overwriting the original
|
||||||
|
await storage.set(
|
||||||
|
"create_final_relationships_new.parquet", merged_relationships_df.to_parquet()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def _concat_dataframes(name, dataframe_dict, storage):
|
async def _concat_dataframes(name, dataframe_dict, storage):
|
||||||
"""Concatenate the dataframes.
|
"""Concatenate the dataframes.
|
||||||
@ -148,6 +163,9 @@ def _group_and_resolve_entities(
|
|||||||
)
|
)
|
||||||
id_mapping = dict(zip(merged["id_B"], merged["id_A"], strict=True))
|
id_mapping = dict(zip(merged["id_B"], merged["id_A"], strict=True))
|
||||||
|
|
||||||
|
# Increment human readable id in b by the max of a
|
||||||
|
df_b["human_readable_id"] += df_a["human_readable_id"].max() + 1
|
||||||
|
|
||||||
# Concat A and B
|
# Concat A and B
|
||||||
combined = pd.concat([df_a, df_b], copy=False)
|
combined = pd.concat([df_a, df_b], copy=False)
|
||||||
|
|
||||||
@ -171,9 +189,6 @@ def _group_and_resolve_entities(
|
|||||||
# Force the result into a DataFrame
|
# Force the result into a DataFrame
|
||||||
resolved: pd.DataFrame = pd.DataFrame(aggregated)
|
resolved: pd.DataFrame = pd.DataFrame(aggregated)
|
||||||
|
|
||||||
# Recreate human readable id with an autonumeric
|
|
||||||
resolved["human_readable_id"] = range(len(resolved))
|
|
||||||
|
|
||||||
# Modify column order to keep consistency
|
# Modify column order to keep consistency
|
||||||
resolved = resolved.loc[
|
resolved = resolved.loc[
|
||||||
:,
|
:,
|
||||||
@ -190,3 +205,46 @@ def _group_and_resolve_entities(
|
|||||||
]
|
]
|
||||||
|
|
||||||
return resolved, id_mapping
|
return resolved, id_mapping
|
||||||
|
|
||||||
|
|
||||||
|
def _update_and_merge_relationships(
|
||||||
|
old_relationships: pd.DataFrame, delta_relationships: pd.DataFrame
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""Update and merge relationships.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
old_relationships : pd.DataFrame
|
||||||
|
The old relationships.
|
||||||
|
delta_relationships : pd.DataFrame
|
||||||
|
The delta relationships.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pd.DataFrame
|
||||||
|
The updated relationships.
|
||||||
|
"""
|
||||||
|
# Increment the human readable id in b by the max of a
|
||||||
|
delta_relationships["human_readable_id"] += (
|
||||||
|
old_relationships["human_readable_id"].max() + 1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Merge the final relationships
|
||||||
|
final_relationships = pd.concat(
|
||||||
|
[old_relationships, delta_relationships], copy=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Recalculate target and source degrees
|
||||||
|
final_relationships["source_degree"] = final_relationships.groupby("source")[
|
||||||
|
"target"
|
||||||
|
].transform("count")
|
||||||
|
final_relationships["target_degree"] = final_relationships.groupby("target")[
|
||||||
|
"source"
|
||||||
|
].transform("count")
|
||||||
|
|
||||||
|
# Recalculate the rank of the relationships (source degree + target degree)
|
||||||
|
final_relationships["rank"] = (
|
||||||
|
final_relationships["source_degree"] + final_relationships["target_degree"]
|
||||||
|
)
|
||||||
|
|
||||||
|
return final_relationships
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user