mirror of
https://github.com/microsoft/graphrag.git
synced 2025-12-17 10:09:43 +00:00
Update relationships after inc index (#1236)
This commit is contained in:
parent
4d713f6b23
commit
336e6f9ca1
@ -0,0 +1,4 @@
|
|||||||
|
{
|
||||||
|
"type": "patch",
|
||||||
|
"description": "Add relationship merge"
|
||||||
|
}
|
||||||
@ -92,11 +92,26 @@ async def update_dataframe_outputs(
|
|||||||
|
|
||||||
merged_entities_df, _ = _group_and_resolve_entities(old_entities, delta_entities)
|
merged_entities_df, _ = _group_and_resolve_entities(old_entities, delta_entities)
|
||||||
# Save the updated entities back to storage
|
# Save the updated entities back to storage
|
||||||
# TODO: Using _new in the mean time, to compare outputs without overwriting the original
|
# TODO: Using _new in the meantime, to compare outputs without overwriting the original
|
||||||
await storage.set(
|
await storage.set(
|
||||||
"create_final_entities_new.parquet", merged_entities_df.to_parquet()
|
"create_final_entities_new.parquet", merged_entities_df.to_parquet()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Update relationships with the entities id mapping
|
||||||
|
old_relationships = await _load_table_from_storage(
|
||||||
|
"create_final_relationships.parquet", storage
|
||||||
|
)
|
||||||
|
delta_relationships = dataframe_dict["create_final_relationships"]
|
||||||
|
merged_relationships_df = _update_and_merge_relationships(
|
||||||
|
old_relationships,
|
||||||
|
delta_relationships,
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO: Using _new in the meantime, to compare outputs without overwriting the original
|
||||||
|
await storage.set(
|
||||||
|
"create_final_relationships_new.parquet", merged_relationships_df.to_parquet()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def _concat_dataframes(name, dataframe_dict, storage):
|
async def _concat_dataframes(name, dataframe_dict, storage):
|
||||||
"""Concatenate the dataframes.
|
"""Concatenate the dataframes.
|
||||||
@ -148,6 +163,9 @@ def _group_and_resolve_entities(
|
|||||||
)
|
)
|
||||||
id_mapping = dict(zip(merged["id_B"], merged["id_A"], strict=True))
|
id_mapping = dict(zip(merged["id_B"], merged["id_A"], strict=True))
|
||||||
|
|
||||||
|
# Increment human readable id in b by the max of a
|
||||||
|
df_b["human_readable_id"] += df_a["human_readable_id"].max() + 1
|
||||||
|
|
||||||
# Concat A and B
|
# Concat A and B
|
||||||
combined = pd.concat([df_a, df_b], copy=False)
|
combined = pd.concat([df_a, df_b], copy=False)
|
||||||
|
|
||||||
@ -171,9 +189,6 @@ def _group_and_resolve_entities(
|
|||||||
# Force the result into a DataFrame
|
# Force the result into a DataFrame
|
||||||
resolved: pd.DataFrame = pd.DataFrame(aggregated)
|
resolved: pd.DataFrame = pd.DataFrame(aggregated)
|
||||||
|
|
||||||
# Recreate human readable id with an autonumeric
|
|
||||||
resolved["human_readable_id"] = range(len(resolved))
|
|
||||||
|
|
||||||
# Modify column order to keep consistency
|
# Modify column order to keep consistency
|
||||||
resolved = resolved.loc[
|
resolved = resolved.loc[
|
||||||
:,
|
:,
|
||||||
@ -190,3 +205,46 @@ def _group_and_resolve_entities(
|
|||||||
]
|
]
|
||||||
|
|
||||||
return resolved, id_mapping
|
return resolved, id_mapping
|
||||||
|
|
||||||
|
|
||||||
|
def _update_and_merge_relationships(
|
||||||
|
old_relationships: pd.DataFrame, delta_relationships: pd.DataFrame
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""Update and merge relationships.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
old_relationships : pd.DataFrame
|
||||||
|
The old relationships.
|
||||||
|
delta_relationships : pd.DataFrame
|
||||||
|
The delta relationships.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pd.DataFrame
|
||||||
|
The updated relationships.
|
||||||
|
"""
|
||||||
|
# Increment the human readable id in b by the max of a
|
||||||
|
delta_relationships["human_readable_id"] += (
|
||||||
|
old_relationships["human_readable_id"].max() + 1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Merge the final relationships
|
||||||
|
final_relationships = pd.concat(
|
||||||
|
[old_relationships, delta_relationships], copy=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Recalculate target and source degrees
|
||||||
|
final_relationships["source_degree"] = final_relationships.groupby("source")[
|
||||||
|
"target"
|
||||||
|
].transform("count")
|
||||||
|
final_relationships["target_degree"] = final_relationships.groupby("target")[
|
||||||
|
"source"
|
||||||
|
].transform("count")
|
||||||
|
|
||||||
|
# Recalculate the rank of the relationships (source degree + target degree)
|
||||||
|
final_relationships["rank"] = (
|
||||||
|
final_relationships["source_degree"] + final_relationships["target_degree"]
|
||||||
|
)
|
||||||
|
|
||||||
|
return final_relationships
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user