Index migration to v2
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.
Index Migration (v1 to v2)
This notebook is used to maintain data model parity with older indexes for version 2.0 of GraphRAG. If you have a pre-2.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-1.0 index, please run the v1 migration notebook first!
NOTE: we recommend regenerating your settings.yaml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. It also ensures that you have default vector store config, which is now required — without it, indexing will fail.
WARNING: This will overwrite your parquet files, you may want to make a backup!
# This is the directory that has your settings.yaml
# Replace the placeholder with a real path; load_config raises
# FileNotFoundError if the directory does not exist (see traceback below).
PROJECT_DIRECTORY = "<your project directory>"
from pathlib import Path
from graphrag.config.load_config import load_config
from graphrag.storage.factory import StorageFactory
# Resolve settings.yaml (plus .env) from the project root into a config object.
config = load_config(Path(PROJECT_DIRECTORY))
# Dump the output-storage section to a plain dict so it can be passed as kwargs.
storage_config = config.output.model_dump()
# Build the storage backend (file/blob/etc.) named by config.output.type;
# the full section dict is forwarded so backend-specific keys are available.
storage = StorageFactory().create_storage(
storage_type=storage_config["type"],
kwargs=storage_config,
)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[3], line 6 3 from graphrag.config.load_config import load_config 4 from graphrag.storage.factory import StorageFactory ----> 6 config = load_config(Path(PROJECT_DIRECTORY)) 7 storage_config = config.output.model_dump() 8 storage = StorageFactory().create_storage( 9 storage_type=storage_config["type"], 10 kwargs=storage_config, 11 ) File ~/work/graphrag/graphrag/graphrag/config/load_config.py:183, in load_config(root_dir, config_filepath, cli_overrides) 151 """Load configuration from a file. 152 153 Parameters (...) 180 If there are pydantic validation errors when instantiating the config. 181 """ 182 root = root_dir.resolve() --> 183 config_path = _get_config_path(root, config_filepath) 184 _load_dotenv(config_path) 185 config_extension = config_path.suffix File ~/work/graphrag/graphrag/graphrag/config/load_config.py:106, in _get_config_path(root_dir, config_filepath) 104 raise FileNotFoundError(msg) 105 else: --> 106 config_path = _search_for_config_in_root_dir(root_dir) 108 if not config_path: 109 msg = f"Config file not found in root directory: {root_dir}" File ~/work/graphrag/graphrag/graphrag/config/load_config.py:40, in _search_for_config_in_root_dir(root) 38 if not root.is_dir(): 39 msg = f"Invalid config path: {root} is not a directory" ---> 40 raise FileNotFoundError(msg) 42 for file in _default_config_files: 43 if (root / file).is_file(): FileNotFoundError: Invalid config path: /home/runner/work/graphrag/graphrag/docs/examples_notebooks/<your project directory> is not a directory
def remove_columns(df, columns):
    """Drop the given columns from *df* in place, ignoring names that are absent."""
    # columns=... is equivalent to labels=... with axis=1;
    # errors="ignore" turns missing labels into a no-op instead of a KeyError.
    df.drop(columns=columns, errors="ignore", inplace=True)
import numpy as np
from graphrag.utils.storage import (
    delete_table_from_storage,
    load_table_from_storage,
    write_table_to_storage,
)
# Load every pre-2.0 ("create_final_*") parquet table from the configured
# storage. Top-level await works here because notebooks run cells inside an
# event loop. NOTE: `storage` must have been created in the cell above first
# (the NameError traceback below shows what happens when it wasn't).
final_documents = await load_table_from_storage("create_final_documents", storage)
final_text_units = await load_table_from_storage("create_final_text_units", storage)
final_entities = await load_table_from_storage("create_final_entities", storage)
# covariates only exist if claim extraction was enabled for the original
# index — presumably this load fails otherwise; TODO confirm and guard.
final_covariates = await load_table_from_storage("create_final_covariates", storage)
final_nodes = await load_table_from_storage("create_final_nodes", storage)
final_relationships = await load_table_from_storage(
    "create_final_relationships", storage
)
final_communities = await load_table_from_storage("create_final_communities", storage)
final_community_reports = await load_table_from_storage(
    "create_final_community_reports", storage
)
# we've renamed document attributes as metadata
if "attributes" in final_documents.columns:
    final_documents.rename(columns={"attributes": "metadata"}, inplace=True)

# we're removing the nodes table, so we need to copy the graph columns into entities.
# Nodes are duplicated per community level, so take the first occurrence per id.
graph_props = (
    final_nodes.loc[:, ["id", "degree", "x", "y"]].groupby("id").first().reset_index()
)
final_entities = final_entities.merge(graph_props, on="id", how="left")

# we're also persisting the frequency column: the number of text units each
# entity appears in. BUG FIX: the original used Series.count(), which returns
# a single scalar (the column-wide non-null count) and would broadcast that
# same number into every row. Per-entity frequency is the length of each
# row's text_unit_ids collection; rows with a missing collection get 0.
final_entities["frequency"] = final_entities["text_unit_ids"].apply(
    lambda ids: len(ids) if isinstance(ids, (list, np.ndarray)) else 0
)
# we added children to communities to eliminate query-time reconstruction.
# Group by parent id: for each parent, collect the unique ids of its child
# communities. The result is indexed by "parent" with one column, "children".
parent_grouped = final_communities.groupby("parent").agg(
    children=("community", "unique")
)
# Attach each community's children by matching its own id against the
# "parent" index level of parent_grouped (pandas merge accepts index level
# names in right_on). Leaf communities get NaN here.
final_communities = final_communities.merge(
    parent_grouped,
    left_on="community",
    right_on="parent",
    how="left",
)
# replace NaN children with empty list
# (np.ndarray is what .unique() produced above; anything else means "no children")
final_communities["children"] = final_communities["children"].apply(
    lambda x: x if isinstance(x, np.ndarray) else []  # type: ignore
)
# add children to the reports as well
# NOTE(review): unlike final_communities, the reports' children column is
# left as NaN for leaf communities — confirm downstream consumers accept that.
final_community_reports = final_community_reports.merge(
    parent_grouped,
    left_on="community",
    right_on="parent",
    how="left",
)
# we renamed all the output files for better clarity now that we don't have workflow naming constraints from DataShaper
await write_table_to_storage(final_documents, "documents", storage)
await write_table_to_storage(final_text_units, "text_units", storage)
await write_table_to_storage(final_entities, "entities", storage)
await write_table_to_storage(final_relationships, "relationships", storage)
await write_table_to_storage(final_covariates, "covariates", storage)
await write_table_to_storage(final_communities, "communities", storage)
await write_table_to_storage(final_community_reports, "community_reports", storage)
# delete all the old versions
await delete_table_from_storage("create_final_documents", storage)
await delete_table_from_storage("create_final_text_units", storage)
await delete_table_from_storage("create_final_entities", storage)
await delete_table_from_storage("create_final_nodes", storage)
await delete_table_from_storage("create_final_relationships", storage)
await delete_table_from_storage("create_final_covariates", storage)
await delete_table_from_storage("create_final_communities", storage)
await delete_table_from_storage("create_final_community_reports", storage)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 9 1 import numpy as np 3 from graphrag.utils.storage import ( 4 delete_table_from_storage, 5 load_table_from_storage, 6 write_table_to_storage, 7 ) ----> 9 final_documents = await load_table_from_storage("create_final_documents", storage) 10 final_text_units = await load_table_from_storage("create_final_text_units", storage) 11 final_entities = await load_table_from_storage("create_final_entities", storage) NameError: name 'storage' is not defined