mirror of
https://github.com/microsoft/graphrag.git
synced 2025-06-26 23:19:58 +00:00

* Remove create_final_nodes * Rename final entity output to "entities" * Remove duplicate code from graph extraction * Rename create_final_relationships output to "relationships" * Rename create_final_communities output to "communities" * Combine compute_communities and create_final_communities * Rename create_final_covariates output to "covariates" * Rename create_final_community_reports output to "community_reports" * Rename create_final_text_units output to "text_units" * Rename create_final_documents output to "documents" * Remove transient snapshots config * Move create_final_entities to finalize_entities operation * Move create_final_relationships flow to finalize_relationships operation * Reuse some community report functions * Collapse most of graph and text unit-based report generation * Unify schemas files * Move community reports extractor * Move NLP report prompt to prompts folder * Fix a few pandas warnings * Rename embeddings config to embed_text * Rename claim_extraction config to extract_claims * Remove nltk from standard graph extraction * Fix verb tests * Fix extract graph config naming * Fix moved file reference * Create v1-to-v2 migration notebook * Semver * Fix smoke test artifact count * Raise tpm/rpm on smoke tests * Update drift settings for smoke tests * Reuse project directory var in api notebook * Format * Format
141 lines
5.4 KiB
Plaintext
141 lines
5.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 41,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Copyright (c) 2024 Microsoft Corporation.\n",
|
|
"# Licensed under the MIT License."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Index Migration (v1 to v2)\n",
|
|
"\n",
|
|
"This notebook is used to maintain data model parity with older indexes for version 2.0 of GraphRAG. If you have a pre-2.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-1.0 index, please run the v1 migration notebook first!\n",
|
|
"\n",
|
|
"NOTE: we recommend regenerating your settings.yaml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have the default vector store config, which is now required; without it, indexing will fail.\n",
|
|
"\n",
|
|
"WARNING: This will overwrite your parquet files, so you may want to make a backup first!"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 42,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# This is the directory that has your settings.yaml\n",
|
|
"PROJECT_DIRECTORY = \"<your project directory>\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 43,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"from graphrag.config.load_config import load_config\n",
|
|
"from graphrag.storage.factory import StorageFactory\n",
|
|
"\n",
|
|
"config = load_config(Path(PROJECT_DIRECTORY))\n",
|
|
"storage_config = config.output.model_dump()\n",
|
|
"storage = StorageFactory().create_storage(\n",
|
|
" storage_type=storage_config[\"type\"],\n",
|
|
" kwargs=storage_config,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 44,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def remove_columns(df, columns):\n",
|
|
" \"\"\"Remove columns from a DataFrame, suppressing errors.\"\"\"\n",
|
|
" df.drop(labels=columns, axis=1, errors=\"ignore\", inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 45,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from graphrag.utils.storage import (\n",
|
|
" delete_table_from_storage,\n",
|
|
" load_table_from_storage,\n",
|
|
" write_table_to_storage,\n",
|
|
")\n",
|
|
"\n",
|
|
"final_documents = await load_table_from_storage(\"create_final_documents\", storage)\n",
|
|
"final_text_units = await load_table_from_storage(\"create_final_text_units\", storage)\n",
|
|
"final_entities = await load_table_from_storage(\"create_final_entities\", storage)\n",
|
|
"final_nodes = await load_table_from_storage(\"create_final_nodes\", storage)\n",
|
|
"final_relationships = await load_table_from_storage(\n",
|
|
" \"create_final_relationships\", storage\n",
|
|
")\n",
|
|
"final_communities = await load_table_from_storage(\"create_final_communities\", storage)\n",
|
|
"final_community_reports = await load_table_from_storage(\n",
|
|
" \"create_final_community_reports\", storage\n",
|
|
")\n",
|
|
"\n",
|
|
"# we've renamed document attributes as metadata\n",
|
|
"if \"attributes\" in final_documents.columns:\n",
|
|
" final_documents.rename(columns={\"attributes\": \"metadata\"}, inplace=True)\n",
|
|
"\n",
|
|
"# we're removing the nodes table, so we need to copy the graph columns into entities\n",
|
|
"graph_props = (\n",
|
|
" final_nodes.loc[:, [\"id\", \"degree\", \"x\", \"y\"]].groupby(\"id\").first().reset_index()\n",
|
|
")\n",
|
|
"final_entities = final_entities.merge(graph_props, on=\"id\", how=\"left\")\n",
|
|
"\n",
|
|
"# we renamed all the output files for better clarity now that we don't have workflow naming constraints from DataShaper\n",
|
|
"await write_table_to_storage(final_documents, \"documents\", storage)\n",
|
|
"await write_table_to_storage(final_text_units, \"text_units\", storage)\n",
|
|
"await write_table_to_storage(final_entities, \"entities\", storage)\n",
|
|
"await write_table_to_storage(final_relationships, \"relationships\", storage)\n",
|
|
"await write_table_to_storage(final_communities, \"communities\", storage)\n",
|
|
"await write_table_to_storage(final_community_reports, \"community_reports\", storage)\n",
|
|
"\n",
|
|
"# delete all the old versions\n",
|
|
"await delete_table_from_storage(\"create_final_documents\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_text_units\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_entities\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_nodes\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_relationships\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_communities\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_community_reports\", storage)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|