mirror of
https://github.com/microsoft/graphrag.git
synced 2025-06-26 23:19:58 +00:00

* Wordind updates * Update yam lconfig and add notes to "deprecated" env * Add basic search section * Update versioning docs * Minor edits for clarity * Update init command * Update init to add --force in docs * Add NLP extraction params * Move vector_store to root * Add workflows to config * Add FastGraphRAG docs * add metadata column changes * Added documentation for multi index search. * Minor fixes. * Add config and table renames * Update migration notebook and comments to specify v1 * Add frequency to entity table docs * add new chunking options for metadata * Update output docs * Minor edits and cleanup * Add model ids to search configs * Spruce up migration notebook * Lint/format multi-index notebook * SpaCy model note * Update SpaCy footnote * Updated multi_index_search.ipynb to remove ruff errors. * add spacy to dictionary --------- Co-authored-by: Alonso Guevara <alonsog@microsoft.com> Co-authored-by: Dayenne Souza <ddesouza@microsoft.com> Co-authored-by: dorbaker <dorbaker@microsoft.com>
172 lines
6.7 KiB
Plaintext
172 lines
6.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Copyright (c) 2024 Microsoft Corporation.\n",
|
|
"# Licensed under the MIT License."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Index Migration (v1 to v2)\n",
|
|
"\n",
|
|
"This notebook is used to maintain data model parity with older indexes for version 2.0 of GraphRAG. If you have a pre-2.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-1.0 index, please run the v1 migration notebook first!\n",
|
|
"\n",
|
|
"NOTE: we recommend regenerating your settings.yaml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have a default vector store config, which is now required; without it, indexing will fail.\n",
|
|
"\n",
|
|
"WARNING: This will overwrite your parquet files, so you may want to make a backup first!"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# This is the directory that has your settings.yaml\n",
|
|
"PROJECT_DIRECTORY = \"<your project directory>\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"from graphrag.config.load_config import load_config\n",
|
|
"from graphrag.storage.factory import StorageFactory\n",
|
|
"\n",
|
|
"config = load_config(Path(PROJECT_DIRECTORY))\n",
|
|
"storage_config = config.output.model_dump()\n",
|
|
"storage = StorageFactory().create_storage(\n",
|
|
" storage_type=storage_config[\"type\"],\n",
|
|
" kwargs=storage_config,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def remove_columns(df, columns):\n",
|
|
" \"\"\"Remove columns from a DataFrame, suppressing errors.\"\"\"\n",
|
|
" df.drop(labels=columns, axis=1, errors=\"ignore\", inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"from graphrag.utils.storage import (\n",
|
|
" delete_table_from_storage,\n",
|
|
" load_table_from_storage,\n",
|
|
" write_table_to_storage,\n",
|
|
")\n",
|
|
"\n",
|
|
"final_documents = await load_table_from_storage(\"create_final_documents\", storage)\n",
|
|
"final_text_units = await load_table_from_storage(\"create_final_text_units\", storage)\n",
|
|
"final_entities = await load_table_from_storage(\"create_final_entities\", storage)\n",
|
|
"final_covariates = await load_table_from_storage(\"create_final_covariates\", storage)\n",
|
|
"final_nodes = await load_table_from_storage(\"create_final_nodes\", storage)\n",
|
|
"final_relationships = await load_table_from_storage(\n",
|
|
" \"create_final_relationships\", storage\n",
|
|
")\n",
|
|
"final_communities = await load_table_from_storage(\"create_final_communities\", storage)\n",
|
|
"final_community_reports = await load_table_from_storage(\n",
|
|
" \"create_final_community_reports\", storage\n",
|
|
")\n",
|
|
"\n",
|
|
"# we've renamed document attributes as metadata\n",
|
|
"if \"attributes\" in final_documents.columns:\n",
|
|
" final_documents.rename(columns={\"attributes\": \"metadata\"}, inplace=True)\n",
|
|
"\n",
|
|
"# we're removing the nodes table, so we need to copy the graph columns into entities\n",
|
|
"graph_props = (\n",
|
|
" final_nodes.loc[:, [\"id\", \"degree\", \"x\", \"y\"]].groupby(\"id\").first().reset_index()\n",
|
|
")\n",
|
|
"final_entities = final_entities.merge(graph_props, on=\"id\", how=\"left\")\n",
|
|
"# we're also persisting the frequency column: the number of text units each entity appears in\n",
|
|
"final_entities[\"frequency\"] = final_entities[\"text_unit_ids\"].apply(len)\n",
|
|
"\n",
|
|
"\n",
|
|
"# we added children to communities to eliminate query-time reconstruction\n",
|
|
"parent_grouped = final_communities.groupby(\"parent\").agg(\n",
|
|
" children=(\"community\", \"unique\")\n",
|
|
")\n",
|
|
"final_communities = final_communities.merge(\n",
|
|
" parent_grouped,\n",
|
|
" left_on=\"community\",\n",
|
|
" right_on=\"parent\",\n",
|
|
" how=\"left\",\n",
|
|
")\n",
|
|
"# replace NaN children with empty list\n",
|
|
"final_communities[\"children\"] = final_communities[\"children\"].apply(\n",
|
|
" lambda x: x if isinstance(x, np.ndarray) else [] # type: ignore\n",
|
|
")\n",
|
|
"\n",
|
|
"# add children to the reports as well\n",
|
|
"final_community_reports = final_community_reports.merge(\n",
|
|
" parent_grouped,\n",
|
|
" left_on=\"community\",\n",
|
|
" right_on=\"parent\",\n",
|
|
" how=\"left\",\n",
|
|
")\n",
|
|
"\n",
|
|
"# we renamed all the output files for better clarity now that we don't have workflow naming constraints from DataShaper\n",
|
|
"await write_table_to_storage(final_documents, \"documents\", storage)\n",
|
|
"await write_table_to_storage(final_text_units, \"text_units\", storage)\n",
|
|
"await write_table_to_storage(final_entities, \"entities\", storage)\n",
|
|
"await write_table_to_storage(final_relationships, \"relationships\", storage)\n",
|
|
"await write_table_to_storage(final_covariates, \"covariates\", storage)\n",
|
|
"await write_table_to_storage(final_communities, \"communities\", storage)\n",
|
|
"await write_table_to_storage(final_community_reports, \"community_reports\", storage)\n",
|
|
"\n",
|
|
"# delete all the old versions\n",
|
|
"await delete_table_from_storage(\"create_final_documents\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_text_units\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_entities\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_nodes\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_relationships\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_covariates\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_communities\", storage)\n",
|
|
"await delete_table_from_storage(\"create_final_community_reports\", storage)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|