Feat/update cli (#1376)

* Add update cli option with default storage * Semver * Semver * Pyright * Format
2025-06-26 23:19:58 +00:00 · 2024-11-07 06:59:10 -06:00 · 2024-11-07 06:59:10 -06:00 · 20c120288b
commit 20c120288b
parent baa261c8e9
5 changed files with 138 additions and 2 deletions
--- a/.semversioner/next-release/patch-20241107010037320137.json
+++ b/.semversioner/next-release/patch-20241107010037320137.json
@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Add update cli entrypoint for incremental indexing"
+}
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -3,6 +3,7 @@
    "**/.yarn": true,
    "**/.pnp.*": true
  },
+  "editor.formatOnSave": false,
  "eslint.nodePath": ".yarn/sdks",
  "typescript.tsdk": ".yarn/sdks/typescript/lib",
  "typescript.enablePromptUseWorkspaceTsdk": true,
--- a/graphrag/cli/index.py
+++ b/graphrag/cli/index.py
@ -79,11 +79,76 @@ def index_cli(
    output_dir: Path | None,
 ):
    """Run the pipeline with the given config."""
+    config = load_config(root_dir, config_filepath)
+
+    _run_index(
+        config=config,
+        verbose=verbose,
+        resume=resume,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=reporter,
+        emit=emit,
+        dry_run=dry_run,
+        skip_validation=skip_validation,
+        output_dir=output_dir,
+    )
+
+
+def update_cli(
+    root_dir: Path,
+    verbose: bool,
+    memprofile: bool,
+    cache: bool,
+    reporter: ReporterType,
+    config_filepath: Path | None,
+    emit: list[TableEmitterType],
+    skip_validation: bool,
+    output_dir: Path | None,
+):
+    """Run the pipeline as an index update using the given config."""
+    config = load_config(root_dir, config_filepath)
+
+    # If no update storage is configured, fall back to the default storage settings
+    if not config.update_index_storage:
+        from graphrag.config.defaults import STORAGE_TYPE, UPDATE_STORAGE_BASE_DIR
+        from graphrag.config.models.storage_config import StorageConfig
+
+        config.update_index_storage = StorageConfig(
+            type=STORAGE_TYPE,
+            base_dir=UPDATE_STORAGE_BASE_DIR,
+        )
+
+    _run_index(
+        config=config,
+        verbose=verbose,
+        resume=False,  # no resume id, so _run_index generates a fresh run_id
+        memprofile=memprofile,
+        cache=cache,
+        reporter=reporter,
+        emit=emit,
+        dry_run=False,  # update_cli takes no dry-run option
+        skip_validation=skip_validation,
+        output_dir=output_dir,
+    )
+
+
+def _run_index(
+    config,
+    verbose,
+    resume,
+    memprofile,
+    cache,
+    reporter,
+    emit,
+    dry_run,
+    skip_validation,
+    output_dir,
+):
    progress_reporter = create_progress_reporter(reporter)
    info, error, success = _logger(progress_reporter)
    run_id = resume or time.strftime("%Y%m%d-%H%M%S")

-    config = load_config(root_dir, config_filepath)
    config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir
    config.reporting.base_dir = (
        str(output_dir) if output_dir else config.reporting.base_dir
--- a/graphrag/cli/main.py
+++ b/graphrag/cli/main.py
@ -16,7 +16,7 @@ from graphrag.logging import ReporterType
 from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
 from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE

-from .index import index_cli
+from .index import index_cli, update_cli
 from .initialize import initialize_project_at
 from .prompt_tune import prompt_tune
 from .query import run_drift_search, run_global_search, run_local_search
@ -129,6 +129,71 @@ def _index_cli(
    )


+@app.command("update")
+def _update_cli(
+    config: Annotated[
+        Path | None,
+        typer.Option(
+            help="The configuration to use.", exists=True, file_okay=True, readable=True
+        ),
+    ] = None,
+    root: Annotated[
+        Path,
+        typer.Option(
+            help="The project root directory.",
+            exists=True,
+            dir_okay=True,
+            writable=True,
+            resolve_path=True,
+        ),
+    ] = Path(),  # set default to current directory
+    verbose: Annotated[
+        bool, typer.Option(help="Run the indexing pipeline with verbose logging")
+    ] = False,
+    memprofile: Annotated[
+        bool, typer.Option(help="Run the indexing pipeline with memory profiling")
+    ] = False,
+    reporter: Annotated[
+        ReporterType, typer.Option(help="The progress reporter to use.")
+    ] = ReporterType.RICH,
+    emit: Annotated[
+        str, typer.Option(help="The data formats to emit, comma-separated.")
+    ] = TableEmitterType.Parquet.value,  # default: emit parquet only
+    cache: Annotated[bool, typer.Option(help="Use LLM cache.")] = True,
+    skip_validation: Annotated[
+        bool,
+        typer.Option(
+            help="Skip any preflight validation. Useful when running no LLM steps."
+        ),
+    ] = False,
+    output: Annotated[
+        Path | None,
+        typer.Option(
+            help="Indexing pipeline output directory. Overrides storage.base_dir in the configuration file.",
+            dir_okay=True,
+            writable=True,
+            resolve_path=True,
+        ),
+    ] = None,
+):
+    """
+    Update an existing knowledge graph index.
+
+    Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the `update_output` folder.
+    """
+    update_cli(  # delegate to the implementation imported from .index
+        root_dir=root,
+        verbose=verbose,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=ReporterType(reporter),
+        config_filepath=config,
+        emit=[TableEmitterType(value.strip()) for value in emit.split(",")],
+        skip_validation=skip_validation,
+        output_dir=output,
+    )
+
+
@app.command("prompt-tune")
 def _prompt_tune_cli(
    root: Annotated[
--- a/pyproject.toml
+++ b/pyproject.toml
@ -139,6 +139,7 @@ test_smoke = "pytest ./tests/smoke"
 test_notebook = "pytest ./tests/notebook"
 test_verbs = "pytest ./tests/verbs"
 index = "python -m graphrag index"
+update = "python -m graphrag update"
 init = "python -m graphrag init"
 query = "python -m graphrag query"
 prompt_tune = "python -m graphrag prompt-tune"