Feat/update cli (#1376)

* Add update cli option with default storage * Semver * Semver * Pyright * Format
2025-06-26 23:19:58 +00:00 · 2024-11-07 06:59:10 -06:00 · 2024-11-07 06:59:10 -06:00 · 20c120288b
commit 20c120288b
parent baa261c8e9
5 changed files with 138 additions and 2 deletions
--- a/.semversioner/next-release/patch-20241107010037320137.json
+++ b/.semversioner/next-release/patch-20241107010037320137.json
@ -0,0 +1,4 @@
 {
  "type": "patch",
  "description": "Add update cli entrypoint for incremental indexing"
 }
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -3,6 +3,7 @@
    "**/.yarn": true,
    "**/.pnp.*": true
  },
  "editor.formatOnSave": false,
  "eslint.nodePath": ".yarn/sdks",
  "typescript.tsdk": ".yarn/sdks/typescript/lib",
  "typescript.enablePromptUseWorkspaceTsdk": true,
--- a/graphrag/cli/index.py
+++ b/graphrag/cli/index.py
@ -79,11 +79,76 @@ def index_cli(
    output_dir: Path | None,
 ):
    """Run the pipeline with the given config."""
    config = load_config(root_dir, config_filepath)
    _run_index(
        config=config,
        verbose=verbose,
        resume=resume,
        memprofile=memprofile,
        cache=cache,
        reporter=reporter,
        emit=emit,
        dry_run=dry_run,
        skip_validation=skip_validation,
        output_dir=output_dir,
    )
 def update_cli(
    root_dir: Path,
    verbose: bool,
    memprofile: bool,
    cache: bool,
    reporter: ReporterType,
    config_filepath: Path | None,
    emit: list[TableEmitterType],
    skip_validation: bool,
    output_dir: Path | None,
 ):
    """Run an index update with the given config.

    The configuration is obtained from ``load_config(root_dir, config_filepath)``.
    When it carries no ``update_index_storage``, one is installed from the
    ``STORAGE_TYPE`` / ``UPDATE_STORAGE_BASE_DIR`` defaults. The run is then
    delegated to ``_run_index`` with ``resume`` and ``dry_run`` both fixed to
    ``False``; every other argument is forwarded unchanged.
    """
    cfg = load_config(root_dir, config_filepath)

    if not cfg.update_index_storage:
        # Fallback path only: pull in the defaults and the storage model here
        # so they are imported just when a default has to be built.
        from graphrag.config.defaults import STORAGE_TYPE, UPDATE_STORAGE_BASE_DIR
        from graphrag.config.models.storage_config import StorageConfig

        default_update_storage = StorageConfig(
            type=STORAGE_TYPE,
            base_dir=UPDATE_STORAGE_BASE_DIR,
        )
        cfg.update_index_storage = default_update_storage

    # The update entrypoint exposes no resume / dry-run switches, so both are
    # passed as False.
    _run_index(
        config=cfg,
        verbose=verbose,
        resume=False,
        memprofile=memprofile,
        cache=cache,
        reporter=reporter,
        emit=emit,
        dry_run=False,
        skip_validation=skip_validation,
        output_dir=output_dir,
    )
 def _run_index(
    config,
    verbose,
    resume,
    memprofile,
    cache,
    reporter,
    emit,
    dry_run,
    skip_validation,
    output_dir,
 ):
    progress_reporter = create_progress_reporter(reporter)
    info, error, success = _logger(progress_reporter)
    run_id = resume or time.strftime("%Y%m%d-%H%M%S")
    config = load_config(root_dir, config_filepath)
    config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir
    config.reporting.base_dir = (
        str(output_dir) if output_dir else config.reporting.base_dir
--- a/graphrag/cli/main.py
+++ b/graphrag/cli/main.py
@ -16,7 +16,7 @@ from graphrag.logging import ReporterType
 from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
 from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE
-from .index import index_cli
+from .index import index_cli, update_cli
 from .initialize import initialize_project_at
 from .prompt_tune import prompt_tune
 from .query import run_drift_search, run_global_search, run_local_search
@ -129,6 +129,71 @@ def _index_cli(
    )
@app.command("update")
def _update_cli(
    config: Annotated[
        Path | None,
        typer.Option(
            help="The configuration to use.", exists=True, file_okay=True, readable=True
        ),
    ] = None,
    root: Annotated[
        Path,
        typer.Option(
            help="The project root directory.",
            exists=True,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = Path(),  # set default to current directory
    verbose: Annotated[
        bool, typer.Option(help="Run the indexing pipeline with verbose logging")
    ] = False,
    memprofile: Annotated[
        bool, typer.Option(help="Run the indexing pipeline with memory profiling")
    ] = False,
    reporter: Annotated[
        ReporterType, typer.Option(help="The progress reporter to use.")
    ] = ReporterType.RICH,
    emit: Annotated[
        str, typer.Option(help="The data formats to emit, comma-separated.")
    ] = TableEmitterType.Parquet.value,
    cache: Annotated[bool, typer.Option(help="Use LLM cache.")] = True,
    skip_validation: Annotated[
        bool,
        typer.Option(
            help="Skip any preflight validation. Useful when running no LLM steps."
        ),
    ] = False,
    output: Annotated[
        Path | None,
        typer.Option(
            help="Indexing pipeline output directory. Overrides storage.base_dir in the configuration file.",
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = None,
):
    """
    Update an existing knowledge graph index.
    Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the `update_output` folder.
    """
    # NOTE: the docstring above is kept verbatim because Typer surfaces a
    # command function's docstring as that command's help text.

    # Coerce the reporter first, then the emitters, so a bad value surfaces in
    # the same order as a single inline call expression would evaluate them.
    reporter_type = ReporterType(reporter)

    # `emit` arrives as one comma-separated string; each piece is trimmed and
    # converted to its TableEmitterType member.
    emitter_types = [
        TableEmitterType(token.strip()) for token in emit.split(",")
    ]

    update_cli(
        root_dir=root,
        verbose=verbose,
        memprofile=memprofile,
        cache=cache,
        reporter=reporter_type,
        config_filepath=config,
        emit=emitter_types,
        skip_validation=skip_validation,
        output_dir=output,
    )
@app.command("prompt-tune")
 def _prompt_tune_cli(
    root: Annotated[
--- a/pyproject.toml
+++ b/pyproject.toml
@ -139,6 +139,7 @@ test_smoke = "pytest ./tests/smoke"
 test_notebook = "pytest ./tests/notebook"
 test_verbs = "pytest ./tests/verbs"
 index = "python -m graphrag index"
 update = "python -m graphrag update"
 init = "python -m graphrag init"
 query = "python -m graphrag query"
 prompt_tune = "python -m graphrag prompt-tune"