From 20c120288b42135bf0849a2db4c2dd48d09ea345 Mon Sep 17 00:00:00 2001
From: Alonso Guevara
Date: Thu, 7 Nov 2024 06:59:10 -0600
Subject: [PATCH] Feat/update cli (#1376)

* Add update cli option with default storage

* Semver

* Semver

* Pyright

* Format
---
 .../patch-20241107010037320137.json           |  4 ++
 .vscode/settings.json                         |  1 +
 graphrag/cli/index.py                         | 67 ++++++++++++++++++-
 graphrag/cli/main.py                          | 67 ++++++++++++++++++-
 pyproject.toml                                |  1 +
 5 files changed, 138 insertions(+), 2 deletions(-)
 create mode 100644 .semversioner/next-release/patch-20241107010037320137.json

diff --git a/.semversioner/next-release/patch-20241107010037320137.json b/.semversioner/next-release/patch-20241107010037320137.json
new file mode 100644
index 00000000..faf6e32e
--- /dev/null
+++ b/.semversioner/next-release/patch-20241107010037320137.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add update cli entrypoint for incremental indexing"
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 70dbdd44..0b678d5d 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,6 +3,7 @@
     "**/.yarn": true,
     "**/.pnp.*": true
   },
+  "editor.formatOnSave": false,
   "eslint.nodePath": ".yarn/sdks",
   "typescript.tsdk": ".yarn/sdks/typescript/lib",
   "typescript.enablePromptUseWorkspaceTsdk": true,
diff --git a/graphrag/cli/index.py b/graphrag/cli/index.py
index e1db2bbc..c9ec2bdc 100644
--- a/graphrag/cli/index.py
+++ b/graphrag/cli/index.py
@@ -79,11 +79,76 @@ def index_cli(
     output_dir: Path | None,
 ):
     """Run the pipeline with the given config."""
+    config = load_config(root_dir, config_filepath)
+
+    _run_index(
+        config=config,
+        verbose=verbose,
+        resume=resume,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=reporter,
+        emit=emit,
+        dry_run=dry_run,
+        skip_validation=skip_validation,
+        output_dir=output_dir,
+    )
+
+
+def update_cli(
+    root_dir: Path,
+    verbose: bool,
+    memprofile: bool,
+    cache: bool,
+    reporter: ReporterType,
+    config_filepath: Path | None,
+    emit: list[TableEmitterType],
+    skip_validation: bool,
+    output_dir: Path | None,
+):
+    """Run the pipeline to update an index, defaulting update storage if unset."""
+    config = load_config(root_dir, config_filepath)
+
+    # Check if update storage exists; if not, configure it with default values
+    if not config.update_index_storage:
+        from graphrag.config.defaults import STORAGE_TYPE, UPDATE_STORAGE_BASE_DIR
+        from graphrag.config.models.storage_config import StorageConfig
+
+        config.update_index_storage = StorageConfig(
+            type=STORAGE_TYPE,
+            base_dir=UPDATE_STORAGE_BASE_DIR,
+        )
+
+    _run_index(
+        config=config,
+        verbose=verbose,
+        resume=False,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=reporter,
+        emit=emit,
+        dry_run=False,
+        skip_validation=skip_validation,
+        output_dir=output_dir,
+    )
+
+
+def _run_index(
+    config,
+    verbose,
+    resume,
+    memprofile,
+    cache,
+    reporter,
+    emit,
+    dry_run,
+    skip_validation,
+    output_dir,
+):
     progress_reporter = create_progress_reporter(reporter)
     info, error, success = _logger(progress_reporter)
     run_id = resume or time.strftime("%Y%m%d-%H%M%S")
 
-    config = load_config(root_dir, config_filepath)
     config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir
     config.reporting.base_dir = (
         str(output_dir) if output_dir else config.reporting.base_dir
diff --git a/graphrag/cli/main.py b/graphrag/cli/main.py
index da50242a..dce9702a 100644
--- a/graphrag/cli/main.py
+++ b/graphrag/cli/main.py
@@ -16,7 +16,7 @@ from graphrag.logging import ReporterType
 from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
 from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE
 
-from .index import index_cli
+from .index import index_cli, update_cli
 from .initialize import initialize_project_at
 from .prompt_tune import prompt_tune
 from .query import run_drift_search, run_global_search, run_local_search
@@ -129,6 +129,71 @@ def _index_cli(
     )
 
 
+@app.command("update")
+def _update_cli(
+    config: Annotated[
+        Path | None,
+        typer.Option(
+            help="The configuration to use.", exists=True, file_okay=True, readable=True
+        ),
+    ] = None,
+    root: Annotated[
+        Path,
+        typer.Option(
+            help="The project root directory.",
+            exists=True,
+            dir_okay=True,
+            writable=True,
+            resolve_path=True,
+        ),
+    ] = Path(),  # set default to current directory
+    verbose: Annotated[
+        bool, typer.Option(help="Run the indexing pipeline with verbose logging")
+    ] = False,
+    memprofile: Annotated[
+        bool, typer.Option(help="Run the indexing pipeline with memory profiling")
+    ] = False,
+    reporter: Annotated[
+        ReporterType, typer.Option(help="The progress reporter to use.")
+    ] = ReporterType.RICH,
+    emit: Annotated[
+        str, typer.Option(help="The data formats to emit, comma-separated.")
+    ] = TableEmitterType.Parquet.value,
+    cache: Annotated[bool, typer.Option(help="Use LLM cache.")] = True,
+    skip_validation: Annotated[
+        bool,
+        typer.Option(
+            help="Skip any preflight validation. Useful when running no LLM steps."
+        ),
+    ] = False,
+    output: Annotated[
+        Path | None,
+        typer.Option(
+            help="Indexing pipeline output directory. Overrides storage.base_dir in the configuration file.",
+            dir_okay=True,
+            writable=True,
+            resolve_path=True,
+        ),
+    ] = None,
+):
+    """
+    Update an existing knowledge graph index.
+
+    Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the `update_output` folder.
+    """
+    update_cli(
+        root_dir=root,
+        verbose=verbose,
+        memprofile=memprofile,
+        cache=cache,
+        reporter=ReporterType(reporter),
+        config_filepath=config,
+        emit=[TableEmitterType(value.strip()) for value in emit.split(",")],
+        skip_validation=skip_validation,
+        output_dir=output,
+    )
+
+
 @app.command("prompt-tune")
 def _prompt_tune_cli(
     root: Annotated[
diff --git a/pyproject.toml b/pyproject.toml
index b9ae32aa..8a12b26a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -139,6 +139,7 @@ test_smoke = "pytest ./tests/smoke"
 test_notebook = "pytest ./tests/notebook"
 test_verbs = "pytest ./tests/verbs"
 index = "python -m graphrag index"
+update = "python -m graphrag update"
 init = "python -m graphrag init"
 query = "python -m graphrag query"
 prompt_tune = "python -m graphrag prompt-tune"