Auto-generate CLI doc pages (#1325)

This commit is contained in:
Josh Bradley 2024-10-25 19:00:24 -04:00 committed by GitHub
parent d6e6f5c077
commit 083de12bcf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 36 additions and 75 deletions


@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "add-autogenerated-cli-docs"
}


@@ -67,6 +67,7 @@ pypi
nbformat
semversioner
mkdocs
typer
# Library Methods
iterrows

docs/cli.md Normal file

@@ -0,0 +1,9 @@
# CLI Reference
This page documents the command-line interface of the graphrag library.
::: mkdocs-typer
:module: graphrag.cli.main
:prog_name: graphrag
:command: app
:depth: 0
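
The `::: mkdocs-typer` directive renders documentation by introspecting a Typer application object exported by the named module (here, an `app` object in `graphrag.cli.main`). As a rough illustration of the module shape such a directive expects — the command bodies and options below are placeholders for illustration, not graphrag's actual implementation:

```python
import typer

# A minimal Typer app of the kind mkdocs-typer introspects.
app = typer.Typer(name="graphrag", help="GraphRAG command line interface.")

@app.command()
def index(root: str = typer.Option(".", help="Data project root directory.")) -> None:
    """Run the indexing pipeline."""
    typer.echo(f"Indexing project at {root}")

@app.command()
def query(method: str = typer.Option("local", help="Search method: local or global.")) -> None:
    """Query an indexed project."""
    typer.echo(f"Running {method} search")

if __name__ == "__main__":
    app()
```

Because the help text and options live on the `app` object itself, the rendered docs stay in sync with the code, which is the point of replacing the hand-written CLI pages below.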


@@ -30,6 +30,8 @@ It shows how to use the system to index some text, and then use the indexed data
pip install graphrag
```
The graphrag library includes a CLI for a no-code approach to getting started. Please review the full [CLI documentation](cli.md) for further detail.
# Running the Indexer
Now we need to set up a data project and some initial configuration. We're using the [default configuration mode](config/overview.md), which you can customize as needed using a [config file](config/json_yaml.md) (recommended) or [environment variables](config/env_vars.md).


@@ -1,23 +0,0 @@
# Indexer CLI
The GraphRAG indexer CLI allows for no-code usage of the GraphRAG Indexer.
```bash
graphrag index --verbose --root </workspace/project/root> \
--config <custom_config.yml> --resume <timestamp> \
--reporter <rich|print|none> --emit json,csv,parquet \
--no-cache
```
## CLI Arguments
- `--verbose` - Adds extra logging information during the run.
- `--root <data-project-dir>` - The data root directory. This should contain an `input` directory with the input data, and an `.env` file with environment variables.
- `--resume <output-timestamp>` - if specified, the pipeline will attempt to resume a prior run. The parquet files from the prior run will be loaded into the system as inputs, and the workflows that generated those files will be skipped. The input value should be the timestamped output folder, e.g. "20240105-143721".
- `--config <config_file.yml>` - This will opt-out of the Default Configuration mode and execute a custom configuration. If this is used, then none of the environment-variables below will apply.
- `--reporter <reporter>` - This will specify the progress reporter to use. The default is `rich`. Valid values are `rich`, `print`, and `none`.
- `--dry-run` - Runs the indexing pipeline without executing any steps in order to inspect and validate the configuration file.
- `--emit <types>` - This specifies the table output formats the pipeline should emit. The default is `parquet`. Valid values are `parquet`, `csv`, and `json`, comma-separated.
- `--no-cache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production.
- `--output <directory>` - Specify the output directory for pipeline artifacts.
- `--reports <directory>` - Specify the output directory for reporting.
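
The flags above map naturally onto a standard argument parser. The following is a stdlib sketch for illustration only — graphrag's actual CLI is built with Typer, and the parser name here is hypothetical:

```python
import argparse

def build_index_parser() -> argparse.ArgumentParser:
    """Illustrative parser mirroring the documented `graphrag index` flags."""
    parser = argparse.ArgumentParser(prog="graphrag index")
    parser.add_argument("--verbose", action="store_true", help="Extra logging during the run.")
    parser.add_argument("--root", default=".", help="Data project root directory.")
    parser.add_argument("--resume", default=None, help="Timestamped output folder of a prior run.")
    parser.add_argument("--config", default=None, help="Custom configuration file.")
    parser.add_argument("--reporter", default="rich", choices=["rich", "print", "none"])
    parser.add_argument("--dry-run", action="store_true", help="Validate config without running steps.")
    parser.add_argument("--emit", default="parquet", help="Comma-separated output table formats.")
    parser.add_argument("--no-cache", action="store_true", help="Disable the caching mechanism.")
    return parser

args = build_index_parser().parse_args(
    ["--verbose", "--root", "/data/project", "--emit", "parquet,csv"]
)
print(args.reporter)  # → rich (the documented default applies when the flag is omitted)
```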


@@ -1,44 +0,0 @@
# Query CLI
The GraphRAG query CLI allows for no-code usage of the GraphRAG Query engine.
```bash
graphrag query --config <config_file.yml> --data <path-to-data> \
    --community-level <community-level> --response-type <response-type> \
    --method <"local"|"global"> <query>
```
## CLI Arguments
- `--config <config_file.yml>` - The configuration yaml file to use when running the query. If this is used, then none of the environment-variables below will apply.
- `--data <path-to-data>` - Folder containing the `.parquet` output files from running the Indexer.
- `--community-level <community-level>` - Community level in the Leiden community hierarchy from which to load the community reports. A higher value means reports on smaller communities are used. Default: `2`
- `--response-type <response-type>` - Free-form text describing the desired response type and format, e.g. `Multiple Paragraphs`, `Single Paragraph`, `Single Sentence`, `List of 3-7 Points`, `Single Page`, `Multi-Page Report`. Default: `Multiple Paragraphs`
- `--method <"local"|"global">` - Method to use to answer the query, either `local` or `global`. For more information, see the [Overview](overview.md).
- `--streaming` - Stream back the LLM response.
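
The `--method` flag selects which of the two search engines answers the query. A sketch of that dispatch, under the assumption of one handler function per method (the handler bodies here are hypothetical stubs, not the real search engines):

```python
from typing import Callable

def run_local_search(query: str) -> str:
    # Hypothetical stub standing in for the local search engine.
    return f"[local] {query}"

def run_global_search(query: str) -> str:
    # Hypothetical stub standing in for the global search engine.
    return f"[global] {query}"

METHODS: dict[str, Callable[[str], str]] = {
    "local": run_local_search,
    "global": run_global_search,
}

def run_query(method: str, query: str) -> str:
    """Dispatch on the --method value; anything else is rejected."""
    if method not in METHODS:
        raise ValueError(f"--method must be one of {sorted(METHODS)}, got {method!r}")
    return METHODS[method](query)
```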
## Env Variables
Required environment variables to execute:
- `GRAPHRAG_API_KEY` - API key for the model; falls back to `OPENAI_API_KEY` if not provided.
- `GRAPHRAG_LLM_MODEL` - Model to use for Chat Completions.
- `GRAPHRAG_EMBEDDING_MODEL` - Model to use for Embeddings.
You can further customize the execution by providing these environment variables:
- `GRAPHRAG_LLM_API_BASE` - The API Base URL. Default: `None`
- `GRAPHRAG_LLM_TYPE` - The LLM operation type. Either `openai_chat` or `azure_openai_chat`. Default: `openai_chat`
- `GRAPHRAG_LLM_MAX_RETRIES` - The maximum number of retries to attempt when a request fails. Default: `20`
- `GRAPHRAG_EMBEDDING_API_BASE` - The API Base URL. Default: `None`
- `GRAPHRAG_EMBEDDING_TYPE` - The embedding client to use. Either `openai_embedding` or `azure_openai_embedding`. Default: `openai_embedding`
- `GRAPHRAG_EMBEDDING_MAX_RETRIES` - The maximum number of retries to attempt when a request fails. Default: `20`
- `GRAPHRAG_LOCAL_SEARCH_TEXT_UNIT_PROP` - Proportion of context window dedicated to related text units. Default: `0.5`
- `GRAPHRAG_LOCAL_SEARCH_COMMUNITY_PROP` - Proportion of context window dedicated to community reports. Default: `0.1`
- `GRAPHRAG_LOCAL_SEARCH_CONVERSATION_HISTORY_MAX_TURNS` - Maximum number of turns to include in the conversation history. Default: `5`
- `GRAPHRAG_LOCAL_SEARCH_TOP_K_ENTITIES` - Number of related entities to retrieve from the entity description embedding store. Default: `10`
- `GRAPHRAG_LOCAL_SEARCH_TOP_K_RELATIONSHIPS` - Control the number of out-of-network relationships to pull into the context window. Default: `10`
- `GRAPHRAG_LOCAL_SEARCH_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000). Default: `12000`
- `GRAPHRAG_LOCAL_SEARCH_LLM_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500). Default: `2000`
- `GRAPHRAG_GLOBAL_SEARCH_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000). Default: `12000`
- `GRAPHRAG_GLOBAL_SEARCH_DATA_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000). Default: `12000`
- `GRAPHRAG_GLOBAL_SEARCH_MAP_MAX_TOKENS` - Default: `500`
- `GRAPHRAG_GLOBAL_SEARCH_REDUCE_MAX_TOKENS` - Change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500). Default: `2000`
- `GRAPHRAG_GLOBAL_SEARCH_CONCURRENCY` - Default: `32`
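
The resolution pattern above — one required key with a fallback, and optional settings with documented defaults — can be sketched as follows. This is an illustration only, not graphrag's actual configuration loader; the helper names are assumptions:

```python
def resolve_api_key(env: dict[str, str]) -> str:
    """GRAPHRAG_API_KEY falls back to OPENAI_API_KEY, as documented above."""
    key = env.get("GRAPHRAG_API_KEY") or env.get("OPENAI_API_KEY")
    if not key:
        raise KeyError("Set GRAPHRAG_API_KEY or OPENAI_API_KEY")
    return key

def setting(env: dict[str, str], name: str, default: float) -> float:
    """Optional numeric settings fall back to their documented defaults."""
    return float(env.get(name, default))

env = {"OPENAI_API_KEY": "sk-demo", "GRAPHRAG_LOCAL_SEARCH_TOP_K_ENTITIES": "25"}
print(resolve_api_key(env))                                           # → sk-demo
print(setting(env, "GRAPHRAG_LOCAL_SEARCH_TOP_K_ENTITIES", 10))       # → 25.0
```

In practice the `env` dict would be `dict(os.environ)`.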


@@ -29,7 +29,6 @@ nav:
- Overview: "index/overview.md"
- Architecture: "index/architecture.md"
- Dataflow: "index/default_dataflow.md"
- CLI: "index/cli.md"
- Configuration:
- Overview: "config/overview.md"
- Init Command: "config/init.md"
@@ -46,13 +45,13 @@ nav:
- Local Search: "query/local_search.md"
- Question Generation: "query/question_generation.md"
- Global Search: "query/global_search.md"
- CLI: "query/cli.md"
- Notebooks:
- Overview: "query/notebooks/overview.md"
- Global Search: "examples_notebooks/global_search.ipynb"
- Local Search: "examples_notebooks/local_search.ipynb"
- Microsoft Research Blog: "blog_posts.md"
- Extras:
- CLI: "cli.md"
- Operation Dulce:
- About: "data/operation_dulce/ABOUT.md"
- Document: "data/operation_dulce/Operation Dulce v2 1 1.md"
@@ -104,3 +103,4 @@ markdown_extensions:
slugify: !!python/object/apply:pymdownx.slugs.slugify
kwds:
case: lower
- mkdocs-typer

poetry.lock generated

@@ -2545,6 +2545,21 @@ files = [
{file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"},
]
[[package]]
name = "mkdocs-typer"
version = "0.0.3"
description = "An MkDocs extension to generate documentation for Typer command line applications"
optional = false
python-versions = ">=3.7"
files = [
{file = "mkdocs_typer-0.0.3-py3-none-any.whl", hash = "sha256:b2a9a44da590a7100114fde4de9123fedfea692d229379984db20ee3b3f12d7c"},
{file = "mkdocs_typer-0.0.3.tar.gz", hash = "sha256:4dd37f024190a82aaf0f6c984faafb15167d34eab7e29a6a85e61362423a4eb7"},
]
[package.dependencies]
markdown = "==3.*"
typer = "==0.*"
[[package]]
name = "msal"
version = "1.31.0"
@@ -5200,4 +5215,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
content-hash = "0bcb3b8ebe38153edddd48f8077ddf58e4628e7b714731a9fa48785288d206b9"
content-hash = "7f78e10fa0099c66763c74fd0846581bfd760fb466bc3479c166a613e4881a3a"


@@ -92,6 +92,7 @@ json-repair = "^0.30.0"
future = "^1.0.0" # Needed until graspologic fixes their dependency
typer = "^0.12.5"
mkdocs-typer = "^0.0.3"
[tool.poetry.group.dev.dependencies]
coverage = "^7.6.0"
ipykernel = "^6.29.4"


@@ -482,10 +482,8 @@ class TestDefaultConfig(unittest.TestCase):
def test_all_env_vars_is_accurate(self):
env_var_docs_path = Path("docs/config/env_vars.md")
query_docs_path = Path("docs/query/cli.md")
env_var_docs = env_var_docs_path.read_text(encoding="utf-8")
query_docs = query_docs_path.read_text(encoding="utf-8")
def find_envvar_names(text) -> set[str]:
pattern = r"`(GRAPHRAG_[^`]+)`"
@@ -493,9 +491,7 @@
found = {f for f in found if not f.endswith("_")}
return {*found}
graphrag_strings = find_envvar_names(env_var_docs) | find_envvar_names(
query_docs
)
graphrag_strings = find_envvar_names(env_var_docs)
missing = {s for s in graphrag_strings if s not in ALL_ENV_VARS} - {
# Remove configs covered by the base LLM connection configs