Merge branch 'main' into incremental_indexing/main

Commit 67f4b02ecd by Alonso Guevara, 2024-09-10 16:04:01 -06:00
18 changed files with 511 additions and 434 deletions

.semversioner/0.3.3.json (new file)

@@ -0,0 +1,66 @@
{
    "changes": [
        {
            "description": "Add entrypoints for incremental indexing",
            "type": "patch"
        },
        {
            "description": "Clean up and organize run index code",
            "type": "patch"
        },
        {
            "description": "Consistent config loading. Resolves #99 and Resolves #1049",
            "type": "patch"
        },
        {
            "description": "Fix circular dependency when running prompt tune api directly",
            "type": "patch"
        },
        {
            "description": "Fix default settings for embedding",
            "type": "patch"
        },
        {
            "description": "Fix img for auto tune",
            "type": "patch"
        },
        {
            "description": "Fix img width",
            "type": "patch"
        },
        {
            "description": "Fixed a bug in prompt tuning process",
            "type": "patch"
        },
        {
            "description": "Refactor text unit build at local search",
            "type": "patch"
        },
        {
            "description": "Update Prompt Tuning docs",
            "type": "patch"
        },
        {
            "description": "Update create_pipeline_config.py",
            "type": "patch"
        },
        {
            "description": "Update prompt tune command in docs",
            "type": "patch"
        },
        {
            "description": "add querying from azure blob storage",
            "type": "patch"
        },
        {
            "description": "fix setting base_dir to full paths when not using file system.",
            "type": "patch"
        },
        {
            "description": "fix strategy config in entity_extraction",
            "type": "patch"
        }
    ],
    "created_at": "2024-09-10T19:51:24+00:00",
    "version": "0.3.3"
}
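
The release file above is the consolidated semversioner record: a `changes` array of `{"type", "description"}` entries plus `created_at` and `version`. A minimal sketch (plain Python, not semversioner's or graphrag's own code) of reading it and rendering the same bullet format that appears in CHANGELOG.md below:

```python
import json
from pathlib import Path

# Load the release record added in this commit and print changelog-style bullets.
release = json.loads(Path(".semversioner/0.3.3.json").read_text())

print(f"## {release['version']}")
print()
for change in release["changes"]:
    # e.g. "- patch: Add entrypoints for incremental indexing"
    print(f"- {change['type']}: {change['description']}")
```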

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "fix strategy config in entity_extraction"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fixed a bug in prompt tuning process"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix default settings for embedding"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Refactor text unit build at local search"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix circular dependency when running prompt tune api directly"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Update Prompt Tuning docs"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Update prompt tune command in docs"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix img for auto tune"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix img width"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Consistent config loading. Resolves #99 and Resolves #1049"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Add entrypoints for incremental indexing"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Clean up and organize run index code"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "fix setting base_dir to full paths when not using file system."
}

CHANGELOG.md

@@ -1,88 +1,106 @@
# Changelog
Note: version releases in the 0.x.y range may introduce breaking changes.
+
+## 0.3.3
+
+- patch: Add entrypoints for incremental indexing
+- patch: Clean up and organize run index code
+- patch: Consistent config loading. Resolves #99 and Resolves #1049
+- patch: Fix circular dependency when running prompt tune api directly
+- patch: Fix default settings for embedding
+- patch: Fix img for auto tune
+- patch: Fix img width
+- patch: Fixed a bug in prompt tuning process
+- patch: Refactor text unit build at local search
+- patch: Update Prompt Tuning docs
+- patch: Update create_pipeline_config.py
+- patch: Update prompt tune command in docs
+- patch: add querying from azure blob storage
+- patch: fix setting base_dir to full paths when not using file system.
+- patch: fix strategy config in entity_extraction

## 0.3.2

- patch: Add context data to query API responses.
- patch: Add missing config parameter documentation for prompt tuning
- patch: Add neo4j community notebook
- patch: Ensure entity types to be str when running prompt tuning
- patch: Fix weight casting during graph extraction
- patch: Patch "past" dependency issues
- patch: Update developer guide.
- patch: Update query type hints.
- patch: change-lancedb-placement

## 0.3.1

- patch: Add preflight check to check LLM connectivity.
- patch: Add streaming support for local/global search to query cli
- patch: Add support for both float and int on schema validation for community report generation
- patch: Avoid running index on gh-pages publishing
- patch: Implement Index API
- patch: Improves filtering for data dir inferring
- patch: Update to nltk 3.9.1

## 0.3.0

- minor: Implement auto templating API.
- minor: Implement query engine API.
- patch: Fix file dumps using json for non ASCII chars
- patch: Stabilize smoke tests for query context building
- patch: fix query embedding
- patch: fix sort_context & max_tokens params in verb

## 0.2.2

- patch: Add a check if there is no community record added in local search context
- patch: Add sepparate workflow for Python Tests
- patch: Docs updates
- patch: Run smoke tests on 4o

## 0.2.1

- patch: Added default columns for vector store at create_pipeline_config. No change for other cases.
- patch: Change json parsing error in the map step of global search to warning
- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config
- patch: Fix json parsing when LLM returns faulty responses
- patch: Fix missing community reports and refactor community context builder
- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.
- patch: Try parsing json before even repairing
- patch: Update Prompt Tuning meta prompts with finer examples
- patch: Update default entity extraction and gleaning prompts to reduce hallucinations
- patch: add encoding-model to entity/claim extraction config
- patch: add encoding-model to text chunking config
- patch: add user prompt to history-tracking llm
- patch: update config reader to allow for zero gleans
- patch: update config-reader to allow for empty chunk-by arrays
- patch: update history-tracking LLm to use 'assistant' instead of 'system' in output history.
- patch: use history argument in hash key computation; add history input to cache data

## 0.2.0

- minor: Add content-based KNN for selecting prompt tune few shot examples
- minor: Add dynamic community report rating to the prompt tuning engine
- patch: Add Minute-based Rate Limiting and fix rpm, tpm settings
- patch: Add N parameter support
- patch: Add cli flag to overlay default values onto a provided config.
- patch: Add exception handling on file load
- patch: Add language support to prompt tuning
- patch: Add llm params to local and global search
- patch: Fix broken prompt tuning link on docs
- patch: Fix delta none on query calls
- patch: Fix docsite base url
- patch: Fix encoding model parameter on prompt tune
- patch: Fix for --limit exceeding the dataframe length
- patch: Fix for Ruff 0.5.2
- patch: Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM
- patch: Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls
- patch: fix community_report doesn't work in settings.yaml
- patch: fix llm response content is None in query
- patch: fix the organization parameter is ineffective during queries
- patch: remove duplicate file read
- patch: support non-open ai model config to prompt tune
- patch: use binary io processing for all file io operations

## 0.1.0

- minor: Initial Release

@@ -274,7 +274,7 @@ def _get_embedding_settings(
    #
    strategy = settings.resolved_strategy()  # get the default strategy
    strategy.update({
-        "vector_store": {**vector_store_settings, **(vector_store_params or {})}
+        "vector_store": {**(vector_store_params or {}), **vector_store_settings}
    })  # update the default strategy with the vector store settings
    # This ensures the vector store config is part of the strategy and not the global config
    return {
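
The one-line change above flips the order of the dict unpacks, so on duplicate keys the resolved vector store settings now take precedence over caller-supplied `vector_store_params` (previously the params won). A minimal sketch of the underlying Python behavior, using made-up keys rather than graphrag's real settings:

```python
# In a dict literal, later ** unpacks override earlier ones on key collisions.
settings = {"overwrite": True, "collection": "entities"}  # hypothetical resolved settings
params = {"overwrite": False}                             # hypothetical caller params

before = {**settings, **params}  # params unpacked last  -> params win
after = {**params, **settings}   # settings unpacked last -> settings win

print(before["overwrite"], after["overwrite"])  # False True
```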

@@ -9,8 +9,14 @@ from pathlib import Path
import pandas as pd

-from graphrag.config import load_config, resolve_path
+from graphrag.config import (
+    GraphRagConfig,
+    load_config,
+    resolve_path,
+)
+from graphrag.index.create_pipeline_config import create_pipeline_config
from graphrag.index.progress import PrintProgressReporter
+from graphrag.utils.storage import _create_storage, _load_table_from_storage

from . import api
@@ -36,17 +42,21 @@ def run_global_search(
    if data_dir:
        config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_nodes.parquet"
-    )
-    final_entities: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_entities.parquet"
-    )
-    final_community_reports: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_entities.parquet",
+            "create_final_community_reports.parquet",
+        ],
+        optional_list=[],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]

    # call the Query API
    if streaming:
@@ -112,23 +122,26 @@ def run_local_search(
    if data_dir:
        config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet")
-    final_community_reports = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
-    final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet")
-    final_relationships = pd.read_parquet(
-        data_path / "create_final_relationships.parquet"
-    )
-    final_entities = pd.read_parquet(data_path / "create_final_entities.parquet")
-    final_covariates_path = data_path / "create_final_covariates.parquet"
-    final_covariates = (
-        pd.read_parquet(final_covariates_path)
-        if final_covariates_path.exists()
-        else None
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_community_reports.parquet",
+            "create_final_text_units.parquet",
+            "create_final_relationships.parquet",
+            "create_final_entities.parquet",
+        ],
+        optional_list=["create_final_covariates.parquet"],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
+    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_covariates: pd.DataFrame | None = dataframe_dict["create_final_covariates"]

    # call the Query API
    if streaming:
@@ -179,3 +192,35 @@ def run_local_search(
    # NOTE: we return the response and context data here purely as a complete demonstration of the API.
    # External users should use the API directly to get the response and context data.
    return response, context_data
+
+
+def _resolve_parquet_files(
+    root_dir: str,
+    config: GraphRagConfig,
+    parquet_list: list[str],
+    optional_list: list[str],
+) -> dict[str, pd.DataFrame]:
+    """Read parquet files to a dataframe dict."""
+    dataframe_dict = {}
+    pipeline_config = create_pipeline_config(config)
+    storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage)
+    for parquet_file in parquet_list:
+        df_key = parquet_file.split(".")[0]
+        df_value = asyncio.run(
+            _load_table_from_storage(name=parquet_file, storage=storage_obj)
+        )
+        dataframe_dict[df_key] = df_value
+
+    # for optional parquet files, set the dict entry to None instead of erroring out if it does not exist
+    for optional_file in optional_list:
+        file_exists = asyncio.run(storage_obj.has(optional_file))
+        df_key = optional_file.split(".")[0]
+        if file_exists:
+            df_value = asyncio.run(
+                _load_table_from_storage(name=optional_file, storage=storage_obj)
+            )
+            dataframe_dict[df_key] = df_value
+        else:
+            dataframe_dict[df_key] = None
+
+    return dataframe_dict
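
A hypothetical usage sketch for the new `_resolve_parquet_files` helper, mirroring the calls made by `run_global_search` and `run_local_search` above. Here `config` stands in for a `GraphRagConfig` produced by graphrag's config loading (elided rather than guessing its exact signature), and the `./ragtest` root is a placeholder:

```python
# Illustrative only: load required tables, treat covariates as optional.
dataframe_dict = _resolve_parquet_files(
    root_dir="./ragtest",  # hypothetical project root
    config=config,         # a GraphRagConfig loaded elsewhere
    parquet_list=[
        "create_final_nodes.parquet",
        "create_final_entities.parquet",
    ],
    optional_list=["create_final_covariates.parquet"],
)

# Keys are file names without the .parquet extension; optional tables missing
# from storage come back as None instead of raising.
final_nodes = dataframe_dict["create_final_nodes"]
final_covariates = dataframe_dict["create_final_covariates"]  # may be None
```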

pyproject.toml

@@ -1,266 +1,266 @@
[tool.poetry]
name = "graphrag"
# Maintainers: do not change the version here manually, use ./scripts/release.sh
-version = "0.3.2"
+version = "0.3.3"
description = ""
authors = [
    "Alonso Guevara Fernández <alonsog@microsoft.com>",
    "Andrés Morales Esquivel <andresmor@microsoft.com>",
    "Chris Trevino <chtrevin@microsoft.com>",
    "David Tittsworth <datittsw@microsoft.com>",
    "Dayenne de Souza <ddesouza@microsoft.com>",
    "Derek Worthen <deworthe@microsoft.com>",
    "Gaudy Blanco Meneses <gaudyb@microsoft.com>",
    "Ha Trinh <trinhha@microsoft.com>",
    "Jonathan Larson <jolarso@microsoft.com>",
    "Josh Bradley <joshbradley@microsoft.com>",
    "Kate Lytvynets <kalytv@microsoft.com>",
    "Kenny Zhang <zhangken@microsoft.com>",
    "Mónica Carvajal",
    "Nathan Evans <naevans@microsoft.com>",
    "Rodrigo Racanicci <rracanicci@microsoft.com>",
    "Sarah Smith <smithsarah@microsoft.com>",
]
license = "MIT"
readme = "README.md"
packages = [{ include = "graphrag" }]

[tool.poetry.urls]
"Source" = "https://github.com/microsoft/graphrag"

[tool.poetry-dynamic-versioning]
enable = true
style = "pep440"
vcs = "git"
bump = true
format-jinja = """
{%- if distance == 0 -%}
{{ serialize_pep440(base, stage, revision) }}
{%- else -%}
{{ serialize_pep440(base, stage, revision, dev=distance) }}
{%- endif -%}
"""

[tool.poetry.dependencies]
python = ">=3.10,<3.13"
environs = "^11.0.0"
datashaper = "^0.0.49"

# Vector Stores
azure-search-documents = "^11.4.0"
lancedb = "^0.12.0"

# Event Loops
uvloop = { version = "^0.20.0", markers = "platform_system != 'Windows'" }
nest-asyncio = { version = "^1.6.0", markers = "platform_system == 'Windows'" }

# Async IO
aiolimiter = "^1.1.0"
aiofiles = "^24.1.0"

# LLM
openai = "^1.37.1"
nltk = "3.9.1"
tiktoken = "^0.7.0"

# Data-Sci
numba = "0.60.0"
numpy = "^1.25.2"
graspologic = "^3.4.1"
networkx = "^3"
fastparquet = "^2024.2.0"
# 1.13.0 was a footgun
scipy = "1.12.0"

# Configuration
pyyaml = "^6.0.2"
pyaml-env = "^1.2.1"
python-dotenv = "^1.0.0"

# Network
tenacity = "^9.0.0"

swifter = "^1.4.0"
pydantic = "^2"
rich = "^13.6.0"
textual = "^0.78.0"
devtools = "^0.12.2"

typing-extensions = "^4.12.2"

#Azure
azure-storage-blob = "^12.22.0"
azure-identity = "^1.17.1"
json-repair = "^0.28.4"

future = "^1.0.0"

[tool.poetry.group.dev.dependencies]
coverage = "^7.6.0"
ipykernel = "^6.29.4"
jupyter = "^1.0.0"
nbconvert = "^7.16.3"
poethepoet = "^0.27.0"
pyright = "^1.1.371"
pytest = "^8.3.2"
pytest-asyncio = "^0.24.0"
pytest-timeout = "^2.3.1"
ruff = "^0.6.2"
semversioner = "^2.0.3"
update-toml = "^0.2.1"

[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
build-backend = "poetry_dynamic_versioning.backend"

[tool.poe.tasks]
_sort_imports = "ruff check --select I --fix . --preview"
_format_code = "ruff format . --preview"
_ruff_check = 'ruff check . --preview'
_pyright = "pyright"
_convert_local_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/local_search.ipynb'
_convert_global_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/global_search.ipynb'
_semversioner_release = "semversioner release"
_semversioner_changelog = "semversioner changelog > CHANGELOG.md"
_semversioner_update_toml_version = "update-toml update --path tool.poetry.version --value $(poetry run semversioner current-version)"
semversioner_add = "semversioner add-change"
coverage_report = 'coverage report --omit "**/tests/**" --show-missing'
check_format = 'ruff format . --check --preview'
fix = "ruff --preview check --fix ."
fix_unsafe = "ruff check --preview --fix --unsafe-fixes ."

_test_all = "coverage run -m pytest ./tests"
test_unit = "pytest ./tests/unit"
test_integration = "pytest ./tests/integration"
test_smoke = "pytest ./tests/smoke"
test_notebook = "pytest ./tests/notebook"
index = "python -m graphrag.index"
query = "python -m graphrag.query"
prompt_tune = "python -m graphrag.prompt_tune"
# Pass in a test pattern
test_only = "pytest -s -k"

[[tool.poe.tasks.release]]
sequence = [
    '_semversioner_release',
    '_semversioner_changelog',
    '_semversioner_update_toml_version',
]
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.convert_docsite_notebooks]]
sequence = ['_convert_local_search_nb', '_convert_global_search_nb']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.format]]
sequence = ['_sort_imports', '_format_code']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.check]]
sequence = ['check_format', '_ruff_check', '_pyright']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.test]]
sequence = ['_test_all', 'coverage_report']
ignore_fail = 'return_non_zero'

[tool.ruff]
target-version = "py310"
extend-include = ["*.ipynb"]

[tool.ruff.format]
docstring-code-format = true
docstring-code-line-length = 20

[tool.ruff.lint]
select = [
    "E4",
    "E7",
    "E9",
    "W291",
    "YTT",
    "T10",
    "ICN",
    "INP",
    "Q",
    "RSE",
    "SLOT",
    "INT",
    "FLY",
    "LOG",
    "C90",
    "T20",
    "D",
    "RET",
    "PD",
    "N",
    "PIE",
    "SIM",
    "S",
    "G",
    "ERA",
    "ASYNC",
    "TID",
    "UP",
    "SLF",
    "BLE",
    "C4",
    "I",
    "F",
    "A",
    "ARG",
    "PTH",
    "RUF",
    "B",
    "TCH",
    "DTZ",
    "PYI",
    "PT",
    "EM",
    "TRY",
    "PERF",
    "CPY",
    # "FBT", # use named arguments for boolean flags
    # "TD", # todos
    # "FIX", # fixme
    # "FURB" # preview rules
    # ANN # Type annotations, re-enable when we get bandwidth
]
ignore = [
    # Ignore module names shadowing Python builtins
    "A005",
    # Deprecated Rules
    "ANN101",
    "ANN102",
    # Conflicts with interface argument checking
    "ARG002",
    "ANN204",
    # TODO: Inspect these pandas rules for validity
    "PD002", # prevents inplace=True
    # TODO RE-Enable when we get bandwidth
    "PERF203", # Needs restructuring of errors, we should bail-out on first error
    "C901", # needs refactoring to remove cyclomatic complexity
]

[tool.ruff.lint.per-file-ignores]
"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
"graphrag/index/config/*" = ["TCH"]
"*.ipynb" = ["T201"]

[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["input", "id", "bytes"]

[tool.ruff.lint.pydocstyle]
convention = "numpy"

# https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file
[tool.pyright]
include = ["graphrag", "tests", "examples", "examples_notebooks"]
exclude = ["**/node_modules", "**/__pycache__"]

[tool.pytest.ini_options]
asyncio_mode = "auto"
timeout = 800
# log_cli = true
# log_cli_level = "INFO"