Merge branch 'main' into incremental_indexing/main

Commit 67f4b02ecd by Alonso Guevara, 2024-09-10 16:04:01 -06:00
18 changed files with 511 additions and 434 deletions

.semversioner/0.3.3.json (new file)

@@ -0,0 +1,66 @@
{
    "changes": [
        {
            "description": "Add entrypoints for incremental indexing",
            "type": "patch"
        },
        {
            "description": "Clean up and organize run index code",
            "type": "patch"
        },
        {
            "description": "Consistent config loading. Resolves #99 and Resolves #1049",
            "type": "patch"
        },
        {
            "description": "Fix circular dependency when running prompt tune api directly",
            "type": "patch"
        },
        {
            "description": "Fix default settings for embedding",
            "type": "patch"
        },
        {
            "description": "Fix img for auto tune",
            "type": "patch"
        },
        {
            "description": "Fix img width",
            "type": "patch"
        },
        {
            "description": "Fixed a bug in prompt tuning process",
            "type": "patch"
        },
        {
            "description": "Refactor text unit build at local search",
            "type": "patch"
        },
        {
            "description": "Update Prompt Tuning docs",
            "type": "patch"
        },
        {
            "description": "Update create_pipeline_config.py",
            "type": "patch"
        },
        {
            "description": "Update prompt tune command in docs",
            "type": "patch"
        },
        {
            "description": "add querying from azure blob storage",
            "type": "patch"
        },
        {
            "description": "fix setting base_dir to full paths when not using file system.",
            "type": "patch"
        },
        {
            "description": "fix strategy config in entity_extraction",
            "type": "patch"
        }
    ],
    "created_at": "2024-09-10T19:51:24+00:00",
    "version": "0.3.3"
}
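
The release file above is the consolidated semversioner record: a `changes` array of `{"type", "description"}` entries plus `created_at` and `version`. A minimal sketch (plain Python, not semversioner's or graphrag's own code) of reading it and rendering the same bullet format that appears in CHANGELOG.md below:

```python
import json
from pathlib import Path

# Load the release record added in this commit and print changelog-style bullets.
release = json.loads(Path(".semversioner/0.3.3.json").read_text())

print(f"## {release['version']}")
print()
for change in release["changes"]:
    # e.g. "- patch: Add entrypoints for incremental indexing"
    print(f"- {change['type']}: {change['description']}")
```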

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "fix strategy config in entity_extraction"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fixed a bug in prompt tuning process"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix default settings for embedding"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Refactor text unit build at local search"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix circular dependency when running prompt tune api directly"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Update Prompt Tuning docs"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Update prompt tune command in docs"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix img for auto tune"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix img width"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Consistent config loading. Resolves #99 and Resolves #1049"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Add entrypoints for incremental indexing"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Clean up and organize run index code"
}

@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "fix setting base_dir to full paths when not using file system."
}

CHANGELOG.md

@@ -1,88 +1,106 @@
# Changelog
Note: version releases in the 0.x.y range may introduce breaking changes.
+
+## 0.3.3
+
+- patch: Add entrypoints for incremental indexing
+- patch: Clean up and organize run index code
+- patch: Consistent config loading. Resolves #99 and Resolves #1049
+- patch: Fix circular dependency when running prompt tune api directly
+- patch: Fix default settings for embedding
+- patch: Fix img for auto tune
+- patch: Fix img width
+- patch: Fixed a bug in prompt tuning process
+- patch: Refactor text unit build at local search
+- patch: Update Prompt Tuning docs
+- patch: Update create_pipeline_config.py
+- patch: Update prompt tune command in docs
+- patch: add querying from azure blob storage
+- patch: fix setting base_dir to full paths when not using file system.
+- patch: fix strategy config in entity_extraction

## 0.3.2

- patch: Add context data to query API responses.
- patch: Add missing config parameter documentation for prompt tuning
- patch: Add neo4j community notebook
- patch: Ensure entity types to be str when running prompt tuning
- patch: Fix weight casting during graph extraction
- patch: Patch "past" dependency issues
- patch: Update developer guide.
- patch: Update query type hints.
- patch: change-lancedb-placement

## 0.3.1

- patch: Add preflight check to check LLM connectivity.
- patch: Add streaming support for local/global search to query cli
- patch: Add support for both float and int on schema validation for community report generation
- patch: Avoid running index on gh-pages publishing
- patch: Implement Index API
- patch: Improves filtering for data dir inferring
- patch: Update to nltk 3.9.1

## 0.3.0

- minor: Implement auto templating API.
- minor: Implement query engine API.
- patch: Fix file dumps using json for non ASCII chars
- patch: Stabilize smoke tests for query context building
- patch: fix query embedding
- patch: fix sort_context & max_tokens params in verb

## 0.2.2

- patch: Add a check if there is no community record added in local search context
- patch: Add sepparate workflow for Python Tests
- patch: Docs updates
- patch: Run smoke tests on 4o

## 0.2.1

- patch: Added default columns for vector store at create_pipeline_config. No change for other cases.
- patch: Change json parsing error in the map step of global search to warning
- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config
- patch: Fix json parsing when LLM returns faulty responses
- patch: Fix missing community reports and refactor community context builder
- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.
- patch: Try parsing json before even repairing
- patch: Update Prompt Tuning meta prompts with finer examples
- patch: Update default entity extraction and gleaning prompts to reduce hallucinations
- patch: add encoding-model to entity/claim extraction config
- patch: add encoding-model to text chunking config
- patch: add user prompt to history-tracking llm
- patch: update config reader to allow for zero gleans
- patch: update config-reader to allow for empty chunk-by arrays
- patch: update history-tracking LLm to use 'assistant' instead of 'system' in output history.
- patch: use history argument in hash key computation; add history input to cache data

## 0.2.0

- minor: Add content-based KNN for selecting prompt tune few shot examples
- minor: Add dynamic community report rating to the prompt tuning engine
- patch: Add Minute-based Rate Limiting and fix rpm, tpm settings
- patch: Add N parameter support
- patch: Add cli flag to overlay default values onto a provided config.
- patch: Add exception handling on file load
- patch: Add language support to prompt tuning
- patch: Add llm params to local and global search
- patch: Fix broken prompt tuning link on docs
- patch: Fix delta none on query calls
- patch: Fix docsite base url
- patch: Fix encoding model parameter on prompt tune
- patch: Fix for --limit exceeding the dataframe length
- patch: Fix for Ruff 0.5.2
- patch: Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM
- patch: Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls
- patch: fix community_report doesn't work in settings.yaml
- patch: fix llm response content is None in query
- patch: fix the organization parameter is ineffective during queries
- patch: remove duplicate file read
- patch: support non-open ai model config to prompt tune
- patch: use binary io processing for all file io operations

## 0.1.0

- minor: Initial Release

@@ -274,7 +274,7 @@ def _get_embedding_settings(
    #
    strategy = settings.resolved_strategy()  # get the default strategy
    strategy.update({
-        "vector_store": {**vector_store_settings, **(vector_store_params or {})}
+        "vector_store": {**(vector_store_params or {}), **vector_store_settings}
    })  # update the default strategy with the vector store settings
    # This ensures the vector store config is part of the strategy and not the global config
    return {
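
The one-line change above flips the order of the dict unpacks, so on duplicate keys the resolved vector store settings now take precedence over caller-supplied `vector_store_params` (previously the params won). A minimal sketch of the underlying Python behavior, using made-up keys rather than graphrag's real settings:

```python
# In a dict literal, later ** unpacks override earlier ones on key collisions.
settings = {"overwrite": True, "collection": "entities"}  # hypothetical resolved settings
params = {"overwrite": False}                             # hypothetical caller params

before = {**settings, **params}  # params unpacked last  -> params win
after = {**params, **settings}   # settings unpacked last -> settings win

print(before["overwrite"], after["overwrite"])  # False True
```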

@@ -9,8 +9,14 @@ from pathlib import Path
import pandas as pd

-from graphrag.config import load_config, resolve_path
+from graphrag.config import (
+    GraphRagConfig,
+    load_config,
+    resolve_path,
+)
+from graphrag.index.create_pipeline_config import create_pipeline_config
from graphrag.index.progress import PrintProgressReporter
+from graphrag.utils.storage import _create_storage, _load_table_from_storage

from . import api
@@ -36,17 +42,21 @@ def run_global_search(
    if data_dir:
        config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_nodes.parquet"
-    )
-    final_entities: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_entities.parquet"
-    )
-    final_community_reports: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_entities.parquet",
+            "create_final_community_reports.parquet",
+        ],
+        optional_list=[],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]

    # call the Query API
    if streaming:
@@ -112,23 +122,26 @@ def run_local_search(
    if data_dir:
        config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet")
-    final_community_reports = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
-    final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet")
-    final_relationships = pd.read_parquet(
-        data_path / "create_final_relationships.parquet"
-    )
-    final_entities = pd.read_parquet(data_path / "create_final_entities.parquet")
-    final_covariates_path = data_path / "create_final_covariates.parquet"
-    final_covariates = (
-        pd.read_parquet(final_covariates_path)
-        if final_covariates_path.exists()
-        else None
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_community_reports.parquet",
+            "create_final_text_units.parquet",
+            "create_final_relationships.parquet",
+            "create_final_entities.parquet",
+        ],
+        optional_list=["create_final_covariates.parquet"],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
+    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_covariates: pd.DataFrame | None = dataframe_dict["create_final_covariates"]

    # call the Query API
    if streaming:
@@ -179,3 +192,35 @@ def run_local_search(
    # NOTE: we return the response and context data here purely as a complete demonstration of the API.
    # External users should use the API directly to get the response and context data.
    return response, context_data
+
+
+def _resolve_parquet_files(
+    root_dir: str,
+    config: GraphRagConfig,
+    parquet_list: list[str],
+    optional_list: list[str],
+) -> dict[str, pd.DataFrame]:
+    """Read parquet files to a dataframe dict."""
+    dataframe_dict = {}
+    pipeline_config = create_pipeline_config(config)
+    storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage)
+    for parquet_file in parquet_list:
+        df_key = parquet_file.split(".")[0]
+        df_value = asyncio.run(
+            _load_table_from_storage(name=parquet_file, storage=storage_obj)
+        )
+        dataframe_dict[df_key] = df_value
+
+    # for optional parquet files, set the dict entry to None instead of erroring out if it does not exist
+    for optional_file in optional_list:
+        file_exists = asyncio.run(storage_obj.has(optional_file))
+        df_key = optional_file.split(".")[0]
+        if file_exists:
+            df_value = asyncio.run(
+                _load_table_from_storage(name=optional_file, storage=storage_obj)
+            )
+            dataframe_dict[df_key] = df_value
+        else:
+            dataframe_dict[df_key] = None
+
+    return dataframe_dict
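
A hypothetical usage sketch for the new `_resolve_parquet_files` helper, mirroring the calls made by `run_global_search` and `run_local_search` above. Here `config` stands in for a `GraphRagConfig` produced by graphrag's config loading (elided rather than guessing its exact signature), and the `./ragtest` root is a placeholder:

```python
# Illustrative only: load required tables, treat covariates as optional.
dataframe_dict = _resolve_parquet_files(
    root_dir="./ragtest",  # hypothetical project root
    config=config,         # a GraphRagConfig loaded elsewhere
    parquet_list=[
        "create_final_nodes.parquet",
        "create_final_entities.parquet",
    ],
    optional_list=["create_final_covariates.parquet"],
)

# Keys are file names without the .parquet extension; optional tables missing
# from storage come back as None instead of raising.
final_nodes = dataframe_dict["create_final_nodes"]
final_covariates = dataframe_dict["create_final_covariates"]  # may be None
```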

pyproject.toml

@@ -1,266 +1,266 @@
[tool.poetry]
name = "graphrag"
# Maintainers: do not change the version here manually, use ./scripts/release.sh
-version = "0.3.2"
+version = "0.3.3"
description = ""
authors = [
    "Alonso Guevara Fernández <alonsog@microsoft.com>",
    "Andrés Morales Esquivel <andresmor@microsoft.com>",
    "Chris Trevino <chtrevin@microsoft.com>",
    "David Tittsworth <datittsw@microsoft.com>",
    "Dayenne de Souza <ddesouza@microsoft.com>",
    "Derek Worthen <deworthe@microsoft.com>",
    "Gaudy Blanco Meneses <gaudyb@microsoft.com>",
    "Ha Trinh <trinhha@microsoft.com>",
    "Jonathan Larson <jolarso@microsoft.com>",
    "Josh Bradley <joshbradley@microsoft.com>",
    "Kate Lytvynets <kalytv@microsoft.com>",
    "Kenny Zhang <zhangken@microsoft.com>",
    "Mónica Carvajal",
    "Nathan Evans <naevans@microsoft.com>",
    "Rodrigo Racanicci <rracanicci@microsoft.com>",
    "Sarah Smith <smithsarah@microsoft.com>",
]
license = "MIT"
readme = "README.md"
packages = [{ include = "graphrag" }]

[tool.poetry.urls]
"Source" = "https://github.com/microsoft/graphrag"

[tool.poetry-dynamic-versioning]
enable = true
style = "pep440"
vcs = "git"
bump = true
format-jinja = """
{%- if distance == 0 -%}
{{ serialize_pep440(base, stage, revision) }}
{%- else -%}
{{ serialize_pep440(base, stage, revision, dev=distance) }}
{%- endif -%}
"""

[tool.poetry.dependencies]
python = ">=3.10,<3.13"
environs = "^11.0.0"
datashaper = "^0.0.49"

# Vector Stores
azure-search-documents = "^11.4.0"
lancedb = "^0.12.0"

# Event Loops
uvloop = { version = "^0.20.0", markers = "platform_system != 'Windows'" }
nest-asyncio = { version = "^1.6.0", markers = "platform_system == 'Windows'" }

# Async IO
aiolimiter = "^1.1.0"
aiofiles = "^24.1.0"

# LLM
openai = "^1.37.1"
nltk = "3.9.1"
tiktoken = "^0.7.0"

# Data-Sci
numba = "0.60.0"
numpy = "^1.25.2"
graspologic = "^3.4.1"
networkx = "^3"
fastparquet = "^2024.2.0"
# 1.13.0 was a footgun
scipy = "1.12.0"

# Configuration
pyyaml = "^6.0.2"
pyaml-env = "^1.2.1"
python-dotenv = "^1.0.0"

# Network
tenacity = "^9.0.0"

swifter = "^1.4.0"
pydantic = "^2"
rich = "^13.6.0"
textual = "^0.78.0"
devtools = "^0.12.2"

typing-extensions = "^4.12.2"

#Azure
azure-storage-blob = "^12.22.0"
azure-identity = "^1.17.1"
json-repair = "^0.28.4"

future = "^1.0.0"

[tool.poetry.group.dev.dependencies]
coverage = "^7.6.0"
ipykernel = "^6.29.4"
jupyter = "^1.0.0"
nbconvert = "^7.16.3"
poethepoet = "^0.27.0"
pyright = "^1.1.371"
pytest = "^8.3.2"
pytest-asyncio = "^0.24.0"
pytest-timeout = "^2.3.1"
ruff = "^0.6.2"
semversioner = "^2.0.3"
update-toml = "^0.2.1"

[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
build-backend = "poetry_dynamic_versioning.backend"

[tool.poe.tasks]
_sort_imports = "ruff check --select I --fix . --preview"
_format_code = "ruff format . --preview"
_ruff_check = 'ruff check . --preview'
_pyright = "pyright"
_convert_local_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/local_search.ipynb'
_convert_global_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/global_search.ipynb'
_semversioner_release = "semversioner release"
_semversioner_changelog = "semversioner changelog > CHANGELOG.md"
_semversioner_update_toml_version = "update-toml update --path tool.poetry.version --value $(poetry run semversioner current-version)"
semversioner_add = "semversioner add-change"
coverage_report = 'coverage report --omit "**/tests/**" --show-missing'
check_format = 'ruff format . --check --preview'
fix = "ruff --preview check --fix ."
fix_unsafe = "ruff check --preview --fix --unsafe-fixes ."

_test_all = "coverage run -m pytest ./tests"
test_unit = "pytest ./tests/unit"
test_integration = "pytest ./tests/integration"
test_smoke = "pytest ./tests/smoke"
test_notebook = "pytest ./tests/notebook"
index = "python -m graphrag.index"
query = "python -m graphrag.query"
prompt_tune = "python -m graphrag.prompt_tune"
# Pass in a test pattern
test_only = "pytest -s -k"

[[tool.poe.tasks.release]]
sequence = [
    '_semversioner_release',
    '_semversioner_changelog',
    '_semversioner_update_toml_version',
]
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.convert_docsite_notebooks]]
sequence = ['_convert_local_search_nb', '_convert_global_search_nb']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.format]]
sequence = ['_sort_imports', '_format_code']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.check]]
sequence = ['check_format', '_ruff_check', '_pyright']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.test]]
sequence = ['_test_all', 'coverage_report']
ignore_fail = 'return_non_zero'

[tool.ruff]
target-version = "py310"
extend-include = ["*.ipynb"]

[tool.ruff.format]
docstring-code-format = true
docstring-code-line-length = 20

[tool.ruff.lint]
select = [
    "E4",
    "E7",
    "E9",
    "W291",
    "YTT",
    "T10",
    "ICN",
    "INP",
    "Q",
    "RSE",
    "SLOT",
    "INT",
    "FLY",
    "LOG",
    "C90",
    "T20",
    "D",
    "RET",
    "PD",
    "N",
    "PIE",
    "SIM",
    "S",
    "G",
    "ERA",
    "ASYNC",
    "TID",
    "UP",
    "SLF",
    "BLE",
    "C4",
    "I",
    "F",
    "A",
    "ARG",
    "PTH",
    "RUF",
    "B",
    "TCH",
    "DTZ",
    "PYI",
    "PT",
    "EM",
    "TRY",
    "PERF",
    "CPY",
    # "FBT", # use named arguments for boolean flags
    # "TD", # todos
    # "FIX", # fixme
    # "FURB" # preview rules
    # ANN # Type annotations, re-enable when we get bandwidth
]
ignore = [
    # Ignore module names shadowing Python builtins
    "A005",
    # Deprecated Rules
    "ANN101",
    "ANN102",
    # Conflicts with interface argument checking
    "ARG002",
    "ANN204",
    # TODO: Inspect these pandas rules for validity
    "PD002", # prevents inplace=True
    # TODO RE-Enable when we get bandwidth
    "PERF203", # Needs restructuring of errors, we should bail-out on first error
    "C901", # needs refactoring to remove cyclomatic complexity
]

[tool.ruff.lint.per-file-ignores]
"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
"graphrag/index/config/*" = ["TCH"]
"*.ipynb" = ["T201"]

[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["input", "id", "bytes"]

[tool.ruff.lint.pydocstyle]
convention = "numpy"

# https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file
[tool.pyright]
include = ["graphrag", "tests", "examples", "examples_notebooks"]
exclude = ["**/node_modules", "**/__pycache__"]

[tool.pytest.ini_options]
asyncio_mode = "auto"
timeout = 800
# log_cli = true
# log_cli_level = "INFO"