mirror of https://github.com/microsoft/graphrag.git
synced 2025-12-14 16:47:18 +00:00

Merge branch 'main' into incremental_indexing/main

commit 67f4b02ecd
.semversioner/0.3.3.json | 66 (new file)
@@ -0,0 +1,66 @@
+{
+    "changes": [
+        {
+            "description": "Add entrypoints for incremental indexing",
+            "type": "patch"
+        },
+        {
+            "description": "Clean up and organize run index code",
+            "type": "patch"
+        },
+        {
+            "description": "Consistent config loading. Resolves #99 and Resolves #1049",
+            "type": "patch"
+        },
+        {
+            "description": "Fix circular dependency when running prompt tune api directly",
+            "type": "patch"
+        },
+        {
+            "description": "Fix default settings for embedding",
+            "type": "patch"
+        },
+        {
+            "description": "Fix img for auto tune",
+            "type": "patch"
+        },
+        {
+            "description": "Fix img width",
+            "type": "patch"
+        },
+        {
+            "description": "Fixed a bug in prompt tuning process",
+            "type": "patch"
+        },
+        {
+            "description": "Refactor text unit build at local search",
+            "type": "patch"
+        },
+        {
+            "description": "Update Prompt Tuning docs",
+            "type": "patch"
+        },
+        {
+            "description": "Update create_pipeline_config.py",
+            "type": "patch"
+        },
+        {
+            "description": "Update prompt tune command in docs",
+            "type": "patch"
+        },
+        {
+            "description": "add querying from azure blob storage",
+            "type": "patch"
+        },
+        {
+            "description": "fix setting base_dir to full paths when not using file system.",
+            "type": "patch"
+        },
+        {
+            "description": "fix strategy config in entity_extraction",
+            "type": "patch"
+        }
+    ],
+    "created_at": "2024-09-10T19:51:24+00:00",
+    "version": "0.3.3"
+}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "fix strategy config in entity_extraction"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fixed a bug in prompt tuning process"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix default settings for embedding"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Refactor text unit build at local search"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix circular dependency when running prompt tune api directly"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Update Prompt Tuning docs"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Update prompt tune command in docs"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix img for auto tune"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix img width"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Consistent config loading. Resolves #99 and Resolves #1049"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Add entrypoints for incremental indexing"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Clean up and organize run index code"
-}

@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "fix setting base_dir to full paths when not using file system."
-}
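The hunks above delete the pending one-change JSON files that semversioner accumulated before this release; their contents are exactly what was rolled up into .semversioner/0.3.3.json. A minimal Python sketch of that roll-up step, assuming the pending files live in .semversioner/next-release/ (the function name and layout are illustrative, not semversioner's actual code):

import json
from datetime import datetime, timezone
from pathlib import Path


def roll_up_changes(semver_dir: Path, version: str) -> None:
    """Collect pending change files into one release manifest, then remove them."""
    next_release = semver_dir / "next-release"  # assumed location of pending change files
    change_files = sorted(next_release.glob("*.json"))
    changes = [json.loads(f.read_text()) for f in change_files]
    manifest = {
        "changes": sorted(changes, key=lambda c: c["description"]),
        "created_at": datetime.now(timezone.utc).isoformat(),
        "version": version,
    }
    (semver_dir / f"{version}.json").write_text(json.dumps(manifest, indent=4))
    for f in change_files:
        f.unlink()  # mirrors the file deletions shown in this commit

# roll_up_changes(Path(".semversioner"), "0.3.3") would produce a manifest like the one above.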
CHANGELOG.md | 194
@@ -1,88 +1,106 @@
# Changelog
Note: version releases in the 0.x.y range may introduce breaking changes.

+## 0.3.3
+
+- patch: Add entrypoints for incremental indexing
+- patch: Clean up and organize run index code
+- patch: Consistent config loading. Resolves #99 and Resolves #1049
+- patch: Fix circular dependency when running prompt tune api directly
+- patch: Fix default settings for embedding
+- patch: Fix img for auto tune
+- patch: Fix img width
+- patch: Fixed a bug in prompt tuning process
+- patch: Refactor text unit build at local search
+- patch: Update Prompt Tuning docs
+- patch: Update create_pipeline_config.py
+- patch: Update prompt tune command in docs
+- patch: add querying from azure blob storage
+- patch: fix setting base_dir to full paths when not using file system.
+- patch: fix strategy config in entity_extraction
+
## 0.3.2

- patch: Add context data to query API responses.
- patch: Add missing config parameter documentation for prompt tuning
- patch: Add neo4j community notebook
- patch: Ensure entity types to be str when running prompt tuning
- patch: Fix weight casting during graph extraction
- patch: Patch "past" dependency issues
- patch: Update developer guide.
- patch: Update query type hints.
- patch: change-lancedb-placement

## 0.3.1

- patch: Add preflight check to check LLM connectivity.
- patch: Add streaming support for local/global search to query cli
- patch: Add support for both float and int on schema validation for community report generation
- patch: Avoid running index on gh-pages publishing
- patch: Implement Index API
- patch: Improves filtering for data dir inferring
- patch: Update to nltk 3.9.1

## 0.3.0

- minor: Implement auto templating API.
- minor: Implement query engine API.
- patch: Fix file dumps using json for non ASCII chars
- patch: Stabilize smoke tests for query context building
- patch: fix query embedding
- patch: fix sort_context & max_tokens params in verb

## 0.2.2

- patch: Add a check if there is no community record added in local search context
- patch: Add sepparate workflow for Python Tests
- patch: Docs updates
- patch: Run smoke tests on 4o

## 0.2.1

- patch: Added default columns for vector store at create_pipeline_config. No change for other cases.
- patch: Change json parsing error in the map step of global search to warning
- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config
- patch: Fix json parsing when LLM returns faulty responses
- patch: Fix missing community reports and refactor community context builder
- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.
- patch: Try parsing json before even repairing
- patch: Update Prompt Tuning meta prompts with finer examples
- patch: Update default entity extraction and gleaning prompts to reduce hallucinations
- patch: add encoding-model to entity/claim extraction config
- patch: add encoding-model to text chunking config
- patch: add user prompt to history-tracking llm
- patch: update config reader to allow for zero gleans
- patch: update config-reader to allow for empty chunk-by arrays
- patch: update history-tracking LLm to use 'assistant' instead of 'system' in output history.
- patch: use history argument in hash key computation; add history input to cache data

## 0.2.0

- minor: Add content-based KNN for selecting prompt tune few shot examples
- minor: Add dynamic community report rating to the prompt tuning engine
- patch: Add Minute-based Rate Limiting and fix rpm, tpm settings
- patch: Add N parameter support
- patch: Add cli flag to overlay default values onto a provided config.
- patch: Add exception handling on file load
- patch: Add language support to prompt tuning
- patch: Add llm params to local and global search
- patch: Fix broken prompt tuning link on docs
- patch: Fix delta none on query calls
- patch: Fix docsite base url
- patch: Fix encoding model parameter on prompt tune
- patch: Fix for --limit exceeding the dataframe length
- patch: Fix for Ruff 0.5.2
- patch: Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM
- patch: Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls
- patch: fix community_report doesn't work in settings.yaml
- patch: fix llm response content is None in query
- patch: fix the organization parameter is ineffective during queries
- patch: remove duplicate file read
- patch: support non-open ai model config to prompt tune
- patch: use binary io processing for all file io operations

## 0.1.0

- minor: Initial Release
@@ -274,7 +274,7 @@ def _get_embedding_settings(
    #
    strategy = settings.resolved_strategy()  # get the default strategy
    strategy.update({
-        "vector_store": {**vector_store_settings, **(vector_store_params or {})}
+        "vector_store": {**(vector_store_params or {}), **vector_store_settings}
    })  # update the default strategy with the vector store settings
    # This ensures the vector store config is part of the strategy and not the global config
    return {
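The only change in this hunk is the order of the dict unpackings: with {**a, **b}, keys from b overwrite duplicate keys from a, so the swap makes the resolved vector store settings win over any caller-supplied parameters instead of the other way round. A small sketch of that precedence rule, using hypothetical values rather than graphrag's real settings:

# Later unpacking wins: the right-hand dict overrides duplicate keys.
vector_store_settings = {"type": "lancedb", "overwrite": True}        # hypothetical resolved settings
vector_store_params = {"overwrite": False, "collection": "entities"}  # hypothetical per-call overrides

old_merge = {**vector_store_settings, **(vector_store_params or {})}
new_merge = {**(vector_store_params or {}), **vector_store_settings}

assert old_merge["overwrite"] is False  # params used to take precedence
assert new_merge["overwrite"] is True   # settings take precedence after this change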
@@ -9,8 +9,14 @@ from pathlib import Path

import pandas as pd

-from graphrag.config import load_config, resolve_path
+from graphrag.config import (
+    GraphRagConfig,
+    load_config,
+    resolve_path,
+)
+from graphrag.index.create_pipeline_config import create_pipeline_config
from graphrag.index.progress import PrintProgressReporter
+from graphrag.utils.storage import _create_storage, _load_table_from_storage

from . import api
@@ -36,17 +42,21 @@ def run_global_search(
    if data_dir:
        config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_nodes.parquet"
-    )
-    final_entities: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_entities.parquet"
-    )
-    final_community_reports: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_entities.parquet",
+            "create_final_community_reports.parquet",
+        ],
+        optional_list=[],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]

    # call the Query API
    if streaming:
@@ -112,23 +122,26 @@ def run_local_search(
    if data_dir:
        config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet")
-    final_community_reports = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
-    final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet")
-    final_relationships = pd.read_parquet(
-        data_path / "create_final_relationships.parquet"
-    )
-    final_entities = pd.read_parquet(data_path / "create_final_entities.parquet")
-    final_covariates_path = data_path / "create_final_covariates.parquet"
-    final_covariates = (
-        pd.read_parquet(final_covariates_path)
-        if final_covariates_path.exists()
-        else None
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_community_reports.parquet",
+            "create_final_text_units.parquet",
+            "create_final_relationships.parquet",
+            "create_final_entities.parquet",
+        ],
+        optional_list=["create_final_covariates.parquet"],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
+    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_covariates: pd.DataFrame | None = dataframe_dict["create_final_covariates"]

    # call the Query API
    if streaming:
@@ -179,3 +192,35 @@
    # NOTE: we return the response and context data here purely as a complete demonstration of the API.
    # External users should use the API directly to get the response and context data.
    return response, context_data
+
+
+def _resolve_parquet_files(
+    root_dir: str,
+    config: GraphRagConfig,
+    parquet_list: list[str],
+    optional_list: list[str],
+) -> dict[str, pd.DataFrame]:
+    """Read parquet files to a dataframe dict."""
+    dataframe_dict = {}
+    pipeline_config = create_pipeline_config(config)
+    storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage)
+    for parquet_file in parquet_list:
+        df_key = parquet_file.split(".")[0]
+        df_value = asyncio.run(
+            _load_table_from_storage(name=parquet_file, storage=storage_obj)
+        )
+        dataframe_dict[df_key] = df_value
+
+    # for optional parquet files, set the dict entry to None instead of erroring out if it does not exist
+    for optional_file in optional_list:
+        file_exists = asyncio.run(storage_obj.has(optional_file))
+        df_key = optional_file.split(".")[0]
+        if file_exists:
+            df_value = asyncio.run(
+                _load_table_from_storage(name=optional_file, storage=storage_obj)
+            )
+            dataframe_dict[df_key] = df_value
+        else:
+            dataframe_dict[df_key] = None
+
+    return dataframe_dict
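The new helper's contract, per its own comment, is that every name in parquet_list must load successfully, while names in optional_list resolve to None when the file is absent. A small standalone sketch of that behavior, with a plain dict standing in for the storage object (illustrative only; the real code goes through _create_storage and _load_table_from_storage):

import pandas as pd

# Stand-in "storage": table name -> DataFrame. Purely illustrative.
fake_storage = {
    "create_final_nodes.parquet": pd.DataFrame({"id": [1, 2]}),
    "create_final_entities.parquet": pd.DataFrame({"name": ["a", "b"]}),
}

def resolve_tables(storage, required, optional):
    """Required tables must exist; optional tables resolve to None when absent."""
    tables = {}
    for name in required:
        tables[name.split(".")[0]] = storage[name]  # raises KeyError if missing
    for name in optional:
        tables[name.split(".")[0]] = storage.get(name)  # None if missing
    return tables

tables = resolve_tables(
    fake_storage,
    required=["create_final_nodes.parquet", "create_final_entities.parquet"],
    optional=["create_final_covariates.parquet"],
)
assert tables["create_final_covariates"] is None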
pyproject.toml | 532
@@ -1,266 +1,266 @@
[tool.poetry]
name = "graphrag"
# Maintainers: do not change the version here manually, use ./scripts/release.sh
-version = "0.3.2"
+version = "0.3.3"
description = ""
authors = [
    "Alonso Guevara Fernández <alonsog@microsoft.com>",
    "Andrés Morales Esquivel <andresmor@microsoft.com>",
    "Chris Trevino <chtrevin@microsoft.com>",
    "David Tittsworth <datittsw@microsoft.com>",
    "Dayenne de Souza <ddesouza@microsoft.com>",
    "Derek Worthen <deworthe@microsoft.com>",
    "Gaudy Blanco Meneses <gaudyb@microsoft.com>",
    "Ha Trinh <trinhha@microsoft.com>",
    "Jonathan Larson <jolarso@microsoft.com>",
    "Josh Bradley <joshbradley@microsoft.com>",
    "Kate Lytvynets <kalytv@microsoft.com>",
    "Kenny Zhang <zhangken@microsoft.com>",
    "Mónica Carvajal",
    "Nathan Evans <naevans@microsoft.com>",
    "Rodrigo Racanicci <rracanicci@microsoft.com>",
    "Sarah Smith <smithsarah@microsoft.com>",
]
license = "MIT"
readme = "README.md"
packages = [{ include = "graphrag" }]

[tool.poetry.urls]
"Source" = "https://github.com/microsoft/graphrag"

[tool.poetry-dynamic-versioning]
enable = true
style = "pep440"
vcs = "git"
bump = true
format-jinja = """
{%- if distance == 0 -%}
{{ serialize_pep440(base, stage, revision) }}
{%- else -%}
{{ serialize_pep440(base, stage, revision, dev=distance) }}
{%- endif -%}
"""

[tool.poetry.dependencies]
python = ">=3.10,<3.13"
environs = "^11.0.0"
datashaper = "^0.0.49"

# Vector Stores
azure-search-documents = "^11.4.0"
lancedb = "^0.12.0"

# Event Loops
uvloop = { version = "^0.20.0", markers = "platform_system != 'Windows'" }
nest-asyncio = { version = "^1.6.0", markers = "platform_system == 'Windows'" }

# Async IO
aiolimiter = "^1.1.0"
aiofiles = "^24.1.0"

# LLM
openai = "^1.37.1"
nltk = "3.9.1"
tiktoken = "^0.7.0"

# Data-Sci
numba = "0.60.0"
numpy = "^1.25.2"
graspologic = "^3.4.1"
networkx = "^3"
fastparquet = "^2024.2.0"
# 1.13.0 was a footgun
scipy = "1.12.0"

# Configuration
pyyaml = "^6.0.2"
pyaml-env = "^1.2.1"
python-dotenv = "^1.0.0"

# Network
tenacity = "^9.0.0"

swifter = "^1.4.0"
pydantic = "^2"
rich = "^13.6.0"
textual = "^0.78.0"
devtools = "^0.12.2"

typing-extensions = "^4.12.2"

#Azure
azure-storage-blob = "^12.22.0"
azure-identity = "^1.17.1"
json-repair = "^0.28.4"

future = "^1.0.0"
[tool.poetry.group.dev.dependencies]
coverage = "^7.6.0"
ipykernel = "^6.29.4"
jupyter = "^1.0.0"
nbconvert = "^7.16.3"
poethepoet = "^0.27.0"
pyright = "^1.1.371"
pytest = "^8.3.2"
pytest-asyncio = "^0.24.0"
pytest-timeout = "^2.3.1"
ruff = "^0.6.2"
semversioner = "^2.0.3"

update-toml = "^0.2.1"

[build-system]
requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
build-backend = "poetry_dynamic_versioning.backend"

[tool.poe.tasks]
_sort_imports = "ruff check --select I --fix . --preview"
_format_code = "ruff format . --preview"
_ruff_check = 'ruff check . --preview'
_pyright = "pyright"
_convert_local_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/local_search.ipynb'
_convert_global_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/global_search.ipynb'
_semversioner_release = "semversioner release"
_semversioner_changelog = "semversioner changelog > CHANGELOG.md"
_semversioner_update_toml_version = "update-toml update --path tool.poetry.version --value $(poetry run semversioner current-version)"
semversioner_add = "semversioner add-change"
coverage_report = 'coverage report --omit "**/tests/**" --show-missing'
check_format = 'ruff format . --check --preview'
fix = "ruff --preview check --fix ."
fix_unsafe = "ruff check --preview --fix --unsafe-fixes ."

_test_all = "coverage run -m pytest ./tests"
test_unit = "pytest ./tests/unit"
test_integration = "pytest ./tests/integration"
test_smoke = "pytest ./tests/smoke"
test_notebook = "pytest ./tests/notebook"
index = "python -m graphrag.index"
query = "python -m graphrag.query"
prompt_tune = "python -m graphrag.prompt_tune"
# Pass in a test pattern
test_only = "pytest -s -k"

[[tool.poe.tasks.release]]
sequence = [
    '_semversioner_release',
    '_semversioner_changelog',
    '_semversioner_update_toml_version',
]
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.convert_docsite_notebooks]]
sequence = ['_convert_local_search_nb', '_convert_global_search_nb']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.format]]
sequence = ['_sort_imports', '_format_code']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.check]]
sequence = ['check_format', '_ruff_check', '_pyright']
ignore_fail = 'return_non_zero'

[[tool.poe.tasks.test]]
sequence = ['_test_all', 'coverage_report']
ignore_fail = 'return_non_zero'

[tool.ruff]
target-version = "py310"
extend-include = ["*.ipynb"]

[tool.ruff.format]
docstring-code-format = true
docstring-code-line-length = 20

[tool.ruff.lint]
select = [
    "E4",
    "E7",
    "E9",
    "W291",
    "YTT",
    "T10",
    "ICN",
    "INP",
    "Q",
    "RSE",
    "SLOT",
    "INT",
    "FLY",
    "LOG",
    "C90",
    "T20",
    "D",
    "RET",
    "PD",
    "N",
    "PIE",
    "SIM",
    "S",
    "G",
    "ERA",
    "ASYNC",
    "TID",
    "UP",
    "SLF",
    "BLE",
    "C4",
    "I",
    "F",
    "A",
    "ARG",
    "PTH",
    "RUF",
    "B",
    "TCH",
    "DTZ",
    "PYI",
    "PT",
    "EM",
    "TRY",
    "PERF",
    "CPY",
    # "FBT", # use named arguments for boolean flags
    # "TD", # todos
    # "FIX", # fixme
    # "FURB" # preview rules
    # ANN # Type annotations, re-enable when we get bandwidth
]
ignore = [
    # Ignore module names shadowing Python builtins
    "A005",
    # Deprecated Rules
    "ANN101",
    "ANN102",
    # Conflicts with interface argument checking
    "ARG002",
    "ANN204",
    # TODO: Inspect these pandas rules for validity
    "PD002", # prevents inplace=True
    # TODO RE-Enable when we get bandwidth
    "PERF203", # Needs restructuring of errors, we should bail-out on first error
    "C901", # needs refactoring to remove cyclomatic complexity
]

[tool.ruff.lint.per-file-ignores]
"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
"graphrag/index/config/*" = ["TCH"]
"*.ipynb" = ["T201"]

[tool.ruff.lint.flake8-builtins]
builtins-ignorelist = ["input", "id", "bytes"]

[tool.ruff.lint.pydocstyle]
convention = "numpy"

# https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file
[tool.pyright]
include = ["graphrag", "tests", "examples", "examples_notebooks"]
exclude = ["**/node_modules", "**/__pycache__"]

[tool.pytest.ini_options]
asyncio_mode = "auto"
timeout = 800
# log_cli = true
# log_cli_level = "INFO"