diff --git a/.semversioner/0.3.3.json b/.semversioner/0.3.3.json new file mode 100644 index 00000000..99726199 --- /dev/null +++ b/.semversioner/0.3.3.json @@ -0,0 +1,66 @@ +{ + "changes": [ + { + "description": "Add entrypoints for incremental indexing", + "type": "patch" + }, + { + "description": "Clean up and organize run index code", + "type": "patch" + }, + { + "description": "Consistent config loading. Resolves #99 and Resolves #1049", + "type": "patch" + }, + { + "description": "Fix circular dependency when running prompt tune api directly", + "type": "patch" + }, + { + "description": "Fix default settings for embedding", + "type": "patch" + }, + { + "description": "Fix img for auto tune", + "type": "patch" + }, + { + "description": "Fix img width", + "type": "patch" + }, + { + "description": "Fixed a bug in prompt tuning process", + "type": "patch" + }, + { + "description": "Refactor text unit build at local search", + "type": "patch" + }, + { + "description": "Update Prompt Tuning docs", + "type": "patch" + }, + { + "description": "Update create_pipeline_config.py", + "type": "patch" + }, + { + "description": "Update prompt tune command in docs", + "type": "patch" + }, + { + "description": "add querying from azure blob storage", + "type": "patch" + }, + { + "description": "fix setting base_dir to full paths when not using file system.", + "type": "patch" + }, + { + "description": "fix strategy config in entity_extraction", + "type": "patch" + } + ], + "created_at": "2024-09-10T19:51:24+00:00", + "version": "0.3.3" +} \ No newline at end of file diff --git a/.semversioner/next-release/patch-20240712071506108985.json b/.semversioner/next-release/patch-20240712071506108985.json deleted file mode 100644 index ac0891c7..00000000 --- a/.semversioner/next-release/patch-20240712071506108985.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "fix strategy config in entity_extraction" -} diff --git a/.semversioner/next-release/patch-20240814063732868394.json b/.semversioner/next-release/patch-20240814063732868394.json deleted file mode 100644 index 78d68523..00000000 --- a/.semversioner/next-release/patch-20240814063732868394.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Fixed a bug in prompt tuning process" -} diff --git a/.semversioner/next-release/patch-20240827203354884800.json b/.semversioner/next-release/patch-20240827203354884800.json deleted file mode 100644 index 71f905c6..00000000 --- a/.semversioner/next-release/patch-20240827203354884800.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Fix default settings for embedding" -} diff --git a/.semversioner/next-release/patch-20240827212041426794.json b/.semversioner/next-release/patch-20240827212041426794.json deleted file mode 100644 index f3646013..00000000 --- a/.semversioner/next-release/patch-20240827212041426794.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Refactor text unit build at local search" -} diff --git a/.semversioner/next-release/patch-20240829175336332224.json b/.semversioner/next-release/patch-20240829175336332224.json deleted file mode 100644 index d850036b..00000000 --- a/.semversioner/next-release/patch-20240829175336332224.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Fix circular dependency when running prompt tune api directly" -} diff --git a/.semversioner/next-release/patch-20240829213842840703.json b/.semversioner/next-release/patch-20240829213842840703.json deleted file 
mode 100644 index f39b6254..00000000 --- a/.semversioner/next-release/patch-20240829213842840703.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Update Prompt Tuning docs" -} diff --git a/.semversioner/next-release/patch-20240829222117086645.json b/.semversioner/next-release/patch-20240829222117086645.json deleted file mode 100644 index 7204eb51..00000000 --- a/.semversioner/next-release/patch-20240829222117086645.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Update prompt tune command in docs" -} diff --git a/.semversioner/next-release/patch-20240829223855375571.json b/.semversioner/next-release/patch-20240829223855375571.json deleted file mode 100644 index 941b36f8..00000000 --- a/.semversioner/next-release/patch-20240829223855375571.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Fix img for auto tune" -} diff --git a/.semversioner/next-release/patch-20240829230018473667.json b/.semversioner/next-release/patch-20240829230018473667.json deleted file mode 100644 index e0a1d20c..00000000 --- a/.semversioner/next-release/patch-20240829230018473667.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Fix img width" -} diff --git a/.semversioner/next-release/patch-20240830151802543194.json b/.semversioner/next-release/patch-20240830151802543194.json deleted file mode 100644 index d7805109..00000000 --- a/.semversioner/next-release/patch-20240830151802543194.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Consistent config loading. Resolves #99 and Resolves #1049" -} diff --git a/.semversioner/next-release/patch-20240830181135475287.json b/.semversioner/next-release/patch-20240830181135475287.json deleted file mode 100644 index b9b3f16f..00000000 --- a/.semversioner/next-release/patch-20240830181135475287.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Add entrypoints for incremental indexing" -} diff --git a/.semversioner/next-release/patch-20240903205022597458.json b/.semversioner/next-release/patch-20240903205022597458.json deleted file mode 100644 index 7e43fdea..00000000 --- a/.semversioner/next-release/patch-20240903205022597458.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "Clean up and organize run index code" -} diff --git a/.semversioner/next-release/patch-20240904173227165702.json b/.semversioner/next-release/patch-20240904173227165702.json deleted file mode 100644 index 4010ac4f..00000000 --- a/.semversioner/next-release/patch-20240904173227165702.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "patch", - "description": "fix setting base_dir to full paths when not using file system." -} diff --git a/CHANGELOG.md b/CHANGELOG.md index 00ddb22b..9a237d30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,88 +1,106 @@ -# Changelog -Note: version releases in the 0.x.y range may introduce breaking changes. - -## 0.3.2 - -- patch: Add context data to query API responses. -- patch: Add missing config parameter documentation for prompt tuning -- patch: Add neo4j community notebook -- patch: Ensure entity types to be str when running prompt tuning -- patch: Fix weight casting during graph extraction -- patch: Patch "past" dependency issues -- patch: Update developer guide. -- patch: Update query type hints. -- patch: change-lancedb-placement - -## 0.3.1 - -- patch: Add preflight check to check LLM connectivity. 
-- patch: Add streaming support for local/global search to query cli -- patch: Add support for both float and int on schema validation for community report generation -- patch: Avoid running index on gh-pages publishing -- patch: Implement Index API -- patch: Improves filtering for data dir inferring -- patch: Update to nltk 3.9.1 - -## 0.3.0 - -- minor: Implement auto templating API. -- minor: Implement query engine API. -- patch: Fix file dumps using json for non ASCII chars -- patch: Stabilize smoke tests for query context building -- patch: fix query embedding -- patch: fix sort_context & max_tokens params in verb - -## 0.2.2 - -- patch: Add a check if there is no community record added in local search context -- patch: Add sepparate workflow for Python Tests -- patch: Docs updates -- patch: Run smoke tests on 4o - -## 0.2.1 - -- patch: Added default columns for vector store at create_pipeline_config. No change for other cases. -- patch: Change json parsing error in the map step of global search to warning -- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config -- patch: Fix json parsing when LLM returns faulty responses -- patch: Fix missing community reports and refactor community context builder -- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly. -- patch: Try parsing json before even repairing -- patch: Update Prompt Tuning meta prompts with finer examples -- patch: Update default entity extraction and gleaning prompts to reduce hallucinations -- patch: add encoding-model to entity/claim extraction config -- patch: add encoding-model to text chunking config -- patch: add user prompt to history-tracking llm -- patch: update config reader to allow for zero gleans -- patch: update config-reader to allow for empty chunk-by arrays -- patch: update history-tracking LLm to use 'assistant' instead of 'system' in output history. -- patch: use history argument in hash key computation; add history input to cache data - -## 0.2.0 - -- minor: Add content-based KNN for selecting prompt tune few shot examples -- minor: Add dynamic community report rating to the prompt tuning engine -- patch: Add Minute-based Rate Limiting and fix rpm, tpm settings -- patch: Add N parameter support -- patch: Add cli flag to overlay default values onto a provided config. -- patch: Add exception handling on file load -- patch: Add language support to prompt tuning -- patch: Add llm params to local and global search -- patch: Fix broken prompt tuning link on docs -- patch: Fix delta none on query calls -- patch: Fix docsite base url -- patch: Fix encoding model parameter on prompt tune -- patch: Fix for --limit exceeding the dataframe length -- patch: Fix for Ruff 0.5.2 -- patch: Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM -- patch: Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls -- patch: fix community_report doesn't work in settings.yaml -- patch: fix llm response content is None in query -- patch: fix the organization parameter is ineffective during queries -- patch: remove duplicate file read -- patch: support non-open ai model config to prompt tune -- patch: use binary io processing for all file io operations - -## 0.1.0 - -- minor: Initial Release +# Changelog +Note: version releases in the 0.x.y range may introduce breaking changes. 
+ +## 0.3.3 + +- patch: Add entrypoints for incremental indexing +- patch: Clean up and organize run index code +- patch: Consistent config loading. Resolves #99 and Resolves #1049 +- patch: Fix circular dependency when running prompt tune api directly +- patch: Fix default settings for embedding +- patch: Fix img for auto tune +- patch: Fix img width +- patch: Fixed a bug in prompt tuning process +- patch: Refactor text unit build at local search +- patch: Update Prompt Tuning docs +- patch: Update create_pipeline_config.py +- patch: Update prompt tune command in docs +- patch: add querying from azure blob storage +- patch: fix setting base_dir to full paths when not using file system. +- patch: fix strategy config in entity_extraction + +## 0.3.2 + +- patch: Add context data to query API responses. +- patch: Add missing config parameter documentation for prompt tuning +- patch: Add neo4j community notebook +- patch: Ensure entity types to be str when running prompt tuning +- patch: Fix weight casting during graph extraction +- patch: Patch "past" dependency issues +- patch: Update developer guide. +- patch: Update query type hints. +- patch: change-lancedb-placement + +## 0.3.1 + +- patch: Add preflight check to check LLM connectivity. +- patch: Add streaming support for local/global search to query cli +- patch: Add support for both float and int on schema validation for community report generation +- patch: Avoid running index on gh-pages publishing +- patch: Implement Index API +- patch: Improves filtering for data dir inferring +- patch: Update to nltk 3.9.1 + +## 0.3.0 + +- minor: Implement auto templating API. +- minor: Implement query engine API. +- patch: Fix file dumps using json for non ASCII chars +- patch: Stabilize smoke tests for query context building +- patch: fix query embedding +- patch: fix sort_context & max_tokens params in verb + +## 0.2.2 + +- patch: Add a check if there is no community record added in local search context +- patch: Add sepparate workflow for Python Tests +- patch: Docs updates +- patch: Run smoke tests on 4o + +## 0.2.1 + +- patch: Added default columns for vector store at create_pipeline_config. No change for other cases. +- patch: Change json parsing error in the map step of global search to warning +- patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config +- patch: Fix json parsing when LLM returns faulty responses +- patch: Fix missing community reports and refactor community context builder +- patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly. +- patch: Try parsing json before even repairing +- patch: Update Prompt Tuning meta prompts with finer examples +- patch: Update default entity extraction and gleaning prompts to reduce hallucinations +- patch: add encoding-model to entity/claim extraction config +- patch: add encoding-model to text chunking config +- patch: add user prompt to history-tracking llm +- patch: update config reader to allow for zero gleans +- patch: update config-reader to allow for empty chunk-by arrays +- patch: update history-tracking LLm to use 'assistant' instead of 'system' in output history. 
+- patch: use history argument in hash key computation; add history input to cache data + +## 0.2.0 + +- minor: Add content-based KNN for selecting prompt tune few shot examples +- minor: Add dynamic community report rating to the prompt tuning engine +- patch: Add Minute-based Rate Limiting and fix rpm, tpm settings +- patch: Add N parameter support +- patch: Add cli flag to overlay default values onto a provided config. +- patch: Add exception handling on file load +- patch: Add language support to prompt tuning +- patch: Add llm params to local and global search +- patch: Fix broken prompt tuning link on docs +- patch: Fix delta none on query calls +- patch: Fix docsite base url +- patch: Fix encoding model parameter on prompt tune +- patch: Fix for --limit exceeding the dataframe length +- patch: Fix for Ruff 0.5.2 +- patch: Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM +- patch: Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls +- patch: fix community_report doesn't work in settings.yaml +- patch: fix llm response content is None in query +- patch: fix the organization parameter is ineffective during queries +- patch: remove duplicate file read +- patch: support non-open ai model config to prompt tune +- patch: use binary io processing for all file io operations + +## 0.1.0 + +- minor: Initial Release diff --git a/graphrag/index/create_pipeline_config.py b/graphrag/index/create_pipeline_config.py index 22dba200..3743c91f 100644 --- a/graphrag/index/create_pipeline_config.py +++ b/graphrag/index/create_pipeline_config.py @@ -274,7 +274,7 @@ def _get_embedding_settings( # strategy = settings.resolved_strategy() # get the default strategy strategy.update({ - "vector_store": {**vector_store_settings, **(vector_store_params or {})} + "vector_store": {**(vector_store_params or {}), **vector_store_settings} }) # update the default strategy with the vector store settings # This ensures the vector store config is part of the strategy and not the global config return { diff --git a/graphrag/query/cli.py b/graphrag/query/cli.py index 3ac76f81..872a4d09 100644 --- a/graphrag/query/cli.py +++ b/graphrag/query/cli.py @@ -9,8 +9,14 @@ from pathlib import Path import pandas as pd -from graphrag.config import load_config, resolve_path +from graphrag.config import ( + GraphRagConfig, + load_config, + resolve_path, +) +from graphrag.index.create_pipeline_config import create_pipeline_config from graphrag.index.progress import PrintProgressReporter +from graphrag.utils.storage import _create_storage, _load_table_from_storage from . 
import api @@ -36,17 +42,21 @@ def run_global_search( if data_dir: config.storage.base_dir = str(resolve_path(data_dir, root)) - data_path = Path(config.storage.base_dir).resolve() - - final_nodes: pd.DataFrame = pd.read_parquet( - data_path / "create_final_nodes.parquet" - ) - final_entities: pd.DataFrame = pd.read_parquet( - data_path / "create_final_entities.parquet" - ) - final_community_reports: pd.DataFrame = pd.read_parquet( - data_path / "create_final_community_reports.parquet" + dataframe_dict = _resolve_parquet_files( + root_dir=root_dir, + config=config, + parquet_list=[ + "create_final_nodes.parquet", + "create_final_entities.parquet", + "create_final_community_reports.parquet", + ], + optional_list=[], ) + final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"] + final_entities: pd.DataFrame = dataframe_dict["create_final_entities"] + final_community_reports: pd.DataFrame = dataframe_dict[ + "create_final_community_reports" + ] # call the Query API if streaming: @@ -112,23 +122,26 @@ def run_local_search( if data_dir: config.storage.base_dir = str(resolve_path(data_dir, root)) - data_path = Path(config.storage.base_dir).resolve() - - final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet") - final_community_reports = pd.read_parquet( - data_path / "create_final_community_reports.parquet" - ) - final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet") - final_relationships = pd.read_parquet( - data_path / "create_final_relationships.parquet" - ) - final_entities = pd.read_parquet(data_path / "create_final_entities.parquet") - final_covariates_path = data_path / "create_final_covariates.parquet" - final_covariates = ( - pd.read_parquet(final_covariates_path) - if final_covariates_path.exists() - else None + dataframe_dict = _resolve_parquet_files( + root_dir=root_dir, + config=config, + parquet_list=[ + "create_final_nodes.parquet", + "create_final_community_reports.parquet", + "create_final_text_units.parquet", + "create_final_relationships.parquet", + "create_final_entities.parquet", + ], + optional_list=["create_final_covariates.parquet"], ) + final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"] + final_community_reports: pd.DataFrame = dataframe_dict[ + "create_final_community_reports" + ] + final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"] + final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"] + final_entities: pd.DataFrame = dataframe_dict["create_final_entities"] + final_covariates: pd.DataFrame | None = dataframe_dict["create_final_covariates"] # call the Query API if streaming: @@ -179,3 +192,35 @@ def run_local_search( # NOTE: we return the response and context data here purely as a complete demonstration of the API. # External users should use the API directly to get the response and context data. 
return response, context_data + + +def _resolve_parquet_files( + root_dir: str, + config: GraphRagConfig, + parquet_list: list[str], + optional_list: list[str], +) -> dict[str, pd.DataFrame]: + """Read parquet files to a dataframe dict.""" + dataframe_dict = {} + pipeline_config = create_pipeline_config(config) + storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage) + for parquet_file in parquet_list: + df_key = parquet_file.split(".")[0] + df_value = asyncio.run( + _load_table_from_storage(name=parquet_file, storage=storage_obj) + ) + dataframe_dict[df_key] = df_value + + # for optional parquet files, set the dict entry to None instead of erroring out if it does not exist + for optional_file in optional_list: + file_exists = asyncio.run(storage_obj.has(optional_file)) + df_key = optional_file.split(".")[0] + if file_exists: + df_value = asyncio.run( + _load_table_from_storage(name=optional_file, storage=storage_obj) + ) + dataframe_dict[df_key] = df_value + else: + dataframe_dict[df_key] = None + + return dataframe_dict diff --git a/pyproject.toml b/pyproject.toml index bcd7b997..d9ca81f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,266 +1,266 @@ -[tool.poetry] -name = "graphrag" -# Maintainers: do not change the version here manually, use ./scripts/release.sh -version = "0.3.2" -description = "" -authors = [ - "Alonso Guevara Fernández ", - "Andrés Morales Esquivel ", - "Chris Trevino ", - "David Tittsworth ", - "Dayenne de Souza ", - "Derek Worthen ", - "Gaudy Blanco Meneses ", - "Ha Trinh ", - "Jonathan Larson ", - "Josh Bradley ", - "Kate Lytvynets ", - "Kenny Zhang ", - "Mónica Carvajal", - "Nathan Evans ", - "Rodrigo Racanicci ", - "Sarah Smith ", -] -license = "MIT" -readme = "README.md" -packages = [{ include = "graphrag" }] - -[tool.poetry.urls] -"Source" = "https://github.com/microsoft/graphrag" - -[tool.poetry-dynamic-versioning] -enable = true -style = "pep440" -vcs = "git" -bump = true -format-jinja = """ - {%- if distance == 0 -%} - {{ serialize_pep440(base, stage, revision) }} - {%- else -%} - {{ serialize_pep440(base, stage, revision, dev=distance) }} - {%- endif -%} -""" - -[tool.poetry.dependencies] -python = ">=3.10,<3.13" -environs = "^11.0.0" -datashaper = "^0.0.49" - -# Vector Stores -azure-search-documents = "^11.4.0" -lancedb = "^0.12.0" - -# Event Loops -uvloop = { version = "^0.20.0", markers = "platform_system != 'Windows'" } -nest-asyncio = { version = "^1.6.0", markers = "platform_system == 'Windows'" } - -# Async IO -aiolimiter = "^1.1.0" -aiofiles = "^24.1.0" - -# LLM -openai = "^1.37.1" -nltk = "3.9.1" -tiktoken = "^0.7.0" - -# Data-Sci -numba = "0.60.0" -numpy = "^1.25.2" -graspologic = "^3.4.1" -networkx = "^3" -fastparquet = "^2024.2.0" -# 1.13.0 was a footgun -scipy = "1.12.0" - -# Configuration -pyyaml = "^6.0.2" -pyaml-env = "^1.2.1" -python-dotenv = "^1.0.0" - -# Network -tenacity = "^9.0.0" - -swifter = "^1.4.0" -pydantic = "^2" -rich = "^13.6.0" -textual = "^0.78.0" -devtools = "^0.12.2" - -typing-extensions = "^4.12.2" - -#Azure -azure-storage-blob = "^12.22.0" -azure-identity = "^1.17.1" -json-repair = "^0.28.4" - -future = "^1.0.0" -[tool.poetry.group.dev.dependencies] -coverage = "^7.6.0" -ipykernel = "^6.29.4" -jupyter = "^1.0.0" -nbconvert = "^7.16.3" -poethepoet = "^0.27.0" -pyright = "^1.1.371" -pytest = "^8.3.2" -pytest-asyncio = "^0.24.0" -pytest-timeout = "^2.3.1" -ruff = "^0.6.2" -semversioner = "^2.0.3" - -update-toml = "^0.2.1" - -[build-system] -requires = ["poetry-core>=1.0.0", 
"poetry-dynamic-versioning>=1.0.0,<2.0.0"] -build-backend = "poetry_dynamic_versioning.backend" - -[tool.poe.tasks] -_sort_imports = "ruff check --select I --fix . --preview" -_format_code = "ruff format . --preview" -_ruff_check = 'ruff check . --preview' -_pyright = "pyright" -_convert_local_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/local_search.ipynb' -_convert_global_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/global_search.ipynb' -_semversioner_release = "semversioner release" -_semversioner_changelog = "semversioner changelog > CHANGELOG.md" -_semversioner_update_toml_version = "update-toml update --path tool.poetry.version --value $(poetry run semversioner current-version)" -semversioner_add = "semversioner add-change" -coverage_report = 'coverage report --omit "**/tests/**" --show-missing' -check_format = 'ruff format . --check --preview' -fix = "ruff --preview check --fix ." -fix_unsafe = "ruff check --preview --fix --unsafe-fixes ." - -_test_all = "coverage run -m pytest ./tests" -test_unit = "pytest ./tests/unit" -test_integration = "pytest ./tests/integration" -test_smoke = "pytest ./tests/smoke" -test_notebook = "pytest ./tests/notebook" -index = "python -m graphrag.index" -query = "python -m graphrag.query" -prompt_tune = "python -m graphrag.prompt_tune" -# Pass in a test pattern -test_only = "pytest -s -k" - -[[tool.poe.tasks.release]] -sequence = [ - '_semversioner_release', - '_semversioner_changelog', - '_semversioner_update_toml_version', -] -ignore_fail = 'return_non_zero' - -[[tool.poe.tasks.convert_docsite_notebooks]] -sequence = ['_convert_local_search_nb', '_convert_global_search_nb'] -ignore_fail = 'return_non_zero' - -[[tool.poe.tasks.format]] -sequence = ['_sort_imports', '_format_code'] -ignore_fail = 'return_non_zero' - -[[tool.poe.tasks.check]] -sequence = ['check_format', '_ruff_check', '_pyright'] -ignore_fail = 'return_non_zero' - -[[tool.poe.tasks.test]] -sequence = ['_test_all', 'coverage_report'] -ignore_fail = 'return_non_zero' - -[tool.ruff] -target-version = "py310" -extend-include = ["*.ipynb"] - -[tool.ruff.format] -docstring-code-format = true -docstring-code-line-length = 20 - -[tool.ruff.lint] -select = [ - "E4", - "E7", - "E9", - "W291", - "YTT", - "T10", - "ICN", - "INP", - "Q", - "RSE", - "SLOT", - "INT", - "FLY", - "LOG", - "C90", - "T20", - "D", - "RET", - "PD", - "N", - "PIE", - "SIM", - "S", - "G", - "ERA", - "ASYNC", - "TID", - "UP", - "SLF", - "BLE", - "C4", - "I", - "F", - "A", - "ARG", - "PTH", - "RUF", - "B", - "TCH", - "DTZ", - "PYI", - "PT", - "EM", - "TRY", - "PERF", - "CPY", - # "FBT", # use named arguments for boolean flags - # "TD", # todos - # "FIX", # fixme - # "FURB" # preview rules - # ANN # Type annotations, re-enable when we get bandwidth -] -ignore = [ - # Ignore module names shadowing Python builtins - "A005", - # Deprecated Rules - "ANN101", - "ANN102", - # Conflicts with interface argument checking - "ARG002", - "ANN204", - # TODO: Inspect these pandas rules for validity - "PD002", # prevents inplace=True - # TODO RE-Enable when we get bandwidth - "PERF203", # Needs restructuring of errors, we should bail-out on first error - "C901", # needs refactoring to remove cyclomatic complexity -] - -[tool.ruff.lint.per-file-ignores] -"tests/*" = ["S", "D", "ANN", 
"T201", "ASYNC", "ARG", "PTH", "TRY"] -"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"] -"graphrag/index/config/*" = ["TCH"] -"*.ipynb" = ["T201"] - -[tool.ruff.lint.flake8-builtins] -builtins-ignorelist = ["input", "id", "bytes"] - -[tool.ruff.lint.pydocstyle] -convention = "numpy" - -# https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file -[tool.pyright] -include = ["graphrag", "tests", "examples", "examples_notebooks"] -exclude = ["**/node_modules", "**/__pycache__"] - -[tool.pytest.ini_options] -asyncio_mode = "auto" -timeout = 800 -# log_cli = true -# log_cli_level = "INFO" +[tool.poetry] +name = "graphrag" +# Maintainers: do not change the version here manually, use ./scripts/release.sh +version = "0.3.3" +description = "" +authors = [ + "Alonso Guevara Fernández ", + "Andrés Morales Esquivel ", + "Chris Trevino ", + "David Tittsworth ", + "Dayenne de Souza ", + "Derek Worthen ", + "Gaudy Blanco Meneses ", + "Ha Trinh ", + "Jonathan Larson ", + "Josh Bradley ", + "Kate Lytvynets ", + "Kenny Zhang ", + "Mónica Carvajal", + "Nathan Evans ", + "Rodrigo Racanicci ", + "Sarah Smith ", +] +license = "MIT" +readme = "README.md" +packages = [{ include = "graphrag" }] + +[tool.poetry.urls] +"Source" = "https://github.com/microsoft/graphrag" + +[tool.poetry-dynamic-versioning] +enable = true +style = "pep440" +vcs = "git" +bump = true +format-jinja = """ + {%- if distance == 0 -%} + {{ serialize_pep440(base, stage, revision) }} + {%- else -%} + {{ serialize_pep440(base, stage, revision, dev=distance) }} + {%- endif -%} +""" + +[tool.poetry.dependencies] +python = ">=3.10,<3.13" +environs = "^11.0.0" +datashaper = "^0.0.49" + +# Vector Stores +azure-search-documents = "^11.4.0" +lancedb = "^0.12.0" + +# Event Loops +uvloop = { version = "^0.20.0", markers = "platform_system != 'Windows'" } +nest-asyncio = { version = "^1.6.0", markers = "platform_system == 'Windows'" } + +# Async IO +aiolimiter = "^1.1.0" +aiofiles = "^24.1.0" + +# LLM +openai = "^1.37.1" +nltk = "3.9.1" +tiktoken = "^0.7.0" + +# Data-Sci +numba = "0.60.0" +numpy = "^1.25.2" +graspologic = "^3.4.1" +networkx = "^3" +fastparquet = "^2024.2.0" +# 1.13.0 was a footgun +scipy = "1.12.0" + +# Configuration +pyyaml = "^6.0.2" +pyaml-env = "^1.2.1" +python-dotenv = "^1.0.0" + +# Network +tenacity = "^9.0.0" + +swifter = "^1.4.0" +pydantic = "^2" +rich = "^13.6.0" +textual = "^0.78.0" +devtools = "^0.12.2" + +typing-extensions = "^4.12.2" + +#Azure +azure-storage-blob = "^12.22.0" +azure-identity = "^1.17.1" +json-repair = "^0.28.4" + +future = "^1.0.0" +[tool.poetry.group.dev.dependencies] +coverage = "^7.6.0" +ipykernel = "^6.29.4" +jupyter = "^1.0.0" +nbconvert = "^7.16.3" +poethepoet = "^0.27.0" +pyright = "^1.1.371" +pytest = "^8.3.2" +pytest-asyncio = "^0.24.0" +pytest-timeout = "^2.3.1" +ruff = "^0.6.2" +semversioner = "^2.0.3" + +update-toml = "^0.2.1" + +[build-system] +requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] +build-backend = "poetry_dynamic_versioning.backend" + +[tool.poe.tasks] +_sort_imports = "ruff check --select I --fix . --preview" +_format_code = "ruff format . --preview" +_ruff_check = 'ruff check . 
--preview' +_pyright = "pyright" +_convert_local_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/local_search.ipynb' +_convert_global_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/global_search.ipynb' +_semversioner_release = "semversioner release" +_semversioner_changelog = "semversioner changelog > CHANGELOG.md" +_semversioner_update_toml_version = "update-toml update --path tool.poetry.version --value $(poetry run semversioner current-version)" +semversioner_add = "semversioner add-change" +coverage_report = 'coverage report --omit "**/tests/**" --show-missing' +check_format = 'ruff format . --check --preview' +fix = "ruff --preview check --fix ." +fix_unsafe = "ruff check --preview --fix --unsafe-fixes ." + +_test_all = "coverage run -m pytest ./tests" +test_unit = "pytest ./tests/unit" +test_integration = "pytest ./tests/integration" +test_smoke = "pytest ./tests/smoke" +test_notebook = "pytest ./tests/notebook" +index = "python -m graphrag.index" +query = "python -m graphrag.query" +prompt_tune = "python -m graphrag.prompt_tune" +# Pass in a test pattern +test_only = "pytest -s -k" + +[[tool.poe.tasks.release]] +sequence = [ + '_semversioner_release', + '_semversioner_changelog', + '_semversioner_update_toml_version', +] +ignore_fail = 'return_non_zero' + +[[tool.poe.tasks.convert_docsite_notebooks]] +sequence = ['_convert_local_search_nb', '_convert_global_search_nb'] +ignore_fail = 'return_non_zero' + +[[tool.poe.tasks.format]] +sequence = ['_sort_imports', '_format_code'] +ignore_fail = 'return_non_zero' + +[[tool.poe.tasks.check]] +sequence = ['check_format', '_ruff_check', '_pyright'] +ignore_fail = 'return_non_zero' + +[[tool.poe.tasks.test]] +sequence = ['_test_all', 'coverage_report'] +ignore_fail = 'return_non_zero' + +[tool.ruff] +target-version = "py310" +extend-include = ["*.ipynb"] + +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 20 + +[tool.ruff.lint] +select = [ + "E4", + "E7", + "E9", + "W291", + "YTT", + "T10", + "ICN", + "INP", + "Q", + "RSE", + "SLOT", + "INT", + "FLY", + "LOG", + "C90", + "T20", + "D", + "RET", + "PD", + "N", + "PIE", + "SIM", + "S", + "G", + "ERA", + "ASYNC", + "TID", + "UP", + "SLF", + "BLE", + "C4", + "I", + "F", + "A", + "ARG", + "PTH", + "RUF", + "B", + "TCH", + "DTZ", + "PYI", + "PT", + "EM", + "TRY", + "PERF", + "CPY", + # "FBT", # use named arguments for boolean flags + # "TD", # todos + # "FIX", # fixme + # "FURB" # preview rules + # ANN # Type annotations, re-enable when we get bandwidth +] +ignore = [ + # Ignore module names shadowing Python builtins + "A005", + # Deprecated Rules + "ANN101", + "ANN102", + # Conflicts with interface argument checking + "ARG002", + "ANN204", + # TODO: Inspect these pandas rules for validity + "PD002", # prevents inplace=True + # TODO RE-Enable when we get bandwidth + "PERF203", # Needs restructuring of errors, we should bail-out on first error + "C901", # needs refactoring to remove cyclomatic complexity +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"] +"examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"] +"graphrag/index/config/*" = ["TCH"] +"*.ipynb" = ["T201"] + +[tool.ruff.lint.flake8-builtins] +builtins-ignorelist = ["input", "id", "bytes"] + 
+[tool.ruff.lint.pydocstyle] +convention = "numpy" + +# https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file +[tool.pyright] +include = ["graphrag", "tests", "examples", "examples_notebooks"] +exclude = ["**/node_modules", "**/__pycache__"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +timeout = 800 +# log_cli = true +# log_cli_level = "INFO"
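
For context on the `.semversioner` churn at the top of this diff: the `[[tool.poe.tasks.release]]` sequence in pyproject.toml runs `semversioner release` (which folds every pending `next-release/*.json` fragment into a single versioned file, here `0.3.3.json`, and deletes the fragments), then `semversioner changelog > CHANGELOG.md` (which regenerates the changelog wholesale from the release JSON, which is plausibly why CHANGELOG.md appears fully rewritten rather than patched), then `update-toml` (which bumps `tool.poetry.version` to match). Below is a minimal sketch of the fold step; it assumes nothing about semversioner's internals beyond the output shape visible above, and `fold_next_release` is a hypothetical name, not semversioner's API:

```python
import json
from datetime import datetime, timezone
from pathlib import Path


def fold_next_release(semver_dir: Path, version: str) -> None:
    """Rough sketch of what `semversioner release` produces: fold every
    pending next-release/*.json fragment into one <version>.json whose
    shape matches the 0.3.3.json shown in this diff."""
    fragments = sorted((semver_dir / "next-release").glob("*.json"))
    changes = []
    for fragment in fragments:
        data = json.loads(fragment.read_text())
        changes.append({"description": data["description"], "type": data["type"]})
    # 0.3.3.json lists changes sorted by description (uppercase before lowercase).
    changes.sort(key=lambda c: c["description"])
    release = {
        "changes": changes,
        "created_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "version": version,
    }
    (semver_dir / f"{version}.json").write_text(json.dumps(release, indent=4))
    for fragment in fragments:
        fragment.unlink()  # fragments are deleted once folded in, as in this diff
```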
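
The one-line change in `graphrag/index/create_pipeline_config.py` is subtle but load-bearing: in a dict display built with `**` unpacking, later entries win on duplicate keys, so swapping the operands flips precedence. After this change the resolved `vector_store_settings` override any colliding keys in the caller-supplied `vector_store_params`, rather than the other way around (likely the "Fix default settings for embedding" changelog entry). A toy illustration, with made-up keys and values:

```python
# Later ** unpackings overwrite earlier ones on key collisions.
vector_store_settings = {"type": "lancedb", "overwrite": True}      # illustrative values
vector_store_params = {"overwrite": False, "title_column": "name"}  # illustrative values

before = {**vector_store_settings, **(vector_store_params or {})}  # params won
after = {**(vector_store_params or {}), **vector_store_settings}   # settings win

assert before["overwrite"] is False
assert after["overwrite"] is True
assert after == {"type": "lancedb", "overwrite": True, "title_column": "name"}
```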
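
The `graphrag/query/cli.py` refactor replaces direct `pd.read_parquet` calls against a resolved local `data_path` with the new `_resolve_parquet_files` helper, which reads tables through the pipeline's storage abstraction (`_create_storage` / `_load_table_from_storage`). That indirection is what backs the "add querying from azure blob storage" changelog entry: the same code path now serves both file and blob storage. Required tables raise if missing, while optional ones (currently just covariates) come back as `None`. A hedged usage sketch, as if written inside cli.py; the project root value is an assumption:

```python
# Inside graphrag/query/cli.py, after `config` has been loaded:
dataframe_dict = _resolve_parquet_files(
    root_dir="./ragtest",  # hypothetical project root
    config=config,
    parquet_list=["create_final_nodes.parquet", "create_final_entities.parquet"],
    optional_list=["create_final_covariates.parquet"],
)

# Keys are the file names minus the .parquet suffix.
nodes = dataframe_dict["create_final_nodes"]            # pd.DataFrame
covariates = dataframe_dict["create_final_covariates"]  # pd.DataFrame or None
if covariates is None:
    print("no covariates table; claim extraction was likely disabled")
```

One design caveat worth noting: the helper wraps each load in `asyncio.run`, which spins up a fresh event loop per table and raises `RuntimeError` if invoked from code already running inside an event loop, so it suits the synchronous CLI entry points shown here rather than async callers.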