Mirror of https://github.com/microsoft/graphrag.git, synced 2025-12-10 14:31:15 +00:00

Commit 67f4b02ecd: Merge branch 'main' into incremental_indexing/main

.semversioner/0.3.3.json (new file, 66 lines)
@@ -0,0 +1,66 @@
+{
+    "changes": [
+        {
+            "description": "Add entrypoints for incremental indexing",
+            "type": "patch"
+        },
+        {
+            "description": "Clean up and organize run index code",
+            "type": "patch"
+        },
+        {
+            "description": "Consistent config loading. Resolves #99 and Resolves #1049",
+            "type": "patch"
+        },
+        {
+            "description": "Fix circular dependency when running prompt tune api directly",
+            "type": "patch"
+        },
+        {
+            "description": "Fix default settings for embedding",
+            "type": "patch"
+        },
+        {
+            "description": "Fix img for auto tune",
+            "type": "patch"
+        },
+        {
+            "description": "Fix img width",
+            "type": "patch"
+        },
+        {
+            "description": "Fixed a bug in prompt tuning process",
+            "type": "patch"
+        },
+        {
+            "description": "Refactor text unit build at local search",
+            "type": "patch"
+        },
+        {
+            "description": "Update Prompt Tuning docs",
+            "type": "patch"
+        },
+        {
+            "description": "Update create_pipeline_config.py",
+            "type": "patch"
+        },
+        {
+            "description": "Update prompt tune command in docs",
+            "type": "patch"
+        },
+        {
+            "description": "add querying from azure blob storage",
+            "type": "patch"
+        },
+        {
+            "description": "fix setting base_dir to full paths when not using file system.",
+            "type": "patch"
+        },
+        {
+            "description": "fix strategy config in entity_extraction",
+            "type": "patch"
+        }
+    ],
+    "created_at": "2024-09-10T19:51:24+00:00",
+    "version": "0.3.3"
+}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "fix strategy config in entity_extraction"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fixed a bug in prompt tuning process"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix default settings for embedding"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Refactor text unit build at local search"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix circular dependency when running prompt tune api directly"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Update Prompt Tuning docs"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Update prompt tune command in docs"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix img for auto tune"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Fix img width"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Consistent config loading. Resolves #99 and Resolves #1049"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Add entrypoints for incremental indexing"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "Clean up and organize run index code"
-}
@@ -1,4 +0,0 @@
-{
-    "type": "patch",
-    "description": "fix setting base_dir to full paths when not using file system."
-}
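The hunks above are the standard semversioner release flow: each deleted four-line JSON is one change file from `.semversioner/next-release/`, and the new `0.3.3.json` is their consolidation. A rough Python sketch of that consolidation step, assuming only the layout shown in this diff (the real tool also derives the next version number from the recorded change types):

import json
from datetime import datetime, timezone
from pathlib import Path


def consolidate(semver_dir: Path, version: str) -> None:
    """Merge .semversioner/next-release/*.json into .semversioner/<version>.json."""
    next_release = semver_dir / "next-release"
    change_files = sorted(next_release.glob("*.json"))
    changes = [json.loads(p.read_text()) for p in change_files]
    release = {
        "changes": sorted(changes, key=lambda c: c["description"]),
        "created_at": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "version": version,
    }
    (semver_dir / f"{version}.json").write_text(json.dumps(release, indent=4))
    for p in change_files:
        p.unlink()  # corresponds to the file deletions in the hunks above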
CHANGELOG.md (194 changed lines)
@@ -1,88 +1,106 @@
 # Changelog
 Note: version releases in the 0.x.y range may introduce breaking changes.

+## 0.3.3
+
+- patch: Add entrypoints for incremental indexing
+- patch: Clean up and organize run index code
+- patch: Consistent config loading. Resolves #99 and Resolves #1049
+- patch: Fix circular dependency when running prompt tune api directly
+- patch: Fix default settings for embedding
+- patch: Fix img for auto tune
+- patch: Fix img width
+- patch: Fixed a bug in prompt tuning process
+- patch: Refactor text unit build at local search
+- patch: Update Prompt Tuning docs
+- patch: Update create_pipeline_config.py
+- patch: Update prompt tune command in docs
+- patch: add querying from azure blob storage
+- patch: fix setting base_dir to full paths when not using file system.
+- patch: fix strategy config in entity_extraction
+
 ## 0.3.2

 - patch: Add context data to query API responses.
 - patch: Add missing config parameter documentation for prompt tuning
 - patch: Add neo4j community notebook
 - patch: Ensure entity types to be str when running prompt tuning
 - patch: Fix weight casting during graph extraction
 - patch: Patch "past" dependency issues
 - patch: Update developer guide.
 - patch: Update query type hints.
 - patch: change-lancedb-placement

 ## 0.3.1

 - patch: Add preflight check to check LLM connectivity.
 - patch: Add streaming support for local/global search to query cli
 - patch: Add support for both float and int on schema validation for community report generation
 - patch: Avoid running index on gh-pages publishing
 - patch: Implement Index API
 - patch: Improves filtering for data dir inferring
 - patch: Update to nltk 3.9.1

 ## 0.3.0

 - minor: Implement auto templating API.
 - minor: Implement query engine API.
 - patch: Fix file dumps using json for non ASCII chars
 - patch: Stabilize smoke tests for query context building
 - patch: fix query embedding
 - patch: fix sort_context & max_tokens params in verb

 ## 0.2.2

 - patch: Add a check if there is no community record added in local search context
 - patch: Add sepparate workflow for Python Tests
 - patch: Docs updates
 - patch: Run smoke tests on 4o

 ## 0.2.1

 - patch: Added default columns for vector store at create_pipeline_config. No change for other cases.
 - patch: Change json parsing error in the map step of global search to warning
 - patch: Fix Local Search breaking when loading Embeddings input. Defaulting overwrite to True as in the rest of the vector store config
 - patch: Fix json parsing when LLM returns faulty responses
 - patch: Fix missing community reports and refactor community context builder
 - patch: Fixed a bug that erased the vector database, added a new parameter to specify the config file path, and updated the documentation accordingly.
 - patch: Try parsing json before even repairing
 - patch: Update Prompt Tuning meta prompts with finer examples
 - patch: Update default entity extraction and gleaning prompts to reduce hallucinations
 - patch: add encoding-model to entity/claim extraction config
 - patch: add encoding-model to text chunking config
 - patch: add user prompt to history-tracking llm
 - patch: update config reader to allow for zero gleans
 - patch: update config-reader to allow for empty chunk-by arrays
 - patch: update history-tracking LLm to use 'assistant' instead of 'system' in output history.
 - patch: use history argument in hash key computation; add history input to cache data

 ## 0.2.0

 - minor: Add content-based KNN for selecting prompt tune few shot examples
 - minor: Add dynamic community report rating to the prompt tuning engine
 - patch: Add Minute-based Rate Limiting and fix rpm, tpm settings
 - patch: Add N parameter support
 - patch: Add cli flag to overlay default values onto a provided config.
 - patch: Add exception handling on file load
 - patch: Add language support to prompt tuning
 - patch: Add llm params to local and global search
 - patch: Fix broken prompt tuning link on docs
 - patch: Fix delta none on query calls
 - patch: Fix docsite base url
 - patch: Fix encoding model parameter on prompt tune
 - patch: Fix for --limit exceeding the dataframe length
 - patch: Fix for Ruff 0.5.2
 - patch: Fixed an issue where base OpenAI embeddings can't work with Azure OpenAI LLM
 - patch: Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls
 - patch: fix community_report doesn't work in settings.yaml
 - patch: fix llm response content is None in query
 - patch: fix the organization parameter is ineffective during queries
 - patch: remove duplicate file read
 - patch: support non-open ai model config to prompt tune
 - patch: use binary io processing for all file io operations

 ## 0.1.0

 - minor: Initial Release
@@ -274,7 +274,7 @@ def _get_embedding_settings(
     #
     strategy = settings.resolved_strategy()  # get the default strategy
     strategy.update({
-        "vector_store": {**vector_store_settings, **(vector_store_params or {})}
+        "vector_store": {**(vector_store_params or {}), **vector_store_settings}
     })  # update the default strategy with the vector store settings
     # This ensures the vector store config is part of the strategy and not the global config
     return {
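The one-line change above flips dict-merge precedence: in a Python dict literal, keys from later `**` unpacks win, so the resolved vector store settings now override caller-supplied params instead of the other way around. A minimal sketch with hypothetical values (the real dicts come from graphrag's config resolution):

# Hypothetical values chosen only to expose the precedence flip.
vector_store_settings = {"overwrite": True, "container_name": "default"}
vector_store_params = {"overwrite": False, "query_collection": "entities"}

# Before: params were unpacked last and silently overrode resolved settings.
before = {**vector_store_settings, **(vector_store_params or {})}
assert before["overwrite"] is False

# After: settings are unpacked last and win; params only contribute missing keys.
after = {**(vector_store_params or {}), **vector_store_settings}
assert after["overwrite"] is True
assert after["query_collection"] == "entities"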
@@ -9,8 +9,14 @@ from pathlib import Path

 import pandas as pd

-from graphrag.config import load_config, resolve_path
+from graphrag.config import (
+    GraphRagConfig,
+    load_config,
+    resolve_path,
+)
+from graphrag.index.create_pipeline_config import create_pipeline_config
 from graphrag.index.progress import PrintProgressReporter
+from graphrag.utils.storage import _create_storage, _load_table_from_storage

 from . import api

@@ -36,17 +42,21 @@ def run_global_search(
     if data_dir:
         config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_nodes.parquet"
-    )
-    final_entities: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_entities.parquet"
-    )
-    final_community_reports: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_entities.parquet",
+            "create_final_community_reports.parquet",
+        ],
+        optional_list=[],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]

     # call the Query API
     if streaming:
@@ -112,23 +122,26 @@ def run_local_search(
     if data_dir:
         config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet")
-    final_community_reports = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
-    final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet")
-    final_relationships = pd.read_parquet(
-        data_path / "create_final_relationships.parquet"
-    )
-    final_entities = pd.read_parquet(data_path / "create_final_entities.parquet")
-    final_covariates_path = data_path / "create_final_covariates.parquet"
-    final_covariates = (
-        pd.read_parquet(final_covariates_path)
-        if final_covariates_path.exists()
-        else None
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_community_reports.parquet",
+            "create_final_text_units.parquet",
+            "create_final_relationships.parquet",
+            "create_final_entities.parquet",
+        ],
+        optional_list=["create_final_covariates.parquet"],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
+    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_covariates: pd.DataFrame | None = dataframe_dict["create_final_covariates"]

     # call the Query API
     if streaming:
@@ -179,3 +192,35 @@ def run_local_search(
     # NOTE: we return the response and context data here purely as a complete demonstration of the API.
     # External users should use the API directly to get the response and context data.
     return response, context_data
+
+
+def _resolve_parquet_files(
+    root_dir: str,
+    config: GraphRagConfig,
+    parquet_list: list[str],
+    optional_list: list[str],
+) -> dict[str, pd.DataFrame]:
+    """Read parquet files to a dataframe dict."""
+    dataframe_dict = {}
+    pipeline_config = create_pipeline_config(config)
+    storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage)
+    for parquet_file in parquet_list:
+        df_key = parquet_file.split(".")[0]
+        df_value = asyncio.run(
+            _load_table_from_storage(name=parquet_file, storage=storage_obj)
+        )
+        dataframe_dict[df_key] = df_value
+
+    # for optional parquet files, set the dict entry to None instead of erroring out if it does not exist
+    for optional_file in optional_list:
+        file_exists = asyncio.run(storage_obj.has(optional_file))
+        df_key = optional_file.split(".")[0]
+        if file_exists:
+            df_value = asyncio.run(
+                _load_table_from_storage(name=optional_file, storage=storage_obj)
+            )
+            dataframe_dict[df_key] = df_value
+        else:
+            dataframe_dict[df_key] = None
+
+    return dataframe_dict
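For code outside this repo, the same required/optional pattern is easy to replicate with plain pandas. This standalone sketch assumes local parquet files under a data_dir rather than graphrag's storage abstraction (`_create_storage` and `_load_table_from_storage` are internal helpers), so the names here are illustrative only:

from pathlib import Path

import pandas as pd


def load_tables(
    data_dir: str,
    required: list[str],
    optional: list[str],
) -> dict[str, pd.DataFrame | None]:
    """Load required parquet tables; map missing optional ones to None."""
    base = Path(data_dir)
    tables: dict[str, pd.DataFrame | None] = {}
    for name in required:
        # Required tables raise FileNotFoundError when absent, like the helper above.
        tables[name.split(".")[0]] = pd.read_parquet(base / name)
    for name in optional:
        path = base / name
        tables[name.split(".")[0]] = pd.read_parquet(path) if path.exists() else None
    return tables


# e.g. load_tables("output/run/artifacts",
#                  required=["create_final_nodes.parquet"],
#                  optional=["create_final_covariates.parquet"])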
pyproject.toml (532 changed lines)
@@ -1,266 +1,266 @@
 [tool.poetry]
 name = "graphrag"
 # Maintainers: do not change the version here manually, use ./scripts/release.sh
-version = "0.3.2"
+version = "0.3.3"
 description = ""
 authors = [
     "Alonso Guevara Fernández <alonsog@microsoft.com>",
     "Andrés Morales Esquivel <andresmor@microsoft.com>",
     "Chris Trevino <chtrevin@microsoft.com>",
     "David Tittsworth <datittsw@microsoft.com>",
     "Dayenne de Souza <ddesouza@microsoft.com>",
     "Derek Worthen <deworthe@microsoft.com>",
     "Gaudy Blanco Meneses <gaudyb@microsoft.com>",
     "Ha Trinh <trinhha@microsoft.com>",
     "Jonathan Larson <jolarso@microsoft.com>",
     "Josh Bradley <joshbradley@microsoft.com>",
     "Kate Lytvynets <kalytv@microsoft.com>",
     "Kenny Zhang <zhangken@microsoft.com>",
     "Mónica Carvajal",
     "Nathan Evans <naevans@microsoft.com>",
     "Rodrigo Racanicci <rracanicci@microsoft.com>",
     "Sarah Smith <smithsarah@microsoft.com>",
 ]
 license = "MIT"
 readme = "README.md"
 packages = [{ include = "graphrag" }]

 [tool.poetry.urls]
 "Source" = "https://github.com/microsoft/graphrag"

 [tool.poetry-dynamic-versioning]
 enable = true
 style = "pep440"
 vcs = "git"
 bump = true
 format-jinja = """
 {%- if distance == 0 -%}
 {{ serialize_pep440(base, stage, revision) }}
 {%- else -%}
 {{ serialize_pep440(base, stage, revision, dev=distance) }}
 {%- endif -%}
 """

 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
 environs = "^11.0.0"
 datashaper = "^0.0.49"

 # Vector Stores
 azure-search-documents = "^11.4.0"
 lancedb = "^0.12.0"

 # Event Loops
 uvloop = { version = "^0.20.0", markers = "platform_system != 'Windows'" }
 nest-asyncio = { version = "^1.6.0", markers = "platform_system == 'Windows'" }

 # Async IO
 aiolimiter = "^1.1.0"
 aiofiles = "^24.1.0"

 # LLM
 openai = "^1.37.1"
 nltk = "3.9.1"
 tiktoken = "^0.7.0"

 # Data-Sci
 numba = "0.60.0"
 numpy = "^1.25.2"
 graspologic = "^3.4.1"
 networkx = "^3"
 fastparquet = "^2024.2.0"
 # 1.13.0 was a footgun
 scipy = "1.12.0"

 # Configuration
 pyyaml = "^6.0.2"
 pyaml-env = "^1.2.1"
 python-dotenv = "^1.0.0"

 # Network
 tenacity = "^9.0.0"

 swifter = "^1.4.0"
 pydantic = "^2"
 rich = "^13.6.0"
 textual = "^0.78.0"
 devtools = "^0.12.2"

 typing-extensions = "^4.12.2"

 #Azure
 azure-storage-blob = "^12.22.0"
 azure-identity = "^1.17.1"
 json-repair = "^0.28.4"

 future = "^1.0.0"
 [tool.poetry.group.dev.dependencies]
 coverage = "^7.6.0"
 ipykernel = "^6.29.4"
 jupyter = "^1.0.0"
 nbconvert = "^7.16.3"
 poethepoet = "^0.27.0"
 pyright = "^1.1.371"
 pytest = "^8.3.2"
 pytest-asyncio = "^0.24.0"
 pytest-timeout = "^2.3.1"
 ruff = "^0.6.2"
 semversioner = "^2.0.3"

 update-toml = "^0.2.1"

 [build-system]
 requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
 build-backend = "poetry_dynamic_versioning.backend"

 [tool.poe.tasks]
 _sort_imports = "ruff check --select I --fix . --preview"
 _format_code = "ruff format . --preview"
 _ruff_check = 'ruff check . --preview'
 _pyright = "pyright"
 _convert_local_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/local_search.ipynb'
 _convert_global_search_nb = 'jupyter nbconvert --output-dir=docsite/posts/query/notebooks/ --output="{notebook_name}_nb" --template=docsite/nbdocsite_template --to markdown examples_notebooks/global_search.ipynb'
 _semversioner_release = "semversioner release"
 _semversioner_changelog = "semversioner changelog > CHANGELOG.md"
 _semversioner_update_toml_version = "update-toml update --path tool.poetry.version --value $(poetry run semversioner current-version)"
 semversioner_add = "semversioner add-change"
 coverage_report = 'coverage report --omit "**/tests/**" --show-missing'
 check_format = 'ruff format . --check --preview'
 fix = "ruff --preview check --fix ."
 fix_unsafe = "ruff check --preview --fix --unsafe-fixes ."

 _test_all = "coverage run -m pytest ./tests"
 test_unit = "pytest ./tests/unit"
 test_integration = "pytest ./tests/integration"
 test_smoke = "pytest ./tests/smoke"
 test_notebook = "pytest ./tests/notebook"
 index = "python -m graphrag.index"
 query = "python -m graphrag.query"
 prompt_tune = "python -m graphrag.prompt_tune"
 # Pass in a test pattern
 test_only = "pytest -s -k"

 [[tool.poe.tasks.release]]
 sequence = [
     '_semversioner_release',
     '_semversioner_changelog',
     '_semversioner_update_toml_version',
 ]
 ignore_fail = 'return_non_zero'

 [[tool.poe.tasks.convert_docsite_notebooks]]
 sequence = ['_convert_local_search_nb', '_convert_global_search_nb']
 ignore_fail = 'return_non_zero'

 [[tool.poe.tasks.format]]
 sequence = ['_sort_imports', '_format_code']
 ignore_fail = 'return_non_zero'

 [[tool.poe.tasks.check]]
 sequence = ['check_format', '_ruff_check', '_pyright']
 ignore_fail = 'return_non_zero'

 [[tool.poe.tasks.test]]
 sequence = ['_test_all', 'coverage_report']
 ignore_fail = 'return_non_zero'

 [tool.ruff]
 target-version = "py310"
 extend-include = ["*.ipynb"]

 [tool.ruff.format]
 docstring-code-format = true
 docstring-code-line-length = 20

 [tool.ruff.lint]
 select = [
     "E4",
     "E7",
     "E9",
     "W291",
     "YTT",
     "T10",
     "ICN",
     "INP",
     "Q",
     "RSE",
     "SLOT",
     "INT",
     "FLY",
     "LOG",
     "C90",
     "T20",
     "D",
     "RET",
     "PD",
     "N",
     "PIE",
     "SIM",
     "S",
     "G",
     "ERA",
     "ASYNC",
     "TID",
     "UP",
     "SLF",
     "BLE",
     "C4",
     "I",
     "F",
     "A",
     "ARG",
     "PTH",
     "RUF",
     "B",
     "TCH",
     "DTZ",
     "PYI",
     "PT",
     "EM",
     "TRY",
     "PERF",
     "CPY",
     # "FBT", # use named arguments for boolean flags
     # "TD", # todos
     # "FIX", # fixme
     # "FURB" # preview rules
     # ANN # Type annotations, re-enable when we get bandwidth
 ]
 ignore = [
     # Ignore module names shadowing Python builtins
     "A005",
     # Deprecated Rules
     "ANN101",
     "ANN102",
     # Conflicts with interface argument checking
     "ARG002",
     "ANN204",
     # TODO: Inspect these pandas rules for validity
     "PD002", # prevents inplace=True
     # TODO RE-Enable when we get bandwidth
     "PERF203", # Needs restructuring of errors, we should bail-out on first error
     "C901", # needs refactoring to remove cyclomatic complexity
 ]

 [tool.ruff.lint.per-file-ignores]
 "tests/*" = ["S", "D", "ANN", "T201", "ASYNC", "ARG", "PTH", "TRY"]
 "examples/*" = ["S", "D", "ANN", "T201", "PTH", "TRY", "PERF"]
 "graphrag/index/config/*" = ["TCH"]
 "*.ipynb" = ["T201"]

 [tool.ruff.lint.flake8-builtins]
 builtins-ignorelist = ["input", "id", "bytes"]

 [tool.ruff.lint.pydocstyle]
 convention = "numpy"

 # https://github.com/microsoft/pyright/blob/9f81564a4685ff5c55edd3959f9b39030f590b2f/docs/configuration.md#sample-pyprojecttoml-file
 [tool.pyright]
 include = ["graphrag", "tests", "examples", "examples_notebooks"]
 exclude = ["**/node_modules", "**/__pycache__"]

 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 timeout = 800
 # log_cli = true
 # log_cli_level = "INFO"
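The `format-jinja` template in this file delegates to `serialize_pep440`, which poetry-dynamic-versioning exposes from dunamai. An illustrative sketch of the two template branches, assuming dunamai's `serialize_pep440` (the exact output with `bump = true` has more cases than shown here):

from dunamai import serialize_pep440

# distance == 0: the working tree sits exactly on a tag, so the template
# emits the bare base version.
print(serialize_pep440("0.3.3", None, None))  # 0.3.3

# distance == 4: four commits past the tag, so a .dev pre-release is emitted.
print(serialize_pep440("0.3.3", None, None, dev=4))  # 0.3.3.dev4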