Merge branch 'main' into incremental_indexing/main

Alonso Guevara 2024-09-10 16:04:01 -06:00
commit 67f4b02ecd
18 changed files with 511 additions and 434 deletions

.semversioner/0.3.3.json (new file)

@@ -0,0 +1,66 @@
{
"changes": [
{
"description": "Add entrypoints for incremental indexing",
"type": "patch"
},
{
"description": "Clean up and organize run index code",
"type": "patch"
},
{
"description": "Consistent config loading. Resolves #99 and Resolves #1049",
"type": "patch"
},
{
"description": "Fix circular dependency when running prompt tune api directly",
"type": "patch"
},
{
"description": "Fix default settings for embedding",
"type": "patch"
},
{
"description": "Fix img for auto tune",
"type": "patch"
},
{
"description": "Fix img width",
"type": "patch"
},
{
"description": "Fixed a bug in prompt tuning process",
"type": "patch"
},
{
"description": "Refactor text unit build at local search",
"type": "patch"
},
{
"description": "Update Prompt Tuning docs",
"type": "patch"
},
{
"description": "Update create_pipeline_config.py",
"type": "patch"
},
{
"description": "Update prompt tune command in docs",
"type": "patch"
},
{
"description": "add querying from azure blob storage",
"type": "patch"
},
{
"description": "fix setting base_dir to full paths when not using file system.",
"type": "patch"
},
{
"description": "fix strategy config in entity_extraction",
"type": "patch"
}
],
"created_at": "2024-09-10T19:51:24+00:00",
"version": "0.3.3"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "fix strategy config in entity_extraction"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fixed a bug in prompt tuning process"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix default settings for embedding"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Refactor text unit build at local search"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix circular dependency when running prompt tune api directly"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Update Prompt Tuning docs"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Update prompt tune command in docs"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix img for auto tune"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Fix img width"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Consistent config loading. Resolves #99 and Resolves #1049"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Add entrypoints for incremental indexing"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "Clean up and organize run index code"
}


@@ -1,4 +0,0 @@
{
"type": "patch",
"description": "fix setting base_dir to full paths when not using file system."
}


@@ -1,6 +1,24 @@
 # Changelog
 Note: version releases in the 0.x.y range may introduce breaking changes.

+## 0.3.3
+
+- patch: Add entrypoints for incremental indexing
+- patch: Clean up and organize run index code
+- patch: Consistent config loading. Resolves #99 and Resolves #1049
+- patch: Fix circular dependency when running prompt tune api directly
+- patch: Fix default settings for embedding
+- patch: Fix img for auto tune
+- patch: Fix img width
+- patch: Fixed a bug in prompt tuning process
+- patch: Refactor text unit build at local search
+- patch: Update Prompt Tuning docs
+- patch: Update create_pipeline_config.py
+- patch: Update prompt tune command in docs
+- patch: add querying from azure blob storage
+- patch: fix setting base_dir to full paths when not using file system.
+- patch: fix strategy config in entity_extraction
+
 ## 0.3.2
 - patch: Add context data to query API responses.


@@ -274,7 +274,7 @@ def _get_embedding_settings(
     #
     strategy = settings.resolved_strategy()  # get the default strategy
     strategy.update({
-        "vector_store": {**vector_store_settings, **(vector_store_params or {})}
+        "vector_store": {**(vector_store_params or {}), **vector_store_settings}
     })  # update the default strategy with the vector store settings
     # This ensures the vector store config is part of the strategy and not the global config
     return {
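For context on the one-line change above: in a Python dict literal, keys from later ** unpackings overwrite earlier ones, so swapping the merge order makes the resolved vector store settings take precedence over the params passed in, rather than the other way around. A quick illustrative sketch (the keys and values below are invented for the example, not taken from the repo):

# Later ** unpackings win, so merge order decides which side takes precedence.
vector_store_params = {"container_name": "default"}
vector_store_settings = {"type": "lancedb", "container_name": "entities"}

before = {**vector_store_settings, **(vector_store_params or {})}
after = {**(vector_store_params or {}), **vector_store_settings}

print(before["container_name"])  # "default"  - the params silently overrode the configured value
print(after["container_name"])   # "entities" - the configured settings now win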


@@ -9,8 +9,14 @@ from pathlib import Path

 import pandas as pd

-from graphrag.config import load_config, resolve_path
+from graphrag.config import (
+    GraphRagConfig,
+    load_config,
+    resolve_path,
+)
+from graphrag.index.create_pipeline_config import create_pipeline_config
 from graphrag.index.progress import PrintProgressReporter
+from graphrag.utils.storage import _create_storage, _load_table_from_storage

 from . import api
@@ -36,17 +42,21 @@ def run_global_search(
     if data_dir:
         config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_nodes.parquet"
-    )
-    final_entities: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_entities.parquet"
-    )
-    final_community_reports: pd.DataFrame = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_entities.parquet",
+            "create_final_community_reports.parquet",
+        ],
+        optional_list=[],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]

     # call the Query API
     if streaming:
@@ -112,23 +122,26 @@ def run_local_search(
     if data_dir:
         config.storage.base_dir = str(resolve_path(data_dir, root))

-    data_path = Path(config.storage.base_dir).resolve()
-
-    final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet")
-    final_community_reports = pd.read_parquet(
-        data_path / "create_final_community_reports.parquet"
-    )
-    final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet")
-    final_relationships = pd.read_parquet(
-        data_path / "create_final_relationships.parquet"
-    )
-    final_entities = pd.read_parquet(data_path / "create_final_entities.parquet")
-    final_covariates_path = data_path / "create_final_covariates.parquet"
-    final_covariates = (
-        pd.read_parquet(final_covariates_path)
-        if final_covariates_path.exists()
-        else None
-    )
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_community_reports.parquet",
+            "create_final_text_units.parquet",
+            "create_final_relationships.parquet",
+            "create_final_entities.parquet",
+        ],
+        optional_list=["create_final_covariates.parquet"],
+    )
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
+    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_covariates: pd.DataFrame | None = dataframe_dict["create_final_covariates"]

     # call the Query API
     if streaming:
@@ -179,3 +192,35 @@ def run_local_search(
     # NOTE: we return the response and context data here purely as a complete demonstration of the API.
     # External users should use the API directly to get the response and context data.
     return response, context_data
+
+
+def _resolve_parquet_files(
+    root_dir: str,
+    config: GraphRagConfig,
+    parquet_list: list[str],
+    optional_list: list[str],
+) -> dict[str, pd.DataFrame]:
+    """Read parquet files to a dataframe dict."""
+    dataframe_dict = {}
+    pipeline_config = create_pipeline_config(config)
+    storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage)
+
+    for parquet_file in parquet_list:
+        df_key = parquet_file.split(".")[0]
+        df_value = asyncio.run(
+            _load_table_from_storage(name=parquet_file, storage=storage_obj)
+        )
+        dataframe_dict[df_key] = df_value
+
+    # for optional parquet files, set the dict entry to None instead of erroring out if it does not exist
+    for optional_file in optional_list:
+        file_exists = asyncio.run(storage_obj.has(optional_file))
+        df_key = optional_file.split(".")[0]
+        if file_exists:
+            df_value = asyncio.run(
+                _load_table_from_storage(name=optional_file, storage=storage_obj)
+            )
+            dataframe_dict[df_key] = df_value
+        else:
+            dataframe_dict[df_key] = None
+
+    return dataframe_dict
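The practical effect of this new helper is that the query CLI no longer assumes the parquet outputs sit on the local filesystem: tables are loaded through the pipeline storage abstraction, which is what enables the "add querying from azure blob storage" changelog entry. A rough usage sketch, assuming config is a GraphRagConfig the CLI has already loaded and "./ragtest" is a placeholder project root:

# Illustrative only: mirrors how run_local_search calls the new helper.
dataframe_dict = _resolve_parquet_files(
    root_dir="./ragtest",  # placeholder root; storage may be file- or blob-backed
    config=config,         # GraphRagConfig loaded elsewhere (e.g. via load_config)
    parquet_list=[
        "create_final_nodes.parquet",
        "create_final_entities.parquet",
    ],
    optional_list=["create_final_covariates.parquet"],
)

# Required tables are keyed by file stem; optional tables resolve to None when
# the parquet file is absent from the configured storage.
final_nodes = dataframe_dict["create_final_nodes"]
final_covariates = dataframe_dict["create_final_covariates"]  # may be None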


@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "graphrag"
 # Maintainers: do not change the version here manually, use ./scripts/release.sh
-version = "0.3.2"
+version = "0.3.3"
 description = ""
 authors = [
     "Alonso Guevara Fernández <alonsog@microsoft.com>",