mirror of
https://github.com/microsoft/graphrag.git
synced 2025-12-17 01:58:46 +00:00
Merge branch 'main' into incremental_indexing/main
This commit is contained in:
commit
67f4b02ecd
66
.semversioner/0.3.3.json
Normal file
66
.semversioner/0.3.3.json
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
{
|
||||||
|
"changes": [
|
||||||
|
{
|
||||||
|
"description": "Add entrypoints for incremental indexing",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Clean up and organize run index code",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Consistent config loading. Resolves #99 and Resolves #1049",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Fix circular dependency when running prompt tune api directly",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Fix default settings for embedding",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Fix img for auto tune",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Fix img width",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Fixed a bug in prompt tuning process",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Refactor text unit build at local search",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Update Prompt Tuning docs",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Update create_pipeline_config.py",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "Update prompt tune command in docs",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "add querying from azure blob storage",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "fix setting base_dir to full paths when not using file system.",
|
||||||
|
"type": "patch"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"description": "fix strategy config in entity_extraction",
|
||||||
|
"type": "patch"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"created_at": "2024-09-10T19:51:24+00:00",
|
||||||
|
"version": "0.3.3"
|
||||||
|
}
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "fix strategy config in entity_extraction"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Fixed a bug in prompt tuning process"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Fix default settings for embedding"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Refactor text unit build at local search"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Fix circular dependency when running prompt tune api directly"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Update Prompt Tuning docs"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Update prompt tune command in docs"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Fix img for auto tune"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Fix img width"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Consistent config loading. Resolves #99 and Resolves #1049"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Add entrypoints for incremental indexing"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "Clean up and organize run index code"
|
|
||||||
}
|
|
||||||
@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "patch",
|
|
||||||
"description": "fix setting base_dir to full paths when not using file system."
|
|
||||||
}
|
|
||||||
18
CHANGELOG.md
18
CHANGELOG.md
@ -1,6 +1,24 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
Note: version releases in the 0.x.y range may introduce breaking changes.
|
Note: version releases in the 0.x.y range may introduce breaking changes.
|
||||||
|
|
||||||
|
## 0.3.3
|
||||||
|
|
||||||
|
- patch: Add entrypoints for incremental indexing
|
||||||
|
- patch: Clean up and organize run index code
|
||||||
|
- patch: Consistent config loading. Resolves #99 and Resolves #1049
|
||||||
|
- patch: Fix circular dependency when running prompt tune api directly
|
||||||
|
- patch: Fix default settings for embedding
|
||||||
|
- patch: Fix img for auto tune
|
||||||
|
- patch: Fix img width
|
||||||
|
- patch: Fixed a bug in prompt tuning process
|
||||||
|
- patch: Refactor text unit build at local search
|
||||||
|
- patch: Update Prompt Tuning docs
|
||||||
|
- patch: Update create_pipeline_config.py
|
||||||
|
- patch: Update prompt tune command in docs
|
||||||
|
- patch: add querying from azure blob storage
|
||||||
|
- patch: fix setting base_dir to full paths when not using file system.
|
||||||
|
- patch: fix strategy config in entity_extraction
|
||||||
|
|
||||||
## 0.3.2
|
## 0.3.2
|
||||||
|
|
||||||
- patch: Add context data to query API responses.
|
- patch: Add context data to query API responses.
|
||||||
|
|||||||
@ -274,7 +274,7 @@ def _get_embedding_settings(
|
|||||||
#
|
#
|
||||||
strategy = settings.resolved_strategy() # get the default strategy
|
strategy = settings.resolved_strategy() # get the default strategy
|
||||||
strategy.update({
|
strategy.update({
|
||||||
"vector_store": {**vector_store_settings, **(vector_store_params or {})}
|
"vector_store": {**(vector_store_params or {}), **vector_store_settings}
|
||||||
}) # update the default strategy with the vector store settings
|
}) # update the default strategy with the vector store settings
|
||||||
# This ensures the vector store config is part of the strategy and not the global config
|
# This ensures the vector store config is part of the strategy and not the global config
|
||||||
return {
|
return {
|
||||||
|
|||||||
@ -9,8 +9,14 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from graphrag.config import load_config, resolve_path
|
from graphrag.config import (
|
||||||
|
GraphRagConfig,
|
||||||
|
load_config,
|
||||||
|
resolve_path,
|
||||||
|
)
|
||||||
|
from graphrag.index.create_pipeline_config import create_pipeline_config
|
||||||
from graphrag.index.progress import PrintProgressReporter
|
from graphrag.index.progress import PrintProgressReporter
|
||||||
|
from graphrag.utils.storage import _create_storage, _load_table_from_storage
|
||||||
|
|
||||||
from . import api
|
from . import api
|
||||||
|
|
||||||
@ -36,17 +42,21 @@ def run_global_search(
|
|||||||
if data_dir:
|
if data_dir:
|
||||||
config.storage.base_dir = str(resolve_path(data_dir, root))
|
config.storage.base_dir = str(resolve_path(data_dir, root))
|
||||||
|
|
||||||
data_path = Path(config.storage.base_dir).resolve()
|
dataframe_dict = _resolve_parquet_files(
|
||||||
|
root_dir=root_dir,
|
||||||
final_nodes: pd.DataFrame = pd.read_parquet(
|
config=config,
|
||||||
data_path / "create_final_nodes.parquet"
|
parquet_list=[
|
||||||
)
|
"create_final_nodes.parquet",
|
||||||
final_entities: pd.DataFrame = pd.read_parquet(
|
"create_final_entities.parquet",
|
||||||
data_path / "create_final_entities.parquet"
|
"create_final_community_reports.parquet",
|
||||||
)
|
],
|
||||||
final_community_reports: pd.DataFrame = pd.read_parquet(
|
optional_list=[],
|
||||||
data_path / "create_final_community_reports.parquet"
|
|
||||||
)
|
)
|
||||||
|
final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
|
||||||
|
final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
|
||||||
|
final_community_reports: pd.DataFrame = dataframe_dict[
|
||||||
|
"create_final_community_reports"
|
||||||
|
]
|
||||||
|
|
||||||
# call the Query API
|
# call the Query API
|
||||||
if streaming:
|
if streaming:
|
||||||
@ -112,23 +122,26 @@ def run_local_search(
|
|||||||
if data_dir:
|
if data_dir:
|
||||||
config.storage.base_dir = str(resolve_path(data_dir, root))
|
config.storage.base_dir = str(resolve_path(data_dir, root))
|
||||||
|
|
||||||
data_path = Path(config.storage.base_dir).resolve()
|
dataframe_dict = _resolve_parquet_files(
|
||||||
|
root_dir=root_dir,
|
||||||
final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet")
|
config=config,
|
||||||
final_community_reports = pd.read_parquet(
|
parquet_list=[
|
||||||
data_path / "create_final_community_reports.parquet"
|
"create_final_nodes.parquet",
|
||||||
)
|
"create_final_community_reports.parquet",
|
||||||
final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet")
|
"create_final_text_units.parquet",
|
||||||
final_relationships = pd.read_parquet(
|
"create_final_relationships.parquet",
|
||||||
data_path / "create_final_relationships.parquet"
|
"create_final_entities.parquet",
|
||||||
)
|
],
|
||||||
final_entities = pd.read_parquet(data_path / "create_final_entities.parquet")
|
optional_list=["create_final_covariates.parquet"],
|
||||||
final_covariates_path = data_path / "create_final_covariates.parquet"
|
|
||||||
final_covariates = (
|
|
||||||
pd.read_parquet(final_covariates_path)
|
|
||||||
if final_covariates_path.exists()
|
|
||||||
else None
|
|
||||||
)
|
)
|
||||||
|
final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
|
||||||
|
final_community_reports: pd.DataFrame = dataframe_dict[
|
||||||
|
"create_final_community_reports"
|
||||||
|
]
|
||||||
|
final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
|
||||||
|
final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
|
||||||
|
final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
|
||||||
|
final_covariates: pd.DataFrame | None = dataframe_dict["create_final_covariates"]
|
||||||
|
|
||||||
# call the Query API
|
# call the Query API
|
||||||
if streaming:
|
if streaming:
|
||||||
@ -179,3 +192,35 @@ def run_local_search(
|
|||||||
# NOTE: we return the response and context data here purely as a complete demonstration of the API.
|
# NOTE: we return the response and context data here purely as a complete demonstration of the API.
|
||||||
# External users should use the API directly to get the response and context data.
|
# External users should use the API directly to get the response and context data.
|
||||||
return response, context_data
|
return response, context_data
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_parquet_files(
    root_dir: str,
    config: GraphRagConfig,
    parquet_list: list[str],
    optional_list: list[str],
) -> dict[str, pd.DataFrame]:
    """Read parquet files from the configured storage into a dataframe dict.

    A storage object is built from the pipeline config derived from ``config``
    and every requested table is loaded through it.

    Args:
        root_dir: Root directory handed to the storage factory.
        config: GraphRAG configuration used to derive the pipeline storage config.
        parquet_list: File names that are always loaded. Nothing is caught
            here, so any error raised by the loader propagates to the caller.
        optional_list: File names that are loaded only when the storage
            reports that they exist.

    Returns:
        A dict keyed by file name without its final extension (for example
        ``"create_final_nodes.parquet"`` -> ``"create_final_nodes"``). Entries
        for optional files that are absent from storage are ``None``.
    """
    pipeline_config = create_pipeline_config(config)
    storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage)

    async def _load_all() -> dict:
        tables = {}

        for parquet_file in parquet_list:
            # rsplit drops only the trailing extension, so names that contain
            # extra dots do not collapse onto the same key.
            table_key = parquet_file.rsplit(".", 1)[0]
            tables[table_key] = await _load_table_from_storage(
                name=parquet_file, storage=storage_obj
            )

        # for optional parquet files, set the dict entry to None instead of
        # erroring out if it does not exist
        for optional_file in optional_list:
            table_key = optional_file.rsplit(".", 1)[0]
            if await storage_obj.has(optional_file):
                tables[table_key] = await _load_table_from_storage(
                    name=optional_file, storage=storage_obj
                )
            else:
                tables[table_key] = None

        return tables

    # Drive all storage calls from a single event loop instead of creating and
    # tearing down a new loop for every file and existence check.
    return asyncio.run(_load_all())
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "graphrag"
|
name = "graphrag"
|
||||||
# Maintainers: do not change the version here manually, use ./scripts/release.sh
|
# Maintainers: do not change the version here manually, use ./scripts/release.sh
|
||||||
version = "0.3.2"
|
version = "0.3.3"
|
||||||
description = ""
|
description = ""
|
||||||
authors = [
|
authors = [
|
||||||
"Alonso Guevara Fernández <alonsog@microsoft.com>",
|
"Alonso Guevara Fernández <alonsog@microsoft.com>",
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user