2024-08-20 14:42:20 -07:00
|
|
|
# Copyright (c) 2024 Microsoft Corporation.
|
|
|
|
# Licensed under the MIT License
|
|
|
|
|
|
|
|
"""
|
|
|
|
Indexing API for GraphRAG.
|
|
|
|
|
|
|
|
WARNING: This API is under development and may undergo changes in future releases.
|
|
|
|
Backwards compatibility is not guaranteed at this time.
|
|
|
|
"""
|
|
|
|
|
2025-01-03 13:59:26 -08:00
|
|
|
import logging
|
2024-12-05 16:34:21 -05:00
|
|
|
|
2024-11-27 13:27:43 -05:00
|
|
|
from graphrag.cache.noop_pipeline_cache import NoopPipelineCache
|
2024-12-05 16:34:21 -05:00
|
|
|
from graphrag.callbacks.factory import create_pipeline_reporter
|
2025-01-03 13:59:26 -08:00
|
|
|
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
|
2024-11-15 16:41:10 -08:00
|
|
|
from graphrag.config.enums import CacheType
|
|
|
|
from graphrag.config.models.graph_rag_config import GraphRagConfig
|
2025-01-03 13:59:26 -08:00
|
|
|
from graphrag.index.run.run_workflows import run_workflows
|
2024-10-10 17:01:42 -04:00
|
|
|
from graphrag.index.typing import PipelineRunResult
|
2024-12-10 17:11:11 -05:00
|
|
|
from graphrag.logger.base import ProgressLogger
|
2024-08-20 14:42:20 -07:00
|
|
|
|
2025-01-03 13:59:26 -08:00
|
|
|
# Module-level logger named after this module; used below to warn about
# options that are accepted but not acted upon.
log = logging.getLogger(__name__)
|
|
|
|
|
2024-08-20 14:42:20 -07:00
|
|
|
|
|
|
|
async def build_index(
    config: GraphRagConfig,
    run_id: str = "",
    is_resume_run: bool = False,
    memory_profile: bool = False,
    callbacks: list[WorkflowCallbacks] | None = None,
    progress_logger: ProgressLogger | None = None,
) -> list[PipelineRunResult]:
    """Run the pipeline with the given configuration.

    Parameters
    ----------
    config : GraphRagConfig
        The configuration.
    run_id : str
        The run id. Creates an output directory with this name.
    is_resume_run : bool default=False
        Whether to resume a previous index run.
    memory_profile : bool
        Whether to enable memory profiling. Currently this only logs a
        warning, since the pipeline does not yet support memory profiling.
    callbacks : list[WorkflowCallbacks] | None default=None
        A list of callbacks to register. The list passed in is not modified.
    progress_logger : ProgressLogger | None default=None
        The progress logger.

    Returns
    -------
    list[PipelineRunResult]
        The list of pipeline run results

    Raises
    ------
    ValueError
        If ``is_resume_run`` is set while the configuration also requests an
        index update (``config.update_index_storage`` is truthy).
    """
    is_update_run = bool(config.update_index_storage)

    if is_resume_run and is_update_run:
        msg = "Cannot resume and update a run at the same time."
        raise ValueError(msg)

    # NOTE: this used to read `config.cache.type == CacheType.none is None`,
    # a chained comparison that is always False (CacheType.none is never
    # None), so the no-op cache was never selected.
    pipeline_cache = (
        NoopPipelineCache() if config.cache.type == CacheType.none else None
    )

    # create a pipeline reporter and add to any additional callbacks
    # (build a new list so the caller's list is never mutated)
    # TODO: remove the type ignore once the new config engine has been refactored
    callbacks = [
        *(callbacks or []),
        create_pipeline_reporter(config.reporting, None),  # type: ignore
    ]

    outputs: list[PipelineRunResult] = []

    if memory_profile:
        log.warning("New pipeline does not yet support memory profiling.")

    workflows = _get_workflows_list(config)

    async for output in run_workflows(
        workflows,
        config,
        cache=pipeline_cache,
        callbacks=callbacks,
        logger=progress_logger,
        run_id=run_id,
        is_update_run=is_update_run,
    ):
        outputs.append(output)
        if progress_logger:
            # Report per-workflow status, then the stringified result.
            if output.errors:
                progress_logger.error(output.workflow)
            else:
                progress_logger.success(output.workflow)
            progress_logger.info(str(output.result))

    return outputs
|
2025-01-03 13:59:26 -08:00
|
|
|
|
|
|
|
|
|
|
|
def _get_workflows_list(config: GraphRagConfig) -> list[str]:
    """Return the ordered list of workflow names to run for this config.

    "create_final_covariates" is included (between the community and text
    unit workflows) only when ``config.claim_extraction.enabled`` is truthy;
    all other workflow names are always present.
    """
    leading_workflows = [
        "create_base_text_units",
        "create_final_documents",
        "extract_graph",
        "compute_communities",
        "create_final_entities",
        "create_final_relationships",
        "create_final_nodes",
        "create_final_communities",
    ]
    trailing_workflows = [
        "create_final_text_units",
        "create_final_community_reports",
        "generate_text_embeddings",
    ]

    # The covariates workflow is optional and sits between the two groups.
    if config.claim_extraction.enabled:
        leading_workflows.append("create_final_covariates")

    return leading_workflows + trailing_workflows
|