2024-08-20 14:42:20 -07:00
|
|
|
# Copyright (c) 2024 Microsoft Corporation.
|
|
|
|
# Licensed under the MIT License
|
|
|
|
|
|
|
|
"""
|
|
|
|
Indexing API for GraphRAG.
|
|
|
|
|
|
|
|
WARNING: This API is under development and may undergo changes in future releases.
|
|
|
|
Backwards compatibility is not guaranteed at this time.
|
|
|
|
"""
|
|
|
|
|
2025-01-03 13:59:26 -08:00
|
|
|
import logging
|
2024-12-05 16:34:21 -05:00
|
|
|
|
2025-01-21 15:52:06 -08:00
|
|
|
from graphrag.callbacks.reporting import create_pipeline_reporter
|
2025-01-03 13:59:26 -08:00
|
|
|
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
|
2025-02-25 15:07:51 -08:00
|
|
|
from graphrag.config.enums import IndexingMethod
|
2024-11-15 16:41:10 -08:00
|
|
|
from graphrag.config.models.graph_rag_config import GraphRagConfig
|
2025-01-28 12:27:03 -08:00
|
|
|
from graphrag.index.run.run_pipeline import run_pipeline
|
2025-02-25 15:07:51 -08:00
|
|
|
from graphrag.index.run.utils import create_callback_chain
|
2025-02-28 09:31:48 -08:00
|
|
|
from graphrag.index.typing.pipeline_run_result import PipelineRunResult
|
|
|
|
from graphrag.index.typing.workflow import WorkflowFunction
|
2025-02-14 13:21:31 -08:00
|
|
|
from graphrag.index.workflows.factory import PipelineFactory
|
2024-12-10 17:11:11 -05:00
|
|
|
from graphrag.logger.base import ProgressLogger
|
2025-02-25 15:07:51 -08:00
|
|
|
from graphrag.logger.null_progress import NullProgressLogger
|
2024-08-20 14:42:20 -07:00
|
|
|
|
2025-01-03 13:59:26 -08:00
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
2024-08-20 14:42:20 -07:00
|
|
|
|
|
|
|
async def build_index(
    config: GraphRagConfig,
    method: IndexingMethod | str = IndexingMethod.Standard,
    is_update_run: bool = False,
    memory_profile: bool = False,
    callbacks: list[WorkflowCallbacks] | None = None,
    progress_logger: ProgressLogger | None = None,
) -> list[PipelineRunResult]:
    """Run the pipeline with the given configuration.

    Parameters
    ----------
    config : GraphRagConfig
        The configuration.
    method : IndexingMethod | str default=IndexingMethod.Standard
        Style of indexing to perform (full LLM, NLP + LLM, etc.).
    is_update_run : bool default=False
        Whether this is an update run. When true, "-update" is appended to the
        method name used to create the pipeline, and the flag is forwarded to
        the pipeline runner.
    memory_profile : bool default=False
        Whether to enable memory profiling. Currently this only logs a warning
        that memory profiling is not supported.
    callbacks : list[WorkflowCallbacks] | None default=None
        A list of callbacks to register. The list passed in is not modified.
    progress_logger : ProgressLogger | None default=None
        The progress logger. A NullProgressLogger is used when omitted.

    Returns
    -------
    list[PipelineRunResult]
        The list of pipeline run results, in the order they were produced.
    """
    logger = progress_logger or NullProgressLogger()

    # Create a pipeline reporter and combine it with any additional callbacks.
    # A new list is built (rather than appending in place) so the caller's
    # list is never mutated and repeated calls do not accumulate reporters.
    callbacks = [
        *(callbacks or []),
        create_pipeline_reporter(config.reporting, None),
    ]

    workflow_callbacks = create_callback_chain(callbacks, logger)

    outputs: list[PipelineRunResult] = []

    if memory_profile:
        log.warning("New pipeline does not yet support memory profiling.")

    # todo: this could propagate out to the cli for better clarity, but will be a breaking api change
    method = _get_method(method, is_update_run)
    pipeline = PipelineFactory.create_pipeline(config, method)

    workflow_callbacks.pipeline_start(pipeline.names())

    async for output in run_pipeline(
        pipeline,
        config,
        callbacks=workflow_callbacks,
        logger=logger,
        is_update_run=is_update_run,
    ):
        outputs.append(output)
        # A non-empty error collection marks the workflow as failed.
        if output.errors:
            logger.error(output.workflow)
        else:
            logger.success(output.workflow)
        logger.info(str(output.result))

    workflow_callbacks.pipeline_end(outputs)
    return outputs
|
2025-02-14 13:21:31 -08:00
|
|
|
|
|
|
|
|
|
|
|
def register_workflow_function(name: str, workflow: WorkflowFunction):
    """Register a custom workflow function under the given name.

    Once registered, the name can be listed in the settings.yaml workflows list.

    Parameters
    ----------
    name : str
        The name to register the workflow under.
    workflow : WorkflowFunction
        The workflow function to register.
    """
    PipelineFactory.register(name, workflow)
|
2025-06-12 16:14:39 -07:00
|
|
|
|
|
|
|
|
|
|
|
def _get_method(method: IndexingMethod | str, is_update_run: bool) -> str:
    """Resolve the pipeline method name, adding "-update" for update runs.

    Parameters
    ----------
    method : IndexingMethod | str
        The indexing method, either as an enum member or as a plain string.
    is_update_run : bool
        Whether the "-update" suffix should be appended to the method name.

    Returns
    -------
    str
        The enum member's value (or the string as given), with "-update"
        appended when is_update_run is true.
    """
    # Enum members are reduced to their underlying value; strings pass through.
    if isinstance(method, IndexingMethod):
        base_name = method.value
    else:
        base_name = method

    if is_update_run:
        return f"{base_name}-update"
    return base_name
|