# graphrag-accelerator/backend/graphrag_app/api/prompt_tuning.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import inspect
import os
import traceback

import graphrag.api as api
import yaml
from fastapi import (
    APIRouter,
    HTTPException,
)
from graphrag.config.create_graphrag_config import create_graphrag_config

from graphrag_app.logger.load_logger import load_pipeline_logger
from graphrag_app.utils.azure_clients import AzureClientManager
from graphrag_app.utils.common import sanitize_name

# Router for the index-configuration endpoints defined in this module; every
# route registered on it is served under the "/index/config" URL prefix and
# carries the "Index Configuration" tag.
prompt_tuning_route = APIRouter(prefix="/index/config", tags=["Index Configuration"])


@prompt_tuning_route.get(
    "/prompts",
    summary="Generate prompts from user-provided data.",
    description="Generating custom prompts from user-provided data may take several minutes to run based on the amount of data used.",
)
async def generate_prompts(storage_name: str, limit: int = 5):
    """
    Automatically generate custom prompts for entity extraction,
    community reports, and summarize descriptions based on a sample of provided data.

    Args:
        storage_name: Name of the blob storage container that holds the input
            data. It is passed through ``sanitize_name`` before being used to
            look up the container and to configure the pipeline input.
        limit: Forwarded unchanged as the ``limit`` argument of
            ``api.generate_indexing_prompts`` (defaults to 5).

    Returns:
        A dict with the keys ``entity_extraction_prompt``,
        ``entity_summarization_prompt`` and ``community_summarization_prompt``,
        populated in that order from the tuple returned by
        ``api.generate_indexing_prompts``.

    Raises:
        HTTPException: With status code 500 when the storage container does not
            exist, or when prompt generation raises any exception.
    """
    # check for storage container existence
    azure_client_manager = AzureClientManager()
    blob_service_client = azure_client_manager.get_blob_service_client()
    sanitized_storage_name = sanitize_name(storage_name)
    if not blob_service_client.get_container_client(sanitized_storage_name).exists():
        raise HTTPException(
            status_code=500,
            detail=f"Data container '{storage_name}' does not exist.",
        )

    # load pipeline configuration file (settings.yaml) for input data and other settings
    this_directory = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe()))
    )
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(
        f"{this_directory}/../indexer/settings.yaml", encoding="utf-8"
    ) as settings_file:
        data = yaml.safe_load(settings_file)
    # Point the pipeline input at the caller's (sanitized) storage container.
    data["input"]["container_name"] = sanitized_storage_name
    graphrag_config = create_graphrag_config(values=data, root_dir=".")

    # generate prompts
    try:
        # NOTE: we need to call api.generate_indexing_prompts
        prompts: tuple[str, str, str] = await api.generate_indexing_prompts(
            config=graphrag_config,
            root=".",
            limit=limit,
            selection_method="random",
        )
    except Exception as e:
        # Log the full failure through the pipeline logger; the client only
        # receives a generic 500 with a hint to retry with a lower limit.
        logger = load_pipeline_logger()
        error_details = {
            "storage_name": storage_name,
        }
        logger.error(
            message="Auto-prompt generation failed.",
            cause=e,
            stack=traceback.format_exc(),
            details=error_details,
        )
        raise HTTPException(
            status_code=500,
            detail=f"Error generating prompts for data in '{storage_name}'. Please try a lower limit.",
        ) from e

    content = {
        "entity_extraction_prompt": prompts[0],
        "entity_summarization_prompt": prompts[1],
        "community_summarization_prompt": prompts[2],
    }
    # A plain dict is returned here; FastAPI serializes it into the JSON response.
    return content
initial commit 2024-06-26 15:45:06 -04:00			`# Copyright (c) Microsoft Corporation.`
			`# Licensed under the MIT License.`

			`import inspect`
			`import os`
Cdifonzo/multi index testing (#162) 2024-09-12 21:41:46 -04:00			`import traceback`
initial commit 2024-06-26 15:45:06 -04:00
refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00			`import graphrag.api as api`
initial commit 2024-06-26 15:45:06 -04:00			`import yaml`
			`from fastapi import (`
			`APIRouter,`
			`HTTPException,`
			`)`
refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00			`from graphrag.config.create_graphrag_config import create_graphrag_config`
initial commit 2024-06-26 15:45:06 -04:00
convert app to proper python package 2025-01-25 04:07:53 -05:00			`from graphrag_app.logger.load_logger import load_pipeline_logger`
			`from graphrag_app.utils.azure_clients import AzureClientManager`
			`from graphrag_app.utils.common import sanitize_name`
initial commit 2024-06-26 15:45:06 -04:00
refactor and reorganize indexing code out of api code 2025-01-21 00:29:48 -05:00			`prompt_tuning_route = APIRouter(prefix="/index/config", tags=["Index Configuration"])`
initial commit 2024-06-26 15:45:06 -04:00

refactor and reorganize indexing code out of api code 2025-01-21 00:29:48 -05:00			`@prompt_tuning_route.get(`
initial commit 2024-06-26 15:45:06 -04:00			`"/prompts",`
refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00			`summary="Generate prompts from user-provided data.",`
initial commit 2024-06-26 15:45:06 -04:00			`description="Generating custom prompts from user-provided data may take several minutes to run based on the amount of data used.",`
			`)`
			`async def generate_prompts(storage_name: str, limit: int = 5):`
			`"""`
			`Automatically generate custom prompts for entity extraction,`
			`community reports, and summarize descriptions based on a sample of provided data.`
			`"""`
			`# check for storage container existence`
Re-activate pytests (#208) 2024-12-30 01:59:08 -05:00			`azure_client_manager = AzureClientManager()`
			`blob_service_client = azure_client_manager.get_blob_service_client()`
initial commit 2024-06-26 15:45:06 -04:00			`sanitized_storage_name = sanitize_name(storage_name)`
			`if not blob_service_client.get_container_client(sanitized_storage_name).exists():`
			`raise HTTPException(`
			`status_code=500,`
			`detail=f"Data container '{storage_name}' does not exist.",`
			`)`
refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00
			`# load pipeline configuration file (settings.yaml) for input data and other settings`
initial commit 2024-06-26 15:45:06 -04:00			`this_directory = os.path.dirname(`
			`os.path.abspath(inspect.getfile(inspect.currentframe()))`
			`)`
add custom cosmosdb rbac role 2025-01-23 00:23:58 -05:00			`data = yaml.safe_load(open(f"{this_directory}/../indexer/settings.yaml"))`
initial commit 2024-06-26 15:45:06 -04:00			`data["input"]["container_name"] = sanitized_storage_name`
refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00			`graphrag_config = create_graphrag_config(values=data, root_dir=".")`
initial commit 2024-06-26 15:45:06 -04:00
			`# generate prompts`
			`try:`
refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00			`# NOTE: we need to call api.generate_indexing_prompts`
			`prompts: tuple[str, str, str] = await api.generate_indexing_prompts(`
			`config=graphrag_config,`
			`root=".",`
initial commit 2024-06-26 15:45:06 -04:00			`limit=limit,`
refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00			`selection_method="random",`
Cdifonzo/multi index testing (#162) 2024-09-12 21:41:46 -04:00			`)`
			`except Exception as e:`
fixed appinsights logging 2025-01-21 18:43:55 -05:00			`logger = load_pipeline_logger()`
Cdifonzo/multi index testing (#162) 2024-09-12 21:41:46 -04:00			`error_details = {`
			`"storage_name": storage_name,`
			`}`
refactor and reorganize indexing code out of api code 2025-01-21 00:29:48 -05:00			`logger.error(`
Cdifonzo/multi index testing (#162) 2024-09-12 21:41:46 -04:00			`message="Auto-prompt generation failed.",`
			`cause=e,`
			`stack=traceback.format_exc(),`
			`details=error_details,`
initial commit 2024-06-26 15:45:06 -04:00			`)`
			`raise HTTPException(`
			`status_code=500,`
			`detail=f"Error generating prompts for data in '{storage_name}'. Please try a lower limit.",`
			`)`

refactor variable names to be more generic and add integration tests 2025-01-02 23:19:28 -05:00			`content = {`
			`"entity_extraction_prompt": prompts[0],`
			`"entity_summarization_prompt": prompts[1],`
			`"community_summarization_prompt": prompts[2],`
			`}`
			`return content # return a fastapi.responses.JSONResponse object`