# graphrag-accelerator/backend/manage-indexing-jobs.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""
Note: This script is intended to be executed as a cron job on kubernetes.

A naive implementation of a job manager that leverages k8s CronJob and CosmosDB
to schedule graphrag indexing jobs on a first-come-first-serve basis (based on epoch time).
"""

import os
import sys

import pandas as pd
import yaml
from kubernetes import (
    client,
    config,
)

from src.api.azure_clients import AzureClientManager
from src.api.common import sanitize_name
from src.logger.logger_singleton import LoggerSingleton
from src.typing.pipeline import PipelineJobState
from src.utils.pipeline import PipelineJob


def schedule_indexing_job(index_name: str):
    """
    Schedule a k8s job to run graphrag indexing for a given index name.

    The container image and service account for the new job are copied from
    the pod named by the HOSTNAME environment variable (presumably the pod
    this script is running in), looked up in the namespace given by the
    AKS_NAMESPACE environment variable. The job is created in that same
    namespace.

    Args:
        index_name: Name of the index to build. It is passed unmodified to the
            indexing container command and sanitized when used for the k8s job
            name and for the job-record lookup.

    Any exception raised while scheduling is reported through the logger and
    the corresponding job record is marked as failed; the scheduling exception
    itself is not re-raised.
    """
    try:
        config.load_incluster_config()
        namespace = os.environ["AKS_NAMESPACE"]
        # get container image name
        core_v1 = client.CoreV1Api()
        pod = core_v1.read_namespaced_pod(
            name=os.environ["HOSTNAME"], namespace=namespace
        )
        # retrieve job manifest template and replace necessary values
        job_manifest = _generate_aks_job_manifest(
            docker_image_name=pod.spec.containers[0].image,
            index_name=index_name,
            service_account_name=pod.spec.service_account_name,
        )
        batch_v1 = client.BatchV1Api()
        batch_v1.create_namespaced_job(body=job_manifest, namespace=namespace)
    except Exception:
        reporter = LoggerSingleton().get_instance()
        reporter.on_error(
            "Index job manager encountered error scheduling indexing job",
        )
        # In the event of a catastrophic scheduling failure, something in k8s or the job manifest is likely broken.
        # Set job status to failed to prevent an infinite loop of re-scheduling
        pipelinejob = PipelineJob()
        pipeline_job = pipelinejob.load_item(sanitize_name(index_name))
        # Assign through the `status` attribute (not item assignment), the same
        # way main() marks a job as failed.
        pipeline_job.status = PipelineJobState.FAILED


def _generate_aks_job_manifest(
    docker_image_name: str,
    index_name: str,
    service_account_name: str,
) -> dict:
    """Build the k8s Job manifest for a single indexing run.

    Loads the YAML job template from disk and overrides the job name, the
    service account, and the first container's image and command.

    Args:
        docker_image_name: Image to run in the job's first container.
        index_name: Index to build; used sanitized in the job name and
            unmodified in the container command's ``-i=`` argument.
        service_account_name: Service account the job's pod should use.

    Returns:
        The parsed template with the values above filled in.
    """
    # NOTE: the template path is relative to the WORKDIR set in Dockerfile-backend
    with open("indexing-job-template.yaml", "r") as template_file:
        job_spec = yaml.safe_load(template_file)

    job_spec["metadata"]["name"] = f"indexing-job-{sanitize_name(index_name)}"

    pod_spec = job_spec["spec"]["template"]["spec"]
    pod_spec["serviceAccountName"] = service_account_name

    # only the first container in the template is customized
    first_container = pod_spec["containers"][0]
    first_container["image"] = docker_image_name
    first_container["command"] = [
        "python",
        "run-indexing-job.py",
        f"-i={index_name}",
    ]
    return job_spec


def list_k8s_jobs(namespace: str) -> list[str]:
    """Return the names of active indexing jobs in a given k8s namespace.

    Only jobs whose name starts with "indexing-job-" and whose
    ``status.active`` is truthy are included.
    """
    config.load_incluster_config()
    namespaced_jobs = client.BatchV1Api().list_namespaced_job(namespace=namespace)
    return [
        k8s_job.metadata.name
        for k8s_job in namespaced_jobs.items
        if k8s_job.metadata.name.startswith("indexing-job-")
        and k8s_job.status.active
    ]


def main():
    """
    Pick the oldest scheduled indexing job (if any) and schedule it on k8s.

    There are two places to check to determine if an indexing job should be executed:
        * Kubernetes: check if there are any active k8s jobs running in the cluster
        * CosmosDB: check if there are any indexing jobs in a scheduled state

    Ideally if an indexing job has finished or failed, the job status will be reflected in cosmosdb.
    However, if an indexing job failed due to OOM, the job status will not have been updated in cosmosdb.

    To avoid a catastrophic failure scenario where all indexing jobs are stuck in a scheduled state,
    both checks are necessary.

    The process exits early (via ``sys.exit()``) when an indexing job is still
    running on k8s or when no job is in the scheduled state.
    """
    kubernetes_jobs = list_k8s_jobs(os.environ["AKS_NAMESPACE"])

    azure_storage_client_manager = AzureClientManager()
    job_container_store_client = (
        azure_storage_client_manager.get_cosmos_container_client(
            database="graphrag", container="jobs"
        )
    )
    # retrieve all index jobs that are scheduled, handling running ones along the way
    scheduled_jobs = []
    for item in job_container_store_client.read_all_items():
        if item["status"] == PipelineJobState.RUNNING.value:
            # NOTE(review): this checks for *any* active indexing k8s job, not
            # specifically the one belonging to this item - confirm that is intended.
            if kubernetes_jobs:
                print(
                    f"Indexing job for '{item['human_readable_index_name']}' already running. Will not schedule another. Exiting..."
                )
                sys.exit()
            # if index job has running state but no associated k8s job, a catastrophic
            # failure (OOM for example) occurred. Set job status to failed.
            print(
                f"Indexing job for '{item['human_readable_index_name']}' in 'running' state but no associated k8s job found. Updating to failed state."
            )
            pipelinejob = PipelineJob()
            pipeline_job = pipelinejob.load_item(item["sanitized_index_name"])
            pipeline_job.status = PipelineJobState.FAILED
        elif item["status"] == PipelineJobState.SCHEDULED.value:
            scheduled_jobs.append({
                "human_readable_index_name": item["human_readable_index_name"],
                "epoch_request_time": item["epoch_request_time"],
            })

    # exit if no 'scheduled' jobs were found
    if not scheduled_jobs:
        print("No jobs found")
        sys.exit()
    # jobs should be run in the order they were requested - pick the earliest epoch_request_time
    oldest_job = min(scheduled_jobs, key=lambda job: job["epoch_request_time"])
    index_to_schedule = oldest_job["human_readable_index_name"]
    print(f"Scheduling job for index: {index_to_schedule}")
    schedule_indexing_job(index_to_schedule)


# Script entry point: run the job manager only when this file is executed
# directly (the module docstring notes it is meant to run as a k8s cron job).
if __name__ == "__main__":
    main()
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`# Copyright (c) Microsoft Corporation.`
			`# Licensed under the MIT License.`

			`"""`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`Note: This script is intended to be executed as a cron job on kubernetes.`

Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`A naive implementation of a job manager that leverages k8s CronJob and CosmosDB`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`to schedule graphrag indexing jobs on a first-come-first-serve basis (based on epoch time).`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`"""`

			`import os`

			`import pandas as pd`
			`import yaml`
			`from kubernetes import (`
			`client,`
			`config,`
			`)`
Update frontend UI app (#174) Co-authored-by: dorbaker <dorbaker@microsoft.com> 2024-09-19 01:09:26 -04:00
Re-activate pytests (#208) 2024-12-30 01:59:08 -05:00			`from src.api.azure_clients import AzureClientManager`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`from src.api.common import sanitize_name`
Re-activate pytests (#208) 2024-12-30 01:59:08 -05:00			`from src.logger.logger_singleton import LoggerSingleton`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`from src.typing.pipeline import PipelineJobState`
Re-activate pytests (#208) 2024-12-30 01:59:08 -05:00			`from src.utils.pipeline import PipelineJob`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00

			`def schedule_indexing_job(index_name: str):`
			`"""`
			`Schedule a k8s job to run graphrag indexing for a given index name.`
			`"""`
			`try:`
			`config.load_incluster_config()`
			`# get container image name`
			`core_v1 = client.CoreV1Api()`
			`pod_name = os.environ["HOSTNAME"]`
			`pod = core_v1.read_namespaced_pod(`
			`name=pod_name, namespace=os.environ["AKS_NAMESPACE"]`
			`)`
			`# retrieve job manifest template and replace necessary values`
			`job_manifest = _generate_aks_job_manifest(`
			`docker_image_name=pod.spec.containers[0].image,`
			`index_name=index_name,`
			`service_account_name=pod.spec.service_account_name,`
			`)`
			`batch_v1 = client.BatchV1Api()`
			`batch_v1.create_namespaced_job(`
			`body=job_manifest, namespace=os.environ["AKS_NAMESPACE"]`
			`)`
			`except Exception:`
Re-activate pytests (#208) 2024-12-30 01:59:08 -05:00			`reporter = LoggerSingleton().get_instance()`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`reporter.on_error(`
			`"Index job manager encountered error scheduling indexing job",`
			`)`
			`# In the event of a catastrophic scheduling failure, something in k8s or the job manifest is likely broken.`
			`# Set job status to failed to prevent an infinite loop of re-scheduling`
			`pipelinejob = PipelineJob()`
			`pipeline_job = pipelinejob.load_item(sanitize_name(index_name))`
			`pipeline_job["status"] = PipelineJobState.FAILED`


			`def _generate_aks_job_manifest(`
			`docker_image_name: str,`
			`index_name: str,`
			`service_account_name: str,`
			`) -> dict:`
			`"""Generate an AKS Jobs manifest file with the specified parameters.`

			`The manifest must be valid YAML with certain values replaced by the provided arguments.`
			`"""`
			`# NOTE: this file location is relative to the WORKDIR set in Dockerfile-backend`
			`with open("indexing-job-template.yaml", "r") as f:`
			`manifest = yaml.safe_load(f)`
			`manifest["metadata"]["name"] = f"indexing-job-{sanitize_name(index_name)}"`
			`manifest["spec"]["template"]["spec"]["serviceAccountName"] = service_account_name`
			`manifest["spec"]["template"]["spec"]["containers"][0]["image"] = docker_image_name`
			`manifest["spec"]["template"]["spec"]["containers"][0]["command"] = [`
			`"python",`
			`"run-indexing-job.py",`
			`f"-i={index_name}",`
			`]`
			`return manifest`


Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`def list_k8s_jobs(namespace: str) -> list[str]:`
			`"""List all k8s jobs in a given namespace."""`
			`config.load_incluster_config()`
			`batch_v1 = client.BatchV1Api()`
			`jobs = batch_v1.list_namespaced_job(namespace=namespace)`
			`job_list = []`
			`for job in jobs.items:`
Handle previously failed indexing jobs (#206) Co-authored-by: Josh Bradley <joshbradley@microsoft.com> 2024-12-30 02:09:45 -05:00			`if job.metadata.name.startswith("indexing-job-") and job.status.active:`
			`job_list.append(job.metadata.name)`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`return job_list`


Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`def main():`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`"""`
			`There are two places to check to determine if an indexing job should be executed:`
			`* Kubernetes: check if there are any active k8s jobs running in the cluster`
			`* CosmosDB: check if there are any indexing jobs in a scheduled state`

			`Ideally if an indexing job has finished or failed, the job status will be reflected in cosmosdb.`
			`However, if an indexing job failed due to OOM, the job status will not have been updated in cosmosdb.`

			`To avoid a catastrophic failure scenario where all indexing jobs are stuck in a scheduled state,`
			`both checks are necessary.`
			`"""`
			`kubernetes_jobs = list_k8s_jobs(os.environ["AKS_NAMESPACE"])`

Re-activate pytests (#208) 2024-12-30 01:59:08 -05:00			`azure_storage_client_manager = AzureClientManager()`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`job_container_store_client = (`
			`azure_storage_client_manager.get_cosmos_container_client(`
Re-activate pytests (#208) 2024-12-30 01:59:08 -05:00			`database="graphrag", container="jobs"`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`)`
			`)`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`# retrieve status of all index jobs that are scheduled or running`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`job_metadata = []`
			`for item in job_container_store_client.read_all_items():`
			`if item["status"] == PipelineJobState.RUNNING.value:`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`# if index job has running state but no associated k8s job, a catastrophic`
			`# failure (OOM for example) occurred. Set job status to failed.`
			`if len(kubernetes_jobs) == 0:`
			`print(`
			`f"Indexing job for '{item['human_readable_index_name']}' in 'running' state but no associated k8s job found. Updating to failed state."`
			`)`
			`pipelinejob = PipelineJob()`
			`pipeline_job = pipelinejob.load_item(item["sanitized_index_name"])`
Handle previously failed indexing jobs (#206) Co-authored-by: Josh Bradley <joshbradley@microsoft.com> 2024-12-30 02:09:45 -05:00			`pipeline_job.status = PipelineJobState.FAILED`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`else:`
			`print(`
			`f"Indexing job for '{item['human_readable_index_name']}' already running. Will not schedule another. Exiting..."`
			`)`
			`exit()`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`if item["status"] == PipelineJobState.SCHEDULED.value:`
Cdifonzo/multi index testing (#162) 2024-09-12 21:41:46 -04:00			`job_metadata.append({`
			`"human_readable_index_name": item["human_readable_index_name"],`
			`"epoch_request_time": item["epoch_request_time"],`
			`"status": item["status"],`
			`"percent_complete": item["percent_complete"],`
			`})`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00
			`# exit if no 'scheduled' jobs were found`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`if not job_metadata:`
			`print("No jobs found")`
			`exit()`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`# convert to dataframe for easier processing`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`df = pd.DataFrame(job_metadata)`
Improve error handling of index job management (#166) 2024-09-13 01:31:08 -04:00			`# jobs should be run in the order they were requested - sort by epoch_request_time`
Add indexing job manager (#133) 2024-08-09 22:22:49 -04:00			`df.sort_values(by="epoch_request_time", ascending=True, inplace=True)`
			`index_to_schedule = df.iloc[0]["human_readable_index_name"]`
			`print(f"Scheduling job for index: {index_to_schedule}")`
			`schedule_indexing_job(index_to_schedule)`


			`if __name__ == "__main__":`
			`main()`