2024-08-09 22:22:49 -04:00
|
|
|
# Copyright (c) Microsoft Corporation.
|
|
|
|
# Licensed under the MIT License.
|
|
|
|
|
|
|
|
"""
|
2024-09-13 01:31:08 -04:00
|
|
|
Note: This script is intended to be executed as a cron job on kubernetes.
|
|
|
|
|
2024-08-09 22:22:49 -04:00
|
|
|
A naive implementation of a job manager that leverages k8s CronJob and CosmosDB
|
2024-09-13 01:31:08 -04:00
|
|
|
to schedule graphrag indexing jobs on a first-come, first-served basis (based on epoch time).
|
2024-08-09 22:22:49 -04:00
|
|
|
"""
|
|
|
|
|
|
|
|
import os
|
2025-01-30 13:59:51 -05:00
|
|
|
import traceback
|
|
|
|
from pathlib import Path
|
2024-08-09 22:22:49 -04:00
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
import yaml
|
|
|
|
from kubernetes import (
|
|
|
|
client,
|
|
|
|
config,
|
|
|
|
)
|
2024-09-19 01:09:26 -04:00
|
|
|
|
2025-01-30 13:59:51 -05:00
|
|
|
from graphrag_app.logger.load_logger import load_pipeline_logger
|
|
|
|
from graphrag_app.typing.pipeline import PipelineJobState
|
|
|
|
from graphrag_app.utils.azure_clients import AzureClientManager
|
|
|
|
from graphrag_app.utils.common import sanitize_name
|
|
|
|
from graphrag_app.utils.pipeline import PipelineJob
|
2024-08-09 22:22:49 -04:00
|
|
|
|
|
|
|
|
|
|
|
def schedule_indexing_job(index_name: str):
    """Schedule a k8s job to run graphrag indexing for a given index name.

    The new job reuses the container image, service account and (if present)
    image pull secrets of the pod this script is running in. That pod is looked
    up through the ``HOSTNAME`` and ``AKS_NAMESPACE`` environment variables.

    Any exception raised while scheduling is logged and the corresponding
    pipeline job is marked as failed; the exception is not re-raised.

    Args:
        index_name: Name of the index to build (``main()`` passes the
            human-readable index name).
    """
    try:
        config.load_incluster_config()
        # look up the pod running this script to get the container image name
        core_v1 = client.CoreV1Api()
        pod_name = os.environ["HOSTNAME"]
        namespace = os.environ["AKS_NAMESPACE"]
        pod = core_v1.read_namespaced_pod(name=pod_name, namespace=namespace)
        # get image pull secrets only if they were provided as part of the deployment.
        image_pull_secrets = getattr(pod.spec, "image_pull_secrets", None)
        # retrieve job manifest template and replace necessary values
        job_manifest = _generate_aks_job_manifest(
            docker_image_name=pod.spec.containers[0].image,
            index_name=index_name,
            service_account_name=pod.spec.service_account_name,
            image_pull_secrets=image_pull_secrets,
        )
        batch_v1 = client.BatchV1Api()
        batch_v1.create_namespaced_job(body=job_manifest, namespace=namespace)
    except Exception as e:
        reporter = load_pipeline_logger()
        reporter.error(
            message="Index job manager encountered error scheduling indexing job",
            cause=e,
            stack=traceback.format_exc(),
        )
        # In the event of a catastrophic scheduling failure, something in k8s or the job manifest is likely broken.
        # Set job status to failed to prevent an infinite loop of re-scheduling
        pipelinejob = PipelineJob()
        pipeline_job = pipelinejob.load_item(sanitize_name(index_name))
        # Use attribute assignment, matching how main() updates the status of an
        # object returned by PipelineJob.load_item(); item assignment here would
        # fail inside the error handler and leave the job in a scheduled state.
        pipeline_job.status = PipelineJobState.FAILED
|
|
|
|
|
|
|
|
|
|
|
|
def _generate_aks_job_manifest(
    docker_image_name: str,
    index_name: str,
    service_account_name: str,
    image_pull_secrets: list,
) -> dict:
    """Build the k8s Job manifest that runs indexing for a single index.

    The ``manifests/job.yaml`` template (located relative to the parent of this
    file's directory) is loaded and the following fields are overridden:
    the job name, the pod service account, the image pull secrets (only when a
    non-empty value is given), and the image and command of the first container.

    Args:
        docker_image_name: Container image the job should run.
        index_name: Index name; sanitized for the job name and passed verbatim
            to ``indexer.py`` via ``-i=``.
        service_account_name: Service account assigned to the job's pod.
        image_pull_secrets: Image pull secrets for the pod; skipped when falsy.

    Returns:
        The populated manifest as a dict.
    """
    template_path = Path(__file__).resolve().parent.parent / "manifests/job.yaml"
    with template_path.open("r") as template_file:
        manifest = yaml.safe_load(template_file)

    manifest["metadata"]["name"] = f"indexing-job-{sanitize_name(index_name)}"

    pod_spec = manifest["spec"]["template"]["spec"]
    pod_spec["serviceAccountName"] = service_account_name
    if image_pull_secrets:
        pod_spec["imagePullSecrets"] = image_pull_secrets

    indexer_container = pod_spec["containers"][0]
    indexer_container["image"] = docker_image_name
    indexer_container["command"] = ["python", "indexer.py", f"-i={index_name}"]
    return manifest
|
|
|
|
|
|
|
|
|
2024-09-13 01:31:08 -04:00
|
|
|
def list_k8s_jobs(namespace: str) -> list[str]:
    """Return the names of the active indexing jobs in a k8s namespace.

    Only jobs whose name starts with ``indexing-job-`` and whose status reports
    a truthy ``active`` value are included.
    """
    config.load_incluster_config()
    namespaced_jobs = client.BatchV1Api().list_namespaced_job(namespace=namespace)
    return [
        job.metadata.name
        for job in namespaced_jobs.items
        if job.metadata.name.startswith("indexing-job-") and job.status.active
    ]
|
|
|
|
|
|
|
|
|
2024-08-09 22:22:49 -04:00
|
|
|
def main():
    """Schedule the oldest pending indexing job, if nothing is currently running.

    There are two places to check to determine if an indexing job should be executed:
        * Kubernetes: check if there are any active k8s jobs running in the cluster
        * CosmosDB: check if there are any indexing jobs in a scheduled state

    Ideally if an indexing job has finished or failed, the job status will be reflected in cosmosdb.
    However, if an indexing job failed due to OOM, the job status will not have been updated in cosmosdb.

    To avoid a catastrophic failure scenario where all indexing jobs are stuck in a scheduled state,
    both checks are necessary.

    The process exits early (via ``SystemExit``) when a job is already running
    or when no scheduled jobs exist.
    """
    # Local import: sys.exit() replaces the site-provided exit() helper, which
    # is meant for the interactive shell rather than for programs.
    import sys

    kubernetes_jobs = list_k8s_jobs(os.environ["AKS_NAMESPACE"])

    azure_storage_client_manager = AzureClientManager()
    job_container_store_client = (
        azure_storage_client_manager.get_cosmos_container_client(
            database="graphrag", container="jobs"
        )
    )

    # collect all index jobs that are scheduled; reconcile those marked as running
    scheduled_jobs = []
    for item in job_container_store_client.read_all_items():
        if item["status"] == PipelineJobState.RUNNING.value:
            if kubernetes_jobs:
                print(
                    f"Indexing job for '{item['human_readable_index_name']}' already running. Will not schedule another. Exiting..."
                )
                sys.exit()
            # if index job has running state but no associated k8s job, a catastrophic
            # failure (OOM for example) occurred. Set job status to failed.
            print(
                f"Indexing job for '{item['human_readable_index_name']}' in 'running' state but no associated k8s job found. Updating to failed state."
            )
            pipelinejob = PipelineJob()
            pipeline_job = pipelinejob.load_item(item["sanitized_index_name"])
            pipeline_job.status = PipelineJobState.FAILED
        elif item["status"] == PipelineJobState.SCHEDULED.value:
            # only the fields needed to pick the next job are kept
            scheduled_jobs.append({
                "human_readable_index_name": item["human_readable_index_name"],
                "epoch_request_time": item["epoch_request_time"],
            })

    # exit if no 'scheduled' jobs were found
    if not scheduled_jobs:
        print("No jobs found")
        sys.exit()

    # jobs should be run in the order they were requested - pick the smallest epoch_request_time
    oldest_job = min(scheduled_jobs, key=lambda job: job["epoch_request_time"])
    index_to_schedule = oldest_job["human_readable_index_name"]
    print(f"Scheduling job for index: {index_to_schedule}")
    schedule_indexing_job(index_to_schedule)
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the job manager only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|