Handle previously failed indexing jobs (#206)

Co-authored-by: Josh Bradley <joshbradley@microsoft.com>
2025-06-27 04:39:57 +00:00 · 2024-12-30 02:09:45 -05:00 · 2024-12-30 02:09:45 -05:00 · 900c503573
commit 900c503573
parent c7b5f96a03
3 changed files with 5 additions and 5 deletions
--- a/backend/indexing-job-manager-template.yaml
+++ b/backend/indexing-job-manager-template.yaml
@@ -12,7 +12,6 @@ spec:
  schedule: "*/5 * * * *"
  jobTemplate:
    spec:
-      ttlSecondsAfterFinished: 30
      template:
        metadata:
          labels:
--- a/backend/indexing-job-template.yaml
+++ b/backend/indexing-job-template.yaml
@@ -26,8 +26,8 @@ spec:
        imagePullPolicy: Always
        resources:
          requests:
-            cpu: "6"
-            memory: "56Gi"
+            cpu: "5"
+            memory: "36Gi"
          limits:
            cpu: "8"
            memory: "64Gi"
--- a/backend/manage-indexing-jobs.py
+++ b/backend/manage-indexing-jobs.py
@@ -88,7 +88,8 @@ def list_k8s_jobs(namespace: str) -> list[str]:
    jobs = batch_v1.list_namespaced_job(namespace=namespace)
    job_list = []
    for job in jobs.items:
-        job_list.append(job.metadata.name)
+        if job.metadata.name.startswith("indexing-job-") and job.status.active:
+            job_list.append(job.metadata.name)
    return job_list


@@ -124,7 +125,7 @@ def main():
                )
                pipelinejob = PipelineJob()
                pipeline_job = pipelinejob.load_item(item["sanitized_index_name"])
-                pipeline_job["status"] = PipelineJobState.FAILED.value
+                pipeline_job.status = PipelineJobState.FAILED
            else:
                print(
                    f"Indexing job for '{item['human_readable_index_name']}' already running. Will not schedule another. Exiting..."