From e2d8a93cf07cd98401f5156eb61c73efb7350185 Mon Sep 17 00:00:00 2001
From: Pedro Silva
Date: Fri, 14 May 2021 03:57:55 +0100
Subject: [PATCH] feat(k8s): generalizes CronJob metadata ingestion resource for custom logic (#2467)

---
Review notes (ignored by `git am`, not part of the commit message):
- templates/cron.yaml: the `recipe` volume mount is rendered unconditionally,
  matching the unconditional `recipe` volume; only the extra mounts stay
  behind the `extraVolumeMounts` conditional.
- templates/cron.yaml: `crons.<name>.command` may be a list (rendered as the
  container command as-is) or a string (quoted and run through `/bin/sh -c`).
- README.md / values.yaml: documentation aligned with the template keys.
- Leading whitespace was reconstructed; the `index` blob ids and the commit id
  above still refer to the original revision of this patch.

 .../charts/datahub-ingestion-cron/Chart.yaml  |  2 +-
 .../charts/datahub-ingestion-cron/README.md   | 10 ++--
 .../templates/cron.yaml                       | 21 ++++---
 .../charts/datahub-ingestion-cron/values.yaml | 56 +++++++++++--------
 4 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/Chart.yaml b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/Chart.yaml
index 245fb86ec4..63d5199523 100644
--- a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/Chart.yaml
+++ b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/Chart.yaml
@@ -14,7 +14,7 @@ type: application
 
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
-version: 0.2.1
+version: 0.2.2
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application.
diff --git a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/README.md b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/README.md
index 2979da65c0..52301f01d5 100644
--- a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/README.md
+++ b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/README.md
@@ -11,10 +11,12 @@ A Helm chart for datahub's metadata-ingestion framework with kerberos authentica
 | image.tag | string | `"latest"` | DataHub Ingestion image tag |
 | imagePullSecrets | array | `[]` (does not add image pull secrets to deployed pods) | Docker registry secret names as an array |
 | labels | string | `{}` | Metadata labels to be added to each crawling cron job |
-| crons | type | `[]` | A list of crawling parameters per different technology being crawler |
-| crons.name | string | `crawler` | Name of the crawler container |
-| crons.schedule | string | `""0 0 * * *"` | Cron expression (daily at midnight) for crawler jobs |
-| crons.crawlerConfigPath | string | N/A | Path to metadata configuration file. This must explicitly defined as a mount and is **required**. |
+| crons | object | `{}` | A map of crawling parameters, one entry per technology being crawled; the key of each entry is used as the name of the resulting cron job |
+| crons.schedule | string | `"0 0 * * *"` | Cron expression (default is daily at midnight) for crawler jobs |
+| crons.recipe | object | `{}` | Recipe configuration to be executed (required) |
+| crons.recipe.configmapName | string | `""` | Name of the configmap to be mounted (at `/etc/recipe`) containing the recipe to be executed (required) |
+| crons.recipe.fileName | string | `""` | Name of the property within the configmap referenced by `crons.recipe.configmapName` that holds the concrete recipe definition |
+| crons.command | array | `["/bin/sh", "-c", "datahub ingest -c /etc/recipe/{crons.recipe.fileName}"]` | Array of strings denoting the crawling command to be invoked in the cron job; a single string is also accepted and is executed through `/bin/sh -c`. By default it will execute the recipe defined in the `crons.recipe` object. Cron crawling customization is possible by having extra volumes with custom logic to be executed. |
 | crons.hostAliases | array | `[]` | host aliases |
 | crons.env | object | `{}` | Environment variables to add to the cronjob container |
 | crons.envFromSecrets | object | `{}` | Environment variables from secrets to the cronjob container |
diff --git a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/templates/cron.yaml b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/templates/cron.yaml
index 8b8db74ea6..5c4491c979 100644
--- a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/templates/cron.yaml
+++ b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/templates/cron.yaml
@@ -1,10 +1,11 @@
 {{- $baseName := include "datahub-ingestion-cron.fullname" .}}
 {{- $labels := include "datahub-ingestion-cron.labels" .}}
-{{- range $job, $val := .Values.crons }}
+{{- range $jobName, $val := .Values.crons }}
+{{- $defaultCommand := printf "datahub ingest -c /etc/recipe/%s" $val.recipe.fileName }}
 apiVersion: batch/v1beta1
 kind: CronJob
 metadata:
-  name: "{{ $baseName }}-{{ .name }}"
+  name: "{{ $baseName }}-{{ $jobName }}"
   labels: {{- $labels | nindent 4 }}
 spec:
   schedule: {{ default "0 0 * * *" .schedule | quote}}
@@ -24,17 +25,16 @@ spec:
           hostAliases: {{- include "common.tplvalues.render" (dict "value" .hostAliases "context" $) | nindent 10 }}
         {{- end }}
           containers:
-            - name: {{ default "crawler" .name }}
+            - name: {{ $jobName }}-crawler
               image: "{{ $.Values.image.repository }}:{{ $.Values.image.tag }}"
               imagePullPolicy: {{ $.Values.image.pullPolicy }}
-              {{- if .extraVolumeMounts }}
               volumeMounts:
+              - name: recipe
+                mountPath: /etc/recipe
+              {{- if .extraVolumeMounts }}
               {{- toYaml .extraVolumeMounts | nindent 14 }}
               {{- end }}
-              command:
-                - /bin/sh
-                - -c
-                - datahub ingest -c {{ required "Path to configuration file is required" .crawlerConfigPath }}
+              command: {{ if kindIs "slice" .command }}{{ toJson .command }}{{ else }}["/bin/sh", "-c", {{ default $defaultCommand .command | quote }}]{{ end }}
               env:
               {{- if .env }}
               {{- range $key,$value := .env }}
@@ -52,8 +52,11 @@ spec:
               {{- end }}
               {{- end }}
           restartPolicy: OnFailure
-          {{- if .extraVolumes }}
           volumes:
+            - name: recipe
+              configMap:
+                name: {{ required "A valid .recipe.configmapName entry is required!" $val.recipe.configmapName }}
+          {{- if .extraVolumes }}
           {{- toYaml .extraVolumes | nindent 12 }}
           {{- end }}
 ---
diff --git a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/values.yaml b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/values.yaml
index 28a4bcb69e..f1106c5915 100644
--- a/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/values.yaml
+++ b/datahub-kubernetes/datahub/charts/datahub-ingestion-cron/values.yaml
@@ -9,34 +9,44 @@ image:
 
 imagePullSecrets: []
 
-crons: []
+crons: {}
   #### Example data
-  ## Metadata ingestion name
-  ##
-  #name: "crawler"
+  #hive:
+    ## Daily at midnight (we may want to offset this to not conflict with other processes)
+    #schedule: "0 0 * * *"
 
-  ## Daily at midnight (we may want to offset this to not conflict with other processes)
-  #schedule: "0 0 * * *"
+    #recipe:
+    #  configmapName:
+    #  fileName:
 
-  ## Deployment pod host aliases
-  ## https://kubernetes.io/docs/concepts/services-networking/add-entries-to-pod-etc-hosts-with-host-aliases/
-  ##
-  #hostAliases: []
+    ## Command to be executed (defaults to ingesting /etc/recipe/{recipe.fileName})
+    #command: ["/bin/sh", "-c", "datahub ingest -c /etc/recipe/recipe.yml"]
 
-  ## Environment variables.
-  #env: {}
+    ## Deployment pod host aliases
+    ## https://kubernetes.io/docs/concepts/services-networking/add-entries-to-pod-etc-hosts-with-host-aliases/
+    ##
+    #hostAliases: []
 
-  ## Environment variables from Secret resources.
-  #envFromSecrets: {}
+    ## Environment variables.
+    #env: {}
 
-  ## Additional primary volume mounts
-  ##
-  #extraVolumeMounts: []
+    ## Environment variables from Secret resources.
+    #envFromSecrets: {}
 
-  ## Additional primary volumes
-  ##
-  #extraVolumes: []
+    ## Additional primary volume mounts
+    ##
+    #extraVolumeMounts:
+    #- name: configmap-volume
+    #  mountPath: config.yml
+    #  subPath: config.yml
 
-  ## Add your own init container or uncomment and modify the given example.
-  ##
-  #extraInitContainers: {}
\ No newline at end of file
+    ## Additional primary volumes
+    ##
+    #extraVolumes:
+    #- name: configmap-volume
+    #  configMap:
+    #    name: crawler-config
+
+    ## Add your own init container or uncomment and modify the given example.
+    ##
+    #extraInitContainers: {}