name: Docker Build, Scan, Test

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *" # Run at midnight UTC every day
  push:
    branches:
      - master
      - releases/**
  pull_request:
    branches:
      - "**"
  release:
    types: [published]

concurrency:
  # Using `github.run_id` (unique val) instead of `github.ref` here
  # because we don't want to cancel this workflow on master only for PRs
  # as that makes reproducing issues easier
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

env:
  DOCKER_REGISTRY: "acryldata"
  DOCKER_CACHE: "DEPOT"
  DEPOT_PROJECT_ID: "${{ vars.DEPOT_PROJECT_ID }}"
  DEPOT_TOKEN: "${{ secrets.DEPOT_TOKEN }}"

permissions:
  contents: read
  id-token: write

jobs:
  # Computes tags, publish gating, CI-optimization flags, and runner selection
  # consumed by every downstream job.
  setup:
    runs-on: depot-ubuntu-24.04-small
    outputs:
      # TODO: Many of the vars below should not be required anymore.
      tag: ${{ steps.tag.outputs.tag }}
      slim_tag: ${{ steps.tag.outputs.slim_tag }}
      full_tag: ${{ steps.tag.outputs.full_tag }}
      short_sha: ${{ steps.tag.outputs.short_sha }} # needed for auto-deploy
      unique_tag: ${{ steps.tag.outputs.unique_tag }}
      unique_slim_tag: ${{ steps.tag.outputs.unique_slim_tag }}
      unique_full_tag: ${{ steps.tag.outputs.unique_full_tag }}
      docker-login: ${{ steps.docker-login.outputs.docker-login }}
      publish: ${{ steps.publish.outputs.publish }}
      pr-publish: ${{ steps.pr-publish.outputs.publish }}
      python_release_version: ${{ steps.tag.outputs.python_release_version }}
      branch_name: ${{ steps.tag.outputs.branch_name }}
      repository_name: ${{ steps.tag.outputs.repository_name }}
      # "*_change" flags force-true for non-PR events so full builds run on push/schedule/release.
      frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' || github.event_name != 'pull_request' }}
      actions_change: ${{ steps.ci-optimize.outputs.actions-change == 'true' || github.event_name != 'pull_request' }}
      ingestion_change: ${{ steps.ci-optimize.outputs.ingestion-change == 'true' || github.event_name != 'pull_request' }}
      ingestion_base_change: ${{ steps.ci-optimize.outputs.ingestion-base-change == 'true' }}
      backend_change: ${{ steps.ci-optimize.outputs.backend-change == 'true' || github.event_name != 'pull_request' }}
      frontend_only: ${{ steps.ci-optimize.outputs.frontend-only == 'true' }}
      ingestion_only: ${{ steps.ci-optimize.outputs.ingestion-only == 'true' }}
      backend_only: ${{ steps.ci-optimize.outputs.backend-only == 'true' }}
      kafka_setup_change: ${{ steps.ci-optimize.outputs.kafka-setup-change == 'true' }}
      mysql_setup_change: ${{ steps.ci-optimize.outputs.mysql-setup-change == 'true' }}
      postgres_setup_change: ${{ steps.ci-optimize.outputs.postgres-setup-change == 'true' }}
      elasticsearch_setup_change: ${{ steps.ci-optimize.outputs.elasticsearch-setup-change == 'true' }}
      smoke_test_change: ${{ steps.ci-optimize.outputs.smoke-test-change == 'true' }}
      integrations_service_change: "false"
      datahub_executor_change: "false"
      # NOTE(review): no step with id `run-publish-images` exists in this job's
      # visible steps, so this output will always be empty — confirm whether the
      # step was removed and this output is dead.
      run_publish_images: ${{ steps.run-publish-images.outputs.run_publish_images }}
      build_runner_type: ${{ steps.set-runner.outputs.build_runner_type }}
      test_runner_type: ${{ steps.set-runner.outputs.test_runner_type }}
      test_runner_type_small: ${{ steps.set-runner.outputs.test_runner_type_small }}
      use_depot_cache: ${{ steps.set-runner.outputs.use_depot_cache }}
    steps:
      - name: Check out the repo
        uses: acryldata/sane-checkout-action@v3
      - name: Compute Tag
        id: tag
        run: |
          source .github/scripts/docker_helpers.sh
          {
            echo "short_sha=${SHORT_SHA}"
            echo "tag=$(get_tag)"
            echo "slim_tag=$(get_tag_slim)"
            echo "full_tag=$(get_tag_full)"
            echo "unique_tag=$(get_unique_tag)"
            echo "unique_slim_tag=$(get_unique_tag_slim)"
            echo "unique_full_tag=$(get_unique_tag_full)"
            echo "python_release_version=$(get_python_docker_release_v)"
            echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
            echo "repository_name=${GITHUB_REPOSITORY#*/}"
          } >> "$GITHUB_OUTPUT"
      - name: Check whether docker login is possible
        id: docker-login
        env:
          ENABLE_DOCKER_LOGIN: ${{ secrets.ACRYL_DOCKER_PASSWORD != '' }}
        run: |
          echo "Enable Docker Login: ${{ env.ENABLE_DOCKER_LOGIN }}"
          echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> "$GITHUB_OUTPUT"
      - name: Check whether publishing enabled
        id: publish
        env:
          ENABLE_PUBLISH: >-
            ${{
              (github.event_name != 'pull_request')
              && ( secrets.ACRYL_DOCKER_PASSWORD != '' )
            }}
        run: |
          echo "Enable publish: ${{ env.ENABLE_PUBLISH }}"
          echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
      - name: Check whether PR publishing enabled
        id: pr-publish
        env:
          ENABLE_PUBLISH: >-
            ${{
              (github.event_name == 'pull_request'
               && (contains(github.event.pull_request.labels.*.name, 'publish')
                   || contains(github.event.pull_request.labels.*.name, 'publish-docker')))
              && ( secrets.ACRYL_DOCKER_PASSWORD != '' )
            }}
        run: |
          echo "Enable PR publish: ${{ env.ENABLE_PUBLISH }}"
          echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
      - uses: ./.github/actions/ci-optimization
        id: ci-optimize
      - name: Determine runner type
        id: set-runner
        # This needs to handle two scenarios:
        # 1. Running on a PR from a fork. There are some auth issues that prevent us from using depot in that case.
        #    So, Its easier to just use the regular github actions cache and build all images for each parallel job running smoke test.
        #    Note, concurrency is lower when using github runners, queue times can be longer, test time is longer due to fewer parallel jobs.
        # 2. Running on a PR from a branch in the datahub-project org and push/schedule events on master.
        #    Depot is used here for remote container builds in base_build and also for all runners. Depot runners support unlimited concurrency
        #    and hence short queue times and higher parallelism of smoke tests
        run: |
          if [[ "${{ env.DOCKER_CACHE }}" == "DEPOT" && "${{ env.DEPOT_PROJECT_ID }}" != "" ]]; then
            echo "build_runner_type=depot-ubuntu-24.04-4" >> "$GITHUB_OUTPUT"
            echo "test_runner_type=depot-ubuntu-24.04-4" >> "$GITHUB_OUTPUT"
            echo "test_runner_type_small=depot-ubuntu-24.04-small" >> "$GITHUB_OUTPUT"
            echo "use_depot_cache=true" >> "$GITHUB_OUTPUT"
          else
            echo "build_runner_type=ubuntu-latest" >> "$GITHUB_OUTPUT"
            echo "test_runner_type=ubuntu-latest" >> "$GITHUB_OUTPUT"
            echo "test_runner_type_small=ubuntu-latest" >> "$GITHUB_OUTPUT"
            echo "use_depot_cache=false" >> "$GITHUB_OUTPUT"
            # publishing is currently only supported via depot
          fi

  smoke_test_lint:
    name: Lint on smoke tests
    runs-on: ${{ needs.setup.outputs.test_runner_type_small }}
    needs: setup
    if: ${{ needs.setup.outputs.smoke_test_change == 'true' }}
    steps:
      - name: Check out the repo
        uses: acryldata/sane-checkout-action@v3
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"
      - uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
      - uses: actions/cache@v4
        with:
          path: |
            ~/.cache/yarn
          key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
      - name: Run lint on smoke test
        run: |
          python ./.github/scripts/check_python_package.py
          ./gradlew :smoke-test:pythonLint
          ./gradlew :smoke-test:cypressLint

  base_build:
    name: Build all images
    runs-on: ${{ needs.setup.outputs.build_runner_type }}
    needs: setup
    if: ${{ needs.setup.outputs.use_depot_cache == 'true' }} # On fork, smoke test job does the build since depot cache is not available
    outputs:
      build_id: ${{ steps.capture-build-id.outputs.build_id }}
      matrix: ${{ steps.capture-build-id.outputs.matrix }}
    steps:
      - name: Set up JDK 17
        uses: actions/setup-java@v4
        with:
          distribution: "zulu"
          java-version: 17
      - uses: actions/cache/restore@v4
        with:
          path: |
            ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-uv-
      - uses: actions/cache/restore@v4
        with:
          path: |
            ~/.cache/yarn
          key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
          restore-keys: |
            ${{ runner.os }}-yarn-
      - name: Set up Depot CLI
        if: ${{ env.DOCKER_CACHE == 'DEPOT' }}
        uses: depot/setup-action@v1
      - name: Check out the repo
        uses: acryldata/sane-checkout-action@v3
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"
      - name: Login to DockerHub
        uses: docker/login-action@v3
        if: ${{ needs.setup.outputs.docker-login == 'true' }}
        with:
          username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
          password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
      - name: Build all Images (For Smoke tests)
        if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} # If not publishing, just a subset of images required for smoke tests is sufficient.
        # NOTE(review): task name normalized from `buildImagesQuickStartDebugConsumers`
        # to `buildImagesQuickstartDebugConsumers` to match the smoke_test job —
        # Gradle camel-case task matching is hump-sensitive; confirm against docker/build.gradle.
        run: |
          ./gradlew :docker:buildImagesQuickstartDebugConsumers -Ptag=${{ needs.setup.outputs.tag }} -PpythonDockerVersion=${{ needs.setup.outputs.python_release_version }} -PdockerRegistry=${{ env.DOCKER_REGISTRY }}
      - name: Build all Images (Publish)
        if: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
        run: |
          ./gradlew :docker:buildImagesAll -PmatrixBuild=true -Ptag=${{ needs.setup.outputs.tag }} -PshaTag=${{ needs.setup.outputs.short_sha }} -PpythonDockerVersion=${{ needs.setup.outputs.python_release_version }} -PdockerRegistry=${{ env.DOCKER_REGISTRY }} -PdockerPush=true
      - name: Capture build Id
        id: capture-build-id
        run: |
          pip install jq
          DEPOT_BUILD_ID=$(jq -r '.["depot.build"]?.buildID' ${{ github.workspace }}/build/build-metadata.json)
          echo "build_id=${DEPOT_BUILD_ID}" >> "$GITHUB_OUTPUT"
          echo "matrix=$(jq -c '{"target":.["depot.build"].targets}' ${{ github.workspace }}/build/build-metadata.json)" >> "$GITHUB_OUTPUT"
      - uses: actions/cache/save@v4
        if: ${{ github.ref == 'refs/heads/master' }}
        with:
          path: |
            ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
      - uses: actions/cache/save@v4
        if: ${{ github.ref == 'refs/heads/master' }}
        with:
          path: |
            ~/.cache/yarn
          key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}

  scan_images:
    permissions:
      contents: read # for actions/checkout to fetch code
      security-events: write # for github/codeql-action/upload-sarif to upload SARIF results
      actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status
    name: Scan images for vulnerabilities
    runs-on: depot-ubuntu-24.04
    needs: [setup, base_build]
    if: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.base_build.outputs.matrix) }}
    steps:
      - name: Checkout # adding checkout step just to make trivy upload happy
        uses: acryldata/sane-checkout-action@v3
      - id: download_image
        name: Download images from depot
        if: ${{ needs.setup.outputs.use_depot_cache == 'true' }}
        run: |
          depot pull --project ${{ env.DEPOT_PROJECT_ID }} ${{ needs.base_build.outputs.build_id }} --target ${{ matrix.target }}
          docker images
          echo "docker_image=$(docker images --format '{{.Repository}}:{{.Tag}}' | grep ${{ needs.setup.outputs.tag }} )" >> "$GITHUB_OUTPUT"
      - name: Run Trivy vulnerability scanner
        uses: aquasecurity/trivy-action@0.30.0
        env:
          TRIVY_OFFLINE_SCAN: true
          TRIVY_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-db:2,ghcr.io/aquasecurity/trivy-db:2
          TRIVY_JAVA_DB_REPOSITORY: public.ecr.aws/aquasecurity/trivy-java-db:1,ghcr.io/aquasecurity/trivy-java-db:1
        with:
          image-ref: ${{ steps.download_image.outputs.docker_image }}
          format: "template"
          template: "@/contrib/sarif.tpl"
          output: "trivy-results.sarif"
          severity: "CRITICAL,HIGH"
          ignore-unfixed: true
          vuln-type: "os,library"
      - name: Upload Trivy scan results to GitHub Security tab
        uses: github/codeql-action/upload-sarif@v3
        with:
          sarif_file: "trivy-results.sarif"

  smoke_test_matrix:
    runs-on: ${{ needs.setup.outputs.test_runner_type_small }}
    needs: setup
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      cypress_batch_count: ${{ steps.set-batch-count.outputs.cypress_batch_count }}
      python_batch_count: ${{ steps.set-batch-count.outputs.python_batch_count }}
    steps:
      - id: set-batch-count
        # Tests are split simply to ensure the configured number of batches for parallelization. This may need some
        # increase as a new tests added increase the duration where an additional parallel batch helps.
        # python_batch_count is used to split pytests in the smoke-test (batches of actual test functions)
        # cypress_batch_count is used to split the collection of cypress test specs into batches.
        run: |
          if [[ "${{ needs.setup.outputs.test_runner_type }}" == "ubuntu-latest" ]]; then
            echo "cypress_batch_count=5" >> "$GITHUB_OUTPUT"
            echo "python_batch_count=3" >> "$GITHUB_OUTPUT"
          else
            echo "cypress_batch_count=11" >> "$GITHUB_OUTPUT"
            echo "python_batch_count=6" >> "$GITHUB_OUTPUT"
          fi
      - id: set-matrix
        # For m batches for python and n batches for cypress, we need a test matrix of python x m + cypress x n.
        # while the github action matrix generation can handle these two parts individually, there isnt a way to use the
        # two generated matrices for the same job. So, produce that matrix with scripting and use the include directive
        # to add it to the test matrix.
        #
        # NOTE(review): the body of this script was truncated in the source under
        # review (`for ((i=1;i> "$GITHUB_OUTPUT"`); the loops below are reconstructed
        # from the declared outputs and the batch variables — verify against history.
        run: |
          python_batch_count=${{ steps.set-batch-count.outputs.python_batch_count }}
          python_matrix='{"test_strategy":"pytests","batch":"0","batch_count":"'"$python_batch_count"'"}'
          for ((i=1;i<python_batch_count;i++)); do
            python_matrix="$python_matrix,"'{"test_strategy":"pytests","batch":"'"$i"'","batch_count":"'"$python_batch_count"'"}'
          done

          cypress_batch_count=${{ steps.set-batch-count.outputs.cypress_batch_count }}
          cypress_matrix='{"test_strategy":"cypress","batch":"0","batch_count":"'"$cypress_batch_count"'"}'
          for ((i=1;i<cypress_batch_count;i++)); do
            cypress_matrix="$cypress_matrix,"'{"test_strategy":"cypress","batch":"'"$i"'","batch_count":"'"$cypress_batch_count"'"}'
          done

          echo "matrix={\"include\":[$python_matrix,$cypress_matrix]}" >> "$GITHUB_OUTPUT"

  smoke_test:
    name: Run Smoke Tests (${{ matrix.test_strategy }}, Batch ${{ matrix.batch }}/${{ matrix.batch_count }})
    runs-on: ${{ needs.setup.outputs.test_runner_type }}
    needs: [setup, smoke_test_matrix, base_build]
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.smoke_test_matrix.outputs.matrix) }}
    # base_build is skipped on forks; run as long as nothing upstream failed and
    # the matrix is non-empty.
    if: ${{ always() && !failure() && !cancelled() && needs.smoke_test_matrix.outputs.matrix != '[]' }}
    steps:
      - name: Free up disk space
        if: ${{ !contains(needs.setup.outputs.test_runner_type, 'depot') }}
        run: |
          sudo apt-get remove 'dotnet-*' azure-cli || true
          sudo rm -rf /usr/local/lib/android/ || true
          sudo docker image prune -a -f || true
      - uses: actions/cache/restore@v4
        with:
          path: |
            ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-uv-
      - uses: actions/cache/restore@v4
        with:
          path: |
            ~/.npm
            ~/.cache/Cypress
            ~/.cache/yarn
          key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}
          restore-keys: |
            ${{ runner.os }}-yarn-
      - name: Check out the repo
        uses: acryldata/sane-checkout-action@v3
      - name: Set up Depot CLI
        if: ${{ needs.setup.outputs.use_depot_cache == 'true' }}
        uses: depot/setup-action@v1
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"
      - uses: gradle/actions/setup-gradle@v4
        if: ${{ needs.setup.outputs.use_depot_cache != 'true' }}
      - name: Login to DockerHub
        uses: docker/login-action@v3
        if: ${{ needs.setup.outputs.docker-login == 'true' }}
        with:
          username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
          password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
      # Fork path: no depot cache available, so each matrix job builds the images itself.
      - name: build images
        if: ${{ needs.setup.outputs.use_depot_cache != 'true' }}
        run: |
          ./gradlew :docker:buildImagesQuickstartDebugConsumers -Ptag=${{ needs.setup.outputs.tag }} -PpythonDockerVersion=${{ needs.setup.outputs.python_release_version }} -PdockerRegistry=${{ env.DOCKER_REGISTRY }}
          docker images
        env:
          DOCKER_CACHE: GITHUB
      - name: pull images from depot
        if: ${{ needs.setup.outputs.use_depot_cache == 'true' }}
        run: |
          depot pull --project ${{ env.DEPOT_PROJECT_ID }} ${{ needs.base_build.outputs.build_id }}
          docker images
      - name: run quickstart
        env:
          DATAHUB_TELEMETRY_ENABLED: false
          DATAHUB_VERSION: ${{ needs.setup.outputs.tag }}
          # NOTE(review): env.DATAHUB_ACTIONS_IMAGE is not defined in this
          # workflow's top-level env block, so this expands to empty here —
          # confirm whether it is expected to come from elsewhere.
          DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_ACTIONS_IMAGE }}
          ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor] acryl-datahub-actions"
          ACTIONS_CONFIG: "https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml"
        run: |
          ./smoke-test/run-quickstart.sh
      - name: Disk Check
        run: df -h . && docker images
      - name: Disable ES Disk Threshold
        run: |
          curl -XPUT "http://localhost:9200/_cluster/settings" \
            -H 'Content-Type: application/json' -d'{
              "persistent": {
                "cluster": {
                  "routing": {
                    "allocation.disk.threshold_enabled": false
                  }
                }
              }
            }'
      - name: Install dependencies
        run: ./metadata-ingestion/scripts/install_deps.sh
      - name: Build datahub cli
        run: |
          ./gradlew :metadata-ingestion:install
      - name: Smoke test
        env:
          RUN_QUICKSTART: false
          DATAHUB_VERSION: ${{ needs.setup.outputs.tag }}
          CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }}
          CLEANUP_DATA: "false"
          TEST_STRATEGY: ${{ matrix.test_strategy }}
          BATCH_COUNT: ${{ matrix.batch_count }}
          BATCH_NUMBER: ${{ matrix.batch }}
        run: |
          echo "$DATAHUB_VERSION"
          ./gradlew --stop
          ./smoke-test/smoke.sh
      - name: Disk Check
        run: df -h . && docker images
      - name: store logs
        if: failure()
        run: |
          docker ps -a
          TEST_STRATEGY="-${{ matrix.test_strategy }}-${{ matrix.batch }}"
          source .github/scripts/docker_logs.sh
      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: failure()
        with:
          name: docker-logs-${{ matrix.test_strategy }}-${{ matrix.batch }}
          path: "docker_logs/*.log"
          retention-days: 5
      - name: Upload screenshots
        uses: actions/upload-artifact@v4
        if: failure()
        with:
          name: cypress-snapshots-${{ matrix.test_strategy }}-${{ matrix.batch }}
          path: smoke-test/tests/cypress/cypress/screenshots/
      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: Test Results (smoke tests) ${{ matrix.test_strategy }} ${{ matrix.batch }}
          path: |
            **/build/reports/tests/test/**
            **/build/test-results/test/**
            **/junit.*.xml
            !**/binary/**
      - name: Upload test results to Codecov
        if: ${{ !cancelled() }}
        uses: codecov/test-results-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          override_branch: ${{ github.head_ref || github.ref_name }}
      - uses: actions/cache/save@v4
        if: ${{ github.ref == 'refs/heads/master' }}
        with:
          path: |
            ~/.cache/uv
          key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
      - uses: actions/cache/save@v4
        if: ${{ github.ref == 'refs/heads/master' }}
        with:
          path: |
            ~/.cache/yarn
          key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }}

  deploy_datahub_head:
    name: Deploy to Datahub HEAD
    runs-on: ubuntu-latest
    needs: [setup, smoke_test_lint, smoke_test]
    steps:
      - uses: aws-actions/configure-aws-credentials@v4
        if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
        with:
          aws-access-key-id: ${{ secrets.AWS_SQS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SQS_ACCESS_KEY }}
          aws-region: us-west-2
      - uses: isbang/sqs-action@v0.2.0
        if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
        with:
          sqs-url: ${{ secrets.DATAHUB_HEAD_SYNC_QUEUE }}
          message: '{ "command": "git-sync", "args" : {"repoName": "${{ needs.setup.outputs.repository_name }}", "repoOrg": "${{ github.repository_owner }}", "repoBranch": "${{ needs.setup.outputs.branch_name }}", "repoShaShort": "${{ needs.setup.outputs.short_sha }}" }}'