feat(ci): make datahub-actions docker build standalone (#13241)

This commit is contained in:
Harshal Sheth 2025-04-16 23:51:46 -07:00 committed by GitHub
parent 4e37202373
commit b75dbaa3a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 156 additions and 93 deletions

View File

@ -1043,7 +1043,7 @@ jobs:
datahub_actions_build:
name: Build and Push DataHub Actions Docker Image
runs-on: depot-ubuntu-24.04
needs: [setup, base_build]
needs: [setup]
if: ${{ needs.setup.outputs.actions_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true'}}
steps:
- name: Check out the repo
@ -1061,6 +1061,8 @@ jobs:
image_tag: ${{ needs.setup.outputs.tag }}
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
build-args: |
RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }}
publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
context: .
file: ./docker/datahub-actions/Dockerfile
@ -1103,7 +1105,7 @@ jobs:
datahub_actions_slim_build:
name: Build and Push DataHub Actions Slim Docker Image
runs-on: depot-ubuntu-24.04
needs: [setup, base_build]
needs: [setup]
outputs:
tag: ${{ steps.tag.outputs.tag }}
needs_artifact_download: ${{ needs.setup.outputs.actions_change == 'true' && ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true') }}
@ -1123,6 +1125,7 @@ jobs:
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
build-args: |
APP_ENV=slim
RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }}
publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
context: .
file: ./docker/datahub-actions/Dockerfile

View File

@ -29,6 +29,7 @@ ext {
docker_repo = 'datahub-actions'
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "v${version}${docker_target == 'slim' ? '-slim' : ''}"
python_docker_version = project.getProperties().getOrDefault("pythonDockerVersion", "1!0.0.0+docker.${version}")
}
if (!project.hasProperty("extra_pip_requirements")) {
@ -152,11 +153,13 @@ docker {
exclude "**/*.xml"
include ".dockerignore"
include "docker/datahub-actions/**"
include "docker/snippets/**"
include "datahub-actions/**"
include "python-build/**"
}.exclude {
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
// buildArgs([APP_ENV: 'prod-slim'])
buildArgs([APP_ENV: docker_target, RELEASE_VERSION: python_docker_version])
additionalTag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -14,11 +14,11 @@
# Published at https://pypi.org/project/acryl-datahub-actions/.
__package_name__ = "acryl-datahub-actions"
__version__ = "0.0.0.dev0"
__version__ = "1!0.0.0.dev0"
def is_dev_mode() -> bool:
return __version__ == "0.0.0.dev0"
return __version__ == "1!0.0.0.dev0"
def nice_version_name() -> str:

View File

@ -1,61 +1,160 @@
# Copyright 2021 Acryl Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This image has two variants: full and slim.
# The full variant has additional deps preinstalled, like a JRE and Oracle client.
ARG APP_ENV=full
ARG PYTHON_VERSION=3.10
# Defining environment
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
ARG DOCKER_VERSION=head-full
ARG APP_ENV=prod
# INLINE-BEGIN @/docker/snippets/ingestion_base
# This is the "base" image workflow.
# While it has a bunch of intermediate stages, it "exports" a couple
# stages for consumption.
# - python-base: A basic stage, with basic deps, Python, and a venv.
# - ingestion-base-slim: python-base plus the native client libraries (LDAP, SASL, Kerberos, libaio, librdkafka, ODBC).
# - ingestion-base-full: Adds a JRE and Oracle client.
FROM $BASE_IMAGE:$DOCKER_VERSION AS base
FROM ubuntu:24.04 AS python-base
# Optionally point apt at a mirror instead of the default ports.ubuntu.com archive.
# Ubuntu 24.04 keeps its apt sources in the deb822-format file
# /etc/apt/sources.list.d/ubuntu.sources, while older releases use /etc/apt/sources.list,
# so the URL is rewritten in whichever of the two files exists.
# NOTE(review): only the ports.ubuntu.com URL is matched; images that use a different
# default archive host would be left unchanged - confirm whether that is intended.
ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports
RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then \
        for sources_file in /etc/apt/sources.list /etc/apt/sources.list.d/ubuntu.sources ; do \
            if [ -f "${sources_file}" ] ; then \
                sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" "${sources_file}" ; \
            fi ; \
        done ; \
    fi
# Ensure a `datahub` group (GID 1000) and `datahub` user (UID 1000) exist, with
# /home/datahub as the home directory owned by that user.
#
# The base image may already contain a group and/or user with ID 1000 under a
# different name, so each half of the command handles three cases:
#   - ID 1000 exists under another name -> rename it to `datahub`
#   - ID 1000 does not exist            -> create `datahub` with that ID
#   - ID 1000 is already `datahub`      -> nothing to do
#
# Notes on the shell details:
#   - `getent group 1000 | cut ...` yields an empty string when no such group exists;
#     the pipeline's exit status is that of `cut`, so the `&&` chain continues.
#   - `id -nu 1000 ... || echo ""` likewise produces an empty string instead of failing
#     when UID 1000 is unassigned.
#   - When renaming a user, `usermod -l ... -d $HOME` changes the login name and the
#     recorded home path, and the second `usermod -g` sets its primary group to `datahub`.
#   - The final `mkdir -p` / `chown -R` run in every case, so $HOME exists and is owned
#     by datahub:datahub.
ENV HOME=/home/datahub
RUN existing_group=$(getent group 1000 | cut -d: -f1) && \
if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \
echo "Renaming existing group $existing_group to datahub"; \
groupmod -n datahub "$existing_group"; \
elif [ -z "$existing_group" ]; then \
echo "Creating new group datahub with GID 1000"; \
addgroup --gid 1000 datahub; \
fi && \
existing_user=$(id -nu 1000 2>/dev/null || echo "") && \
if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \
echo "Renaming existing user $existing_user to datahub"; \
usermod -l datahub -d $HOME "$existing_user"; \
usermod -g datahub datahub; \
elif [ -z "$existing_user" ]; then \
echo "Creating new user datahub with UID 1000"; \
adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \
fi && \
# Create and set proper permissions for datahub directories
mkdir -p $HOME && \
chown -R datahub:datahub $HOME
# Setup the PPA for alternative Python versions.
# TODO: Eventually we should switch to using uv's support for python-build-standalone.
RUN apt-get update && apt-get install -y \
software-properties-common \
lsb-release \
gnupg \
ca-certificates \
&& add-apt-repository --no-update ppa:deadsnakes/ppa \
&& rm -rf /var/lib/apt/lists/*
# Install the requested Python interpreter plus basic command-line tooling.
#
# PYTHON_VERSION is declared with a default before the first FROM; an ARG declared
# there is only visible to FROM lines, so it is re-declared here (without a value)
# to bring it into this stage.
ARG PYTHON_VERSION
# Fail the build early, with an obvious cause, if the arg ended up empty.
RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set
# Installs the interpreter along with its venv, dev, and distutils packages, and
# the python-is-python3 package.
# git/wget/curl/zip/unzip/nano are general-purpose tools baked into the image.
# The apt lists are removed in the same layer so they do not add to the image size.
# NOTE(review): a python<version>-distutils package may not be published for newer
# Python versions - confirm before overriding PYTHON_VERSION.
RUN apt-get update && apt-get install -y \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-distutils \
python-is-python3 \
git \
wget \
curl \
zip \
unzip \
nano \
&& rm -rf /var/lib/apt/lists/*
# Set the default python version.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --install /usr/bin/python python /usr/bin/python3 1
# Copy the uv and uvx binaries out of the upstream uv image.
# NOTE(review): the `latest` tag is unpinned, so a rebuild can pick up a different
# uv release - consider pinning a specific version or digest.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Optional package-index mirror. When overridden, pip is configured to use it, and
# uv is pointed at the same URL through UV_INDEX_URL.
# The URL is quoted so that characters special to the shell in a custom mirror URL
# cannot split or alter the command.
ARG PIP_MIRROR_URL=https://pypi.python.org/simple
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url "${PIP_MIRROR_URL}" ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}
USER datahub
WORKDIR $HOME
RUN uv venv --python "$PYTHON_VERSION"
ENV VIRTUAL_ENV=$HOME/.venv
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
# We always want to use the system CA bundle.
# Requests comes with its own CA bundle, which we need to override.
ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
# uv uses a different mechanism. See https://github.com/astral-sh/uv/issues/1474.
ENV SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
# ingestion-base-slim: python-base plus the system client libraries
# (LDAP, SASL, Kerberos, libaio, librdkafka, ODBC) installed below.
FROM python-base AS ingestion-base-slim

# Package installation needs root; the unprivileged user is restored at the end.
USER 0

# Packages are listed alphabetically, one per line. The apt lists are removed in
# the same layer that created them.
RUN apt-get update \
    && apt-get install -y \
        krb5-config \
        krb5-user \
        ldap-utils \
        libaio-dev \
        libaio1t64 \
        libkrb5-dev \
        libldap2-dev \
        libodbc2 \
        librdkafka-dev \
        libsasl2-dev \
        libsasl2-modules \
        libsasl2-modules-gssapi-mit \
        python3-ldap \
        unixodbc \
    && rm -rf /var/lib/apt/lists/*

USER datahub
FROM ingestion-base-slim AS ingestion-base-full
USER 0
# We need to install build-essential in order to build some Python packages (e.g. python-ldap)
RUN apt-get update && apt-get install --no-install-recommends -y -qq \
default-jre-headless \
build-essential \
&& rm -rf /var/lib/apt/lists/*
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
/oracle_instantclient.sh
USER datahub
# INLINE-END
FROM ingestion-base-${APP_ENV} AS final
USER root
COPY --from=powerman/dockerize:0.19 /usr/local/bin/dockerize /usr/local/bin
COPY --chown=datahub:datahub ./docker/datahub-actions/start.sh /start_datahub_actions.sh
COPY --chown=datahub:datahub ./docker/datahub-actions/readiness-check.sh /readiness-check.sh
RUN chmod a+x /start_datahub_actions.sh && \
mkdir -p /etc/datahub/actions && \
mkdir -p /tmp/datahub/logs/actions/system && \
chown -R datahub:datahub /etc/datahub /tmp/datahub && \
apt-get update && \
apt-get install -y -qq default-jre && \
apt-get clean && \
rm -rf /var/lib/{apt,dpkg,cache,log}/
chown -R datahub:datahub /etc/datahub /tmp/datahub
COPY --chown=datahub:datahub ./datahub-actions /actions-src
COPY --chown=datahub:datahub ./datahub-actions /datahub-actions
# Add other default configurations into this!
COPY --chown=datahub:datahub ./docker/datahub-actions/config /etc/datahub/actions/system/conf
USER datahub
WORKDIR /actions-src
FROM base AS slim-install
ARG RELEASE_VERSION
RUN test -n "$RELEASE_VERSION" # RELEASE_VERSION is a required build arg
RUN --mount=type=bind,source=./python-build/version_updater.py,target=/version_updater.py \
python /version_updater.py --directory /datahub-actions/ --version "$RELEASE_VERSION" --expected-update-count 1
# Effectively builds the image without the .cache (looks like it is duplicated but is not)
RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
UV_LINK_MODE=copy uv pip install -e ".[all]"
FROM base AS prod-install
RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
UV_LINK_MODE=copy uv pip install -e ".[all]"
FROM ${APP_ENV}-install AS final
WORKDIR /datahub-ingestion
# This is required to fix security vulnerability in htrace-core4
RUN find . -name "htrace-core4-4.1.0-incubating.jar" -exec rm "{}" \;
# For the datahub-actions build, we explicitly want to retain the uv cache.
# This speeds up the process of creating venvs at runtime.
# Because uv uses hardlinks for installing packages, keeping the cache around does not
# really impact image size.
RUN uv pip install -e "/datahub-actions/[all]"
# Clear any entrypoint inherited from earlier stages so that CMD alone defines
# the default process.
ENTRYPOINT [ ]

# Wait (up to 240s) for the GMS health endpoint, then hand over to the start script.
# A shell is still required to expand the DATAHUB_GMS_* variables at container start,
# but `exec` replaces that shell with dockerize. dockerize therefore runs as PID 1 and
# receives the stop signal directly, instead of it being delivered to an intermediate
# /bin/sh.
CMD ["/bin/sh", "-c", "exec dockerize -wait ${DATAHUB_GMS_PROTOCOL:-http}://$DATAHUB_GMS_HOST:$DATAHUB_GMS_PORT/health -timeout 240s /start_datahub_actions.sh"]

View File

@ -1,10 +0,0 @@
docker/datahub-actions/Dockerfile
.git
**scripts/
**build/
**venv/
**tests/
**smoke-test/
**/*.xml
.*/
datahub-actions/.*/

View File

@ -119,7 +119,7 @@ RUN apt-get update && apt-get install --no-install-recommends -y -qq \
build-essential \
&& rm -rf /var/lib/apt/lists/*
RUN --mount=type=bind,source=./docker/datahub-ingestion/oracle_instantclient.sh,target=/oracle_instantclient.sh \
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
/oracle_instantclient.sh
USER datahub

View File

@ -120,7 +120,7 @@ RUN apt-get update && apt-get install --no-install-recommends -y -qq \
build-essential \
&& rm -rf /var/lib/apt/lists/*
RUN --mount=type=bind,source=./docker/datahub-ingestion/oracle_instantclient.sh,target=/oracle_instantclient.sh \
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
/oracle_instantclient.sh
USER datahub

View File

@ -27,6 +27,7 @@ docker {
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
include "docker/snippets/*"
include "metadata-ingestion/**"
include "metadata-ingestion-modules/**"
include "python-build/**"

View File

@ -1,34 +0,0 @@
#!/bin/bash
set -ex
PYSPARK_JARS="$(python -c 'import site; print(site.getsitepackages()[0])')/pyspark/jars"
# Replace the jars in $PYSPARK_JARS that match a filename prefix with jars
# resolved from Maven.
#
# Arguments:
#   $1  jar filename prefix; every "$PYSPARK_JARS/<prefix>*.jar" is removed first
#   $2  "true" / "false", forwarded to Maven as -Dtransitive
#   $3  Maven coordinates to resolve; when empty, matching jars are only removed
#
# Side effects: deletes "$HOME/.m2" on every call, so each resolution starts from
# an empty local repository and only freshly downloaded jars can be copied.
function replace_jar {
  # Keep the arguments local so they do not leak into the calling script.
  local jar_prefix=$1
  local transitive=$2
  local dependency=$3

  echo "Removing version conflicts for $PYSPARK_JARS/$jar_prefix*.jar"
  # Both commands are best-effort: there may be no matching jar to list or remove.
  ls "$PYSPARK_JARS/$jar_prefix"*.jar || true
  rm "$PYSPARK_JARS/$jar_prefix"*.jar || true
  rm -r "$HOME/.m2" || true

  if [ -n "$dependency" ]; then
    echo "Resolving $dependency"
    mvn dependency:get -Dtransitive="$transitive" -Dartifact="$dependency" >/dev/null
    echo "Moving jars to $PYSPARK_JARS"
    # Print each matching jar and copy it in a single traversal of the repository.
    find "$HOME/.m2" -type f -name "$jar_prefix*.jar" -print -exec cp {} "$PYSPARK_JARS/" \;
  fi
}
# Swap out selected jars bundled with PySpark for specific versions.
# Each artifact can be overridden through the environment variable named in the
# corresponding ${...:-default} expansion.
#
# NOTE(review): the two hadoop-client calls share the prefix "hadoop-client-".
# replace_jar starts by removing every jar with that prefix from $PYSPARK_JARS, so
# the second call also deletes whatever the first call copied in. The hadoop-client-api
# jar is presumably restored through the transitive resolution of hadoop-client-runtime -
# confirm that this is what is relied on.
replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"

View File

@ -4,5 +4,5 @@ RUN apt-get update && apt-get install --no-install-recommends -y -qq \
build-essential \
&& rm -rf /var/lib/apt/lists/*
RUN --mount=type=bind,source=./docker/datahub-ingestion/oracle_instantclient.sh,target=/oracle_instantclient.sh \
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
/oracle_instantclient.sh

View File

@ -87,5 +87,6 @@ if __name__ == "__main__":
for file in [
"docker/datahub-ingestion-base/Dockerfile",
"docker/datahub-ingestion/Dockerfile",
"docker/datahub-actions/Dockerfile",
]:
update_template(Path(_repo_root / file), check_only=check_only)