mirror of
https://github.com/datahub-project/datahub.git
synced 2025-06-27 05:03:31 +00:00
feat(ci): make datahub-actions docker build standalone (#13241)
This commit is contained in:
parent
4e37202373
commit
b75dbaa3a1
7
.github/workflows/docker-unified.yml
vendored
7
.github/workflows/docker-unified.yml
vendored
@ -1043,7 +1043,7 @@ jobs:
|
||||
datahub_actions_build:
|
||||
name: Build and Push DataHub Actions Docker Image
|
||||
runs-on: depot-ubuntu-24.04
|
||||
needs: [setup, base_build]
|
||||
needs: [setup]
|
||||
if: ${{ needs.setup.outputs.actions_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true'}}
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
@ -1061,6 +1061,8 @@ jobs:
|
||||
image_tag: ${{ needs.setup.outputs.tag }}
|
||||
username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
|
||||
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
|
||||
build-args: |
|
||||
RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }}
|
||||
publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
|
||||
context: .
|
||||
file: ./docker/datahub-actions/Dockerfile
|
||||
@ -1103,7 +1105,7 @@ jobs:
|
||||
datahub_actions_slim_build:
|
||||
name: Build and Push DataHub Actions Slim Docker Image
|
||||
runs-on: depot-ubuntu-24.04
|
||||
needs: [setup, base_build]
|
||||
needs: [setup]
|
||||
outputs:
|
||||
tag: ${{ steps.tag.outputs.tag }}
|
||||
needs_artifact_download: ${{ needs.setup.outputs.actions_change == 'true' && ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true') }}
|
||||
@ -1123,6 +1125,7 @@ jobs:
|
||||
password: ${{ secrets.ACRYL_DOCKER_PASSWORD }}
|
||||
build-args: |
|
||||
APP_ENV=slim
|
||||
RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }}
|
||||
publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
|
||||
context: .
|
||||
file: ./docker/datahub-actions/Dockerfile
|
||||
|
@ -29,6 +29,7 @@ ext {
|
||||
docker_repo = 'datahub-actions'
|
||||
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
|
||||
docker_version = "v${version}${docker_target == 'slim' ? '-slim' : ''}"
|
||||
python_docker_version = project.getProperties().getOrDefault("pythonDockerVersion", "1!0.0.0+docker.${version}")
|
||||
}
|
||||
|
||||
if (!project.hasProperty("extra_pip_requirements")) {
|
||||
@ -152,11 +153,13 @@ docker {
|
||||
exclude "**/*.xml"
|
||||
include ".dockerignore"
|
||||
include "docker/datahub-actions/**"
|
||||
include "docker/snippets/**"
|
||||
include "datahub-actions/**"
|
||||
include "python-build/**"
|
||||
}.exclude {
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
// buildArgs([APP_ENV: 'prod-slim'])
|
||||
buildArgs([APP_ENV: docker_target, RELEASE_VERSION: python_docker_version])
|
||||
|
||||
additionalTag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -14,11 +14,11 @@
|
||||
|
||||
# Published at https://pypi.org/project/acryl-datahub-actions/.
|
||||
__package_name__ = "acryl-datahub-actions"
|
||||
__version__ = "0.0.0.dev0"
|
||||
__version__ = "1!0.0.0.dev0"
|
||||
|
||||
|
||||
def is_dev_mode() -> bool:
|
||||
return __version__ == "0.0.0.dev0"
|
||||
return __version__ == "1!0.0.0.dev0"
|
||||
|
||||
|
||||
def nice_version_name() -> str:
|
||||
|
@ -1,61 +1,160 @@
|
||||
# Copyright 2021 Acryl Data, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This image has two variants: full and slim.
|
||||
# The full variant has additional deps preinstalled, like a JRE and Oracle client.
|
||||
ARG APP_ENV=full
|
||||
ARG PYTHON_VERSION=3.10
|
||||
|
||||
# Defining environment
|
||||
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
|
||||
ARG DOCKER_VERSION=head-full
|
||||
ARG APP_ENV=prod
|
||||
# INLINE-BEGIN @/docker/snippets/ingestion_base
|
||||
# This is the "base" image workflow.
|
||||
# While it has a bunch of intermediate stages, it "exports" a couple
|
||||
# stages for consumption.
|
||||
# - python-base: A basic stage, with basic deps, Python, and a venv.
|
||||
# - ingestion-base-slim: python-base plus LDAP, SASL, Kerberos, librdkafka, libaio, and ODBC system libraries.
|
||||
# - ingestion-base-full: Adds a JRE and Oracle client.
|
||||
|
||||
FROM $BASE_IMAGE:$DOCKER_VERSION AS base
|
||||
FROM ubuntu:24.04 AS python-base
|
||||
|
||||
# TODO: This may not work on Ubuntu 24.04 due to the new deb822 package format.
|
||||
ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports
|
||||
RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" /etc/apt/sources.list ; fi
|
||||
|
||||
ENV HOME=/home/datahub
|
||||
RUN existing_group=$(getent group 1000 | cut -d: -f1) && \
|
||||
if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \
|
||||
echo "Renaming existing group $existing_group to datahub"; \
|
||||
groupmod -n datahub "$existing_group"; \
|
||||
elif [ -z "$existing_group" ]; then \
|
||||
echo "Creating new group datahub with GID 1000"; \
|
||||
addgroup --gid 1000 datahub; \
|
||||
fi && \
|
||||
existing_user=$(id -nu 1000 2>/dev/null || echo "") && \
|
||||
if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \
|
||||
echo "Renaming existing user $existing_user to datahub"; \
|
||||
usermod -l datahub -d $HOME "$existing_user"; \
|
||||
usermod -g datahub datahub; \
|
||||
elif [ -z "$existing_user" ]; then \
|
||||
echo "Creating new user datahub with UID 1000"; \
|
||||
adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \
|
||||
fi && \
|
||||
# Create and set proper permissions for datahub directories
|
||||
mkdir -p $HOME && \
|
||||
chown -R datahub:datahub $HOME
|
||||
|
||||
|
||||
# Setup the PPA for alternative Python versions.
|
||||
# TODO: Eventually we should switch to using uv's support for python-build-standalone.
|
||||
RUN apt-get update && apt-get install -y \
|
||||
software-properties-common \
|
||||
lsb-release \
|
||||
gnupg \
|
||||
ca-certificates \
|
||||
&& add-apt-repository --no-update ppa:deadsnakes/ppa \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ARG PYTHON_VERSION
|
||||
RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
python${PYTHON_VERSION} \
|
||||
python${PYTHON_VERSION}-venv \
|
||||
python${PYTHON_VERSION}-dev \
|
||||
python${PYTHON_VERSION}-distutils \
|
||||
python-is-python3 \
|
||||
git \
|
||||
wget \
|
||||
curl \
|
||||
zip \
|
||||
unzip \
|
||||
nano \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set the default python version.
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --install /usr/bin/python python /usr/bin/python3 1
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
|
||||
|
||||
ARG PIP_MIRROR_URL=https://pypi.python.org/simple
|
||||
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url ${PIP_MIRROR_URL} ; fi
|
||||
ENV UV_INDEX_URL=${PIP_MIRROR_URL}
|
||||
|
||||
USER datahub
|
||||
WORKDIR $HOME
|
||||
RUN uv venv --python "$PYTHON_VERSION"
|
||||
ENV VIRTUAL_ENV=$HOME/.venv
|
||||
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
|
||||
|
||||
# We always want to use the system CA bundle.
|
||||
# Requests comes with its own CA bundle, which we need to override.
|
||||
ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
|
||||
# uv uses a different mechanism. See https://github.com/astral-sh/uv/issues/1474.
|
||||
ENV SSL_CERT_FILE="/etc/ssl/certs/ca-certificates.crt"
|
||||
|
||||
|
||||
FROM python-base AS ingestion-base-slim
|
||||
|
||||
USER 0
|
||||
RUN apt-get update && apt-get install -y \
|
||||
python3-ldap \
|
||||
libldap2-dev \
|
||||
libsasl2-dev \
|
||||
libsasl2-modules \
|
||||
libaio-dev \
|
||||
libaio1t64 \
|
||||
libsasl2-modules-gssapi-mit \
|
||||
krb5-user \
|
||||
krb5-config \
|
||||
libkrb5-dev \
|
||||
librdkafka-dev \
|
||||
ldap-utils \
|
||||
unixodbc \
|
||||
libodbc2 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
USER datahub
|
||||
|
||||
FROM ingestion-base-slim AS ingestion-base-full
|
||||
|
||||
USER 0
|
||||
# We need to install build-essential in order to build some Python packages (e.g. python-ldap)
|
||||
RUN apt-get update && apt-get install --no-install-recommends -y -qq \
|
||||
default-jre-headless \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
|
||||
/oracle_instantclient.sh
|
||||
|
||||
USER datahub
|
||||
# INLINE-END
|
||||
|
||||
FROM ingestion-base-${APP_ENV} AS final
|
||||
|
||||
USER root
|
||||
|
||||
COPY --from=powerman/dockerize:0.19 /usr/local/bin/dockerize /usr/local/bin
|
||||
COPY --chown=datahub:datahub ./docker/datahub-actions/start.sh /start_datahub_actions.sh
|
||||
COPY --chown=datahub:datahub ./docker/datahub-actions/readiness-check.sh /readiness-check.sh
|
||||
|
||||
RUN chmod a+x /start_datahub_actions.sh && \
|
||||
mkdir -p /etc/datahub/actions && \
|
||||
mkdir -p /tmp/datahub/logs/actions/system && \
|
||||
chown -R datahub:datahub /etc/datahub /tmp/datahub && \
|
||||
apt-get update && \
|
||||
apt-get install -y -qq default-jre && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/{apt,dpkg,cache,log}/
|
||||
chown -R datahub:datahub /etc/datahub /tmp/datahub
|
||||
|
||||
COPY --chown=datahub:datahub ./datahub-actions /actions-src
|
||||
COPY --chown=datahub:datahub ./datahub-actions /datahub-actions
|
||||
# Add other default configurations into this!
|
||||
COPY --chown=datahub:datahub ./docker/datahub-actions/config /etc/datahub/actions/system/conf
|
||||
|
||||
USER datahub
|
||||
WORKDIR /actions-src
|
||||
|
||||
FROM base AS slim-install
|
||||
ARG RELEASE_VERSION
|
||||
RUN test -n "$RELEASE_VERSION" # RELEASE_VERSION is a required build arg
|
||||
RUN --mount=type=bind,source=./python-build/version_updater.py,target=/version_updater.py \
|
||||
python /version_updater.py --directory /datahub-actions/ --version "$RELEASE_VERSION" --expected-update-count 1
|
||||
|
||||
# Effectively builds the image without the .cache (looks like it is duplicated but is not)
|
||||
RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
|
||||
UV_LINK_MODE=copy uv pip install -e ".[all]"
|
||||
|
||||
FROM base AS prod-install
|
||||
|
||||
RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
|
||||
UV_LINK_MODE=copy uv pip install -e ".[all]"
|
||||
|
||||
FROM ${APP_ENV}-install AS final
|
||||
WORKDIR /datahub-ingestion
|
||||
|
||||
# This is required to fix security vulnerability in htrace-core4
|
||||
RUN find . -name "htrace-core4-4.1.0-incubating.jar" -exec rm "{}" \;
|
||||
# For the datahub-actions build, we explicitly want to retain the uv cache.
|
||||
# This speeds up the process of creating venvs at runtime.
|
||||
# Because uv uses hardlinks for installing packages, keeping the cache around does not
|
||||
# really impact image size.
|
||||
RUN uv pip install -e "/datahub-actions/[all]"
|
||||
|
||||
ENTRYPOINT [ ]
|
||||
CMD dockerize -wait ${DATAHUB_GMS_PROTOCOL:-http}://$DATAHUB_GMS_HOST:$DATAHUB_GMS_PORT/health -timeout 240s /start_datahub_actions.sh
|
@ -1,10 +0,0 @@
|
||||
docker/datahub-actions/Dockerfile
|
||||
.git
|
||||
**scripts/
|
||||
**build/
|
||||
**venv/
|
||||
**tests/
|
||||
**smoke-test/
|
||||
**/*.xml
|
||||
.*/
|
||||
datahub-actions/.*/
|
@ -119,7 +119,7 @@ RUN apt-get update && apt-get install --no-install-recommends -y -qq \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN --mount=type=bind,source=./docker/datahub-ingestion/oracle_instantclient.sh,target=/oracle_instantclient.sh \
|
||||
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
|
||||
/oracle_instantclient.sh
|
||||
|
||||
USER datahub
|
||||
|
@ -120,7 +120,7 @@ RUN apt-get update && apt-get install --no-install-recommends -y -qq \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN --mount=type=bind,source=./docker/datahub-ingestion/oracle_instantclient.sh,target=/oracle_instantclient.sh \
|
||||
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
|
||||
/oracle_instantclient.sh
|
||||
|
||||
USER datahub
|
||||
|
@ -27,6 +27,7 @@ docker {
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_dir}/*"
|
||||
include "docker/snippets/*"
|
||||
include "metadata-ingestion/**"
|
||||
include "metadata-ingestion-modules/**"
|
||||
include "python-build/**"
|
||||
|
@ -1,34 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
PYSPARK_JARS="$(python -c 'import site; print(site.getsitepackages()[0])')/pyspark/jars"
|
||||
|
||||
function replace_jar {
|
||||
JAR_PREFIX=$1
|
||||
TRANSITIVE=$2
|
||||
DEPENDENCY=$3
|
||||
|
||||
echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
|
||||
ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
|
||||
rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
|
||||
rm -r "$HOME/.m2" || true
|
||||
|
||||
if [ ! -z "$DEPENDENCY" ]; then
|
||||
echo "Resolving $DEPENDENCY"
|
||||
mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null
|
||||
|
||||
echo "Moving jars to $PYSPARK_JARS"
|
||||
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
|
||||
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
|
||||
fi
|
||||
}
|
||||
|
||||
replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
|
||||
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
|
||||
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
|
||||
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
|
||||
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
|
||||
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
|
||||
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
|
||||
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"
|
@ -4,5 +4,5 @@ RUN apt-get update && apt-get install --no-install-recommends -y -qq \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN --mount=type=bind,source=./docker/datahub-ingestion/oracle_instantclient.sh,target=/oracle_instantclient.sh \
|
||||
RUN --mount=type=bind,source=./docker/snippets/oracle_instantclient.sh,target=/oracle_instantclient.sh \
|
||||
/oracle_instantclient.sh
|
||||
|
@ -87,5 +87,6 @@ if __name__ == "__main__":
|
||||
for file in [
|
||||
"docker/datahub-ingestion-base/Dockerfile",
|
||||
"docker/datahub-ingestion/Dockerfile",
|
||||
"docker/datahub-actions/Dockerfile",
|
||||
]:
|
||||
update_template(Path(_repo_root / file), check_only=check_only)
|
||||
|
Loading…
x
Reference in New Issue
Block a user