From 275535d4d38c143149de3a6ab87f8452de1dc8f3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 9 Apr 2025 11:53:34 -0700 Subject: [PATCH] feat: start removing ingestion-base image (#13146) --- .dockerignore | 4 + .github/workflows/docker-unified.yml | 24 +- docker/datahub-ingestion-base/Dockerfile | 164 +++++++------- docker/datahub-ingestion/Dockerfile | 212 ++++++++++++++---- docker/datahub-ingestion/Dockerfile-slim-only | 31 --- docker/datahub-ingestion/build.gradle | 7 +- docker/snippets/.gitignore | 2 + docker/snippets/ingestion_base.template | 46 ++++ docker/snippets/ingestion_full_deps | 21 ++ docker/snippets/ubuntu_mirror_setup | 3 + docker/snippets/ubuntu_python_base | 80 +++++++ python-build/generate_ingestion_docker.py | 87 +++++++ 12 files changed, 502 insertions(+), 179 deletions(-) delete mode 100644 docker/datahub-ingestion/Dockerfile-slim-only create mode 100644 docker/snippets/.gitignore create mode 100644 docker/snippets/ingestion_base.template create mode 100644 docker/snippets/ingestion_full_deps create mode 100644 docker/snippets/ubuntu_mirror_setup create mode 100644 docker/snippets/ubuntu_python_base create mode 100644 python-build/generate_ingestion_docker.py diff --git a/.dockerignore b/.dockerignore index 602b46750d..b251e9bea9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,6 +5,7 @@ **/.tox/ **/.mypy_cache/ **/.pytest_cache/ +**/.ruff_cache/ **/__pycache__/ out **/*.class @@ -16,3 +17,6 @@ out .git/COMMIT_* .git/index .gradle + +/metadata-ingestion/tests +/metadata-ingestion/examples diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 2b6dc7e675..5f2094ee6d 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -754,7 +754,7 @@ jobs: if: ${{ needs.setup.outputs.ingestion_base_change == 'true' }} uses: ./.github/actions/docker-custom-build-and-push with: - target: base + target: base-empty images: | ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} 
image_tag: ${{ needs.setup.outputs.tag }} @@ -798,7 +798,6 @@ jobs: if: ${{ needs.setup.outputs.ingestion_base_change == 'true' }} uses: ./.github/actions/docker-custom-build-and-push with: - target: slim-install images: | ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} image_tag: ${{ needs.setup.outputs.slim_tag }} @@ -841,7 +840,6 @@ jobs: if: ${{ needs.setup.outputs.ingestion_base_change == 'true' }} uses: ./.github/actions/docker-custom-build-and-push with: - target: full-install images: | ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} image_tag: ${{ needs.setup.outputs.full_tag }} @@ -864,7 +862,7 @@ jobs: outputs: tag: ${{ steps.tag.outputs.tag }} needs_artifact_download: ${{ needs.setup.outputs.ingestion_change == 'true' && ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true') }} - needs: [setup, smoke_test_lint,datahub_ingestion_base_slim_build] + needs: [setup, smoke_test_lint] if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Check out the repo @@ -881,11 +879,6 @@ jobs: - name: Build codegen if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish =='true' }} run: ./gradlew :metadata-ingestion:codegen - - name: Download Base Image - uses: ishworkh/container-image-artifact-download@v2.0.0 - if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} - with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} - name: Login to DockerHub uses: docker/login-action@v3 if: ${{ needs.setup.outputs.docker-login == 'true' && needs.setup.outputs.publish == 'false' && needs.setup.outputs.pr-publish == 'false' && needs.setup.outputs.ingestion_base_change == 'false' }} @@ -896,12 
+889,9 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} uses: ./.github/actions/docker-custom-build-and-push with: - target: final images: | ${{ env.DATAHUB_INGESTION_IMAGE }} build-args: | - BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - DOCKER_VERSION=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} APP_ENV=slim image_tag: ${{ needs.setup.outputs.slim_tag }} @@ -960,7 +950,7 @@ jobs: outputs: tag: ${{ steps.tag.outputs.tag }} needs_artifact_download: ${{ needs.setup.outputs.ingestion_change == 'true' && ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) }} - needs: [setup, smoke_test_lint,datahub_ingestion_base_full_build] + needs: [setup, smoke_test_lint] if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Check out the repo @@ -978,11 +968,6 @@ jobs: - name: Build codegen if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} run: ./gradlew :metadata-ingestion:codegen - - name: Download Base Image - uses: ishworkh/container-image-artifact-download@v2.0.0 - if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }} - with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} - name: Login to DockerHub uses: docker/login-action@v3 if: ${{ needs.setup.outputs.docker-login == 'true' && needs.setup.outputs.publish == 'false' && needs.setup.outputs.pr-publish == 'false' && needs.setup.outputs.ingestion_base_change == 
'false' }} @@ -993,12 +978,9 @@ jobs: if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} uses: ./.github/actions/docker-custom-build-and-push with: - target: final images: | ${{ env.DATAHUB_INGESTION_IMAGE }} build-args: | - BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - DOCKER_VERSION=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 53c8afabd4..ecb80e80ad 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -1,42 +1,63 @@ +# This image has two variants: full and slim. ARG APP_ENV=full -ARG BASE_IMAGE=base +ARG PYTHON_VERSION=3.10 -# Defining custom repo urls for use in enterprise environments. Re-used between stages below. -ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine -ARG GITHUB_REPO_URL=https://github.com -ARG DEBIAN_REPO_URL=https://deb.debian.org/debian +# INLINE-BEGIN @/docker/snippets/ingestion_base +# This is the "base" image workflow. +# While it has a bunch of intermediate stages, it "exports" a couple +# stages for consumption. +# - base-empty: A basic stage, with basic deps, Python, and a venv. +# - base-slim: Currently the same as base-empty. +# - base-full: Adds a JRE and Oracle client. + +FROM ubuntu:24.04 AS base-empty + +ARG PYTHON_VERSION +RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set + +# TODO: This may not work on Ubuntu 24.04 due to the new deb822 package format. 
ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports -ARG PIP_MIRROR_URL=https://pypi.python.org/simple - -FROM ubuntu:24.04 AS base - -ARG GITHUB_REPO_URL - -ENV DEBIAN_FRONTEND=noninteractive - -# Optionally set corporate mirror for deb -ARG DEBIAN_REPO_URL -ARG UBUNTU_REPO_URL -RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list ; fi RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" /etc/apt/sources.list ; fi -# Install software-properties-common to add PPAs + +ENV HOME=/home/datahub +RUN existing_group=$(getent group 1000 | cut -d: -f1) && \ + if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \ + echo "Renaming existing group $existing_group to datahub"; \ + groupmod -n datahub "$existing_group"; \ + elif [ -z "$existing_group" ]; then \ + echo "Creating new group datahub with GID 1000"; \ + addgroup --gid 1000 datahub; \ + fi && \ + existing_user=$(id -nu 1000 2>/dev/null || echo "") && \ + if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \ + echo "Renaming existing user $existing_user to datahub"; \ + usermod -l datahub -d $HOME "$existing_user"; \ + usermod -g datahub datahub; \ + elif [ -z "$existing_user" ]; then \ + echo "Creating new user datahub with UID 1000"; \ + adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \ + fi && \ + # Create and set proper permissions for datahub directories + mkdir -p $HOME && \ + chown -R datahub:datahub $HOME + + +# Setup the PPA for alternative Python versions. +# TODO: Eventually we should switch to using uv's support for python-build-standalone. 
RUN apt-get update && apt-get install -y \ software-properties-common \ lsb-release \ gnupg \ ca-certificates \ + && add-apt-repository --no-update ppa:deadsnakes/ppa \ && rm -rf /var/lib/apt/lists/* -# Add deadsnakes PPA for Python 3.10 -RUN add-apt-repository ppa:deadsnakes/ppa - -RUN apt-get update && apt-get upgrade -y \ - && apt-get install -y -qq \ - python3.10 \ - python3.10-venv \ - python3.10-dev \ - python3.10-distutils \ +RUN apt-get update && apt-get install -y \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dev \ + python${PYTHON_VERSION}-distutils \ python3-pip \ python3-ldap \ python-is-python3 \ @@ -61,64 +82,49 @@ RUN apt-get update && apt-get upgrade -y \ && apt-get clean \ && rm -rf /var/lib/{apt,dpkg,cache,log}/ -# Set Python 3.10 as the default python3 -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \ +# Set the default python version. +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python python /usr/bin/python3 1 -# Optionally set corporate mirror for pip -ARG PIP_MIRROR_URL -RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +ARG PIP_MIRROR_URL=https://pypi.python.org/simple +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url ${PIP_MIRROR_URL} ; fi ENV UV_INDEX_URL=${PIP_MIRROR_URL} -COPY --from=powerman/dockerize:0.19 /usr/local/bin/dockerize /usr/local/bin - -COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh - -RUN existing_group=$(getent group 1000 | cut -d: -f1) && \ - if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \ - echo "Renaming existing group $existing_group to datahub"; \ - groupmod -n datahub "$existing_group"; \ - elif [ -z 
"$existing_group" ]; then \ - echo "Creating new group datahub with GID 1000"; \ - addgroup --gid 1000 datahub; \ - fi && \ - existing_user=$(id -nu 1000 2>/dev/null || echo "") && \ - if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \ - echo "Renaming existing user $existing_user to datahub"; \ - usermod -l datahub -d /datahub-ingestion "$existing_user"; \ - usermod -g datahub datahub; \ - elif [ -z "$existing_user" ]; then \ - echo "Creating new user datahub with UID 1000"; \ - adduser --disabled-password --uid 1000 --gid 1000 --home /datahub-ingestion datahub; \ - fi && \ - # Create and set proper permissions for datahub directories - mkdir -p /datahub-ingestion && \ - chown -R datahub:datahub /datahub-ingestion && \ - chmod +x /entrypoint.sh - USER datahub -ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt -ENV VIRTUAL_ENV=/datahub-ingestion/.venv +WORKDIR $HOME +RUN uv venv --python "$PYTHON_VERSION" +ENV VIRTUAL_ENV=$HOME/.venv ENV PATH="${VIRTUAL_ENV}/bin:$PATH" -RUN python3 -m venv $VIRTUAL_ENV && \ - pip install --no-cache --upgrade pip 'uv>=0.1.10' && \ - uv pip install --no-cache --upgrade setuptools wheel python-ldap +# Requests comes with it's own CA bundle, but we want to use always use the system CA bundle. +ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt -# Note: Normally uv will create hardlinks from the cache directory to the venv. -# In our docker files, we normally use `RUN --mount=type=cache,... uv pip install ...`, -# which means the cache directory is on a separate filesystem. uv will emit a warning: -# Failed to hardlink files; falling back to full copy. This may lead to degraded performance. -# If the cache and target directories are on different filesystems, hardlinking may not be supported. -# If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning. 
-ENTRYPOINT [ "/entrypoint.sh" ] - -FROM ${BASE_IMAGE} AS full-install +FROM base-empty AS full-deps-prebuild USER 0 -RUN apt-get update && apt-get install -y -qq \ +RUN apt-get update && apt-get install --no-install-recommends -y -qq \ + build-essential \ + maven \ + && rm -rf /var/lib/apt/lists/* +USER datahub + +RUN uv pip install python-ldap==3.4.4 +RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \ + --mount=type=bind,source=./docker/datahub-ingestion/pyspark_jars.sh,target=/pyspark_jars.sh \ + uv pip install python-ldap==3.4.4 pyspark~=3.5.0 && \ + /pyspark_jars.sh + +FROM base-empty AS base-slim +# Nothing to do here. + +FROM base-slim AS base-full + +USER 0 +RUN apt-get update && apt-get install --no-install-recommends -y -qq \ default-jre-headless \ - && rm -rf /var/lib/apt/lists/* /var/cache/apk/* + && rm -rf /var/lib/apt/lists/* RUN if [ $(arch) = "x86_64" ]; then \ mkdir /opt/oracle && \ @@ -140,9 +146,13 @@ RUN if [ $(arch) = "x86_64" ]; then \ USER datahub -FROM ${BASE_IMAGE} AS slim-install -# Do nothing else on top of base +RUN --mount=from=full-deps-prebuild,source=$HOME/.venv,target=/venv-full \ + rm -r .venv && \ + cp -r /venv-full .venv +# INLINE-END -FROM ${APP_ENV}-install +FROM base-${APP_ENV} AS final +COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh +ENTRYPOINT [ "/entrypoint.sh" ] ENV PATH="/datahub-ingestion/.local/bin:$PATH" diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index a9fd3a6662..c18317db2e 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -1,64 +1,184 @@ -# Defining environment +# This image has two variants: full and slim. +# The full variant has a larger set of ingestion sources, but is a larger image. 
ARG APP_ENV=full -ARG BASE_IMAGE=acryldata/datahub-ingestion-base -ARG DOCKER_VERSION=head-full -ARG DEBIAN_REPO_URL=https://deb.debian.org/debian +ARG PYTHON_VERSION=3.10 + +# INLINE-BEGIN @/docker/snippets/ingestion_base +# This is the "base" image workflow. +# While it has a bunch of intermediate stages, it "exports" a couple +# stages for consumption. +# - base-empty: A basic stage, with basic deps, Python, and a venv. +# - base-slim: Currently the same as base-empty. +# - base-full: Adds a JRE and Oracle client. + +FROM ubuntu:24.04 AS base-empty + +ARG PYTHON_VERSION +RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set + +# TODO: This may not work on Ubuntu 24.04 due to the new deb822 package format. ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports -ARG PIP_MIRROR_URL=https://pypi.python.org/simple - -FROM $BASE_IMAGE:$DOCKER_VERSION AS base - -# Optionally set corporate mirror for deb -USER 0 -ARG DEBIAN_REPO_URL -ARG UBUNTU_REPO_URL -RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list ; fi RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" /etc/apt/sources.list ; fi -USER datahub -# Optionally set corporate mirror for pip -ARG PIP_MIRROR_URL -RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi + +ENV HOME=/home/datahub +RUN existing_group=$(getent group 1000 | cut -d: -f1) && \ + if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \ + echo "Renaming existing group $existing_group to datahub"; \ + groupmod -n datahub "$existing_group"; \ + elif [ -z "$existing_group" ]; then \ + echo "Creating new group datahub with GID 1000"; \ + addgroup --gid 1000 datahub; \ + fi && \ + existing_user=$(id -nu 1000 2>/dev/null || echo "") && \ + if [ -n 
"$existing_user" ] && [ "$existing_user" != "datahub" ]; then \ + echo "Renaming existing user $existing_user to datahub"; \ + usermod -l datahub -d $HOME "$existing_user"; \ + usermod -g datahub datahub; \ + elif [ -z "$existing_user" ]; then \ + echo "Creating new user datahub with UID 1000"; \ + adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \ + fi && \ + # Create and set proper permissions for datahub directories + mkdir -p $HOME && \ + chown -R datahub:datahub $HOME + + +# Setup the PPA for alternative Python versions. +# TODO: Eventually we should switch to using uv's support for python-build-standalone. +RUN apt-get update && apt-get install -y \ + software-properties-common \ + lsb-release \ + gnupg \ + ca-certificates \ + && add-apt-repository --no-update ppa:deadsnakes/ppa \ + && rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dev \ + python${PYTHON_VERSION}-distutils \ + python3-pip \ + python3-ldap \ + python-is-python3 \ + libldap2-dev \ + libsasl2-dev \ + libsasl2-modules \ + libaio-dev \ + libaio1t64 \ + libsasl2-modules-gssapi-mit \ + krb5-user \ + krb5-config \ + libkrb5-dev \ + librdkafka-dev \ + git \ + wget \ + curl \ + zip \ + unzip \ + ldap-utils \ + unixodbc \ + libodbc2 \ + && apt-get clean \ + && rm -rf /var/lib/{apt,dpkg,cache,log}/ + +# Set the default python version. 
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3 1 + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +ARG PIP_MIRROR_URL=https://pypi.python.org/simple +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url ${PIP_MIRROR_URL} ; fi ENV UV_INDEX_URL=${PIP_MIRROR_URL} -COPY --chown=datahub ./metadata-ingestion /metadata-ingestion -COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /metadata-ingestion/airflow-plugin +USER datahub +WORKDIR $HOME +RUN uv venv --python "$PYTHON_VERSION" +ENV VIRTUAL_ENV=$HOME/.venv +ENV PATH="${VIRTUAL_ENV}/bin:$PATH" +# Requests comes with its own CA bundle, but we want to always use the system CA bundle. +ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt -ARG RELEASE_VERSION -WORKDIR /metadata-ingestion -RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/_version.py && \ - sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/_version.py && \ - cat src/datahub/_version.py | grep __version__ && \ - cat airflow-plugin/src/datahub_airflow_plugin/_version.py | grep __version__ -FROM base AS slim-install - -RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \ - UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" - -FROM base AS full-install-build +FROM base-empty AS full-deps-prebuild USER 0 -RUN apt-get update && apt-get install -y -qq maven +RUN apt-get update && apt-get install --no-install-recommends -y -qq \ + build-essential \ + maven \ + && rm -rf /var/lib/apt/lists/* +USER datahub + +RUN uv pip install
python-ldap==3.4.4 +RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \ + --mount=type=bind,source=./docker/datahub-ingestion/pyspark_jars.sh,target=/pyspark_jars.sh \ + uv pip install python-ldap==3.4.4 pyspark~=3.5.0 && \ + /pyspark_jars.sh + +FROM base-empty AS base-slim +# Nothing to do here. + +FROM base-slim AS base-full + +USER 0 +RUN apt-get update && apt-get install --no-install-recommends -y -qq \ + default-jre-headless \ + && rm -rf /var/lib/apt/lists/* + +RUN if [ $(arch) = "x86_64" ]; then \ + mkdir /opt/oracle && \ + cd /opt/oracle && \ + wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/2115000/instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + unzip instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + rm instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + sh -c "echo /opt/oracle/instantclient_21_15 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ + ldconfig; \ + else \ + mkdir /opt/oracle && \ + cd /opt/oracle && \ + wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/1923000/instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + unzip instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + rm instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + sh -c "echo /opt/oracle/instantclient_19_23 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ + ldconfig; \ + fi; USER datahub -COPY ./docker/datahub-ingestion/pyspark_jars.sh . 
-RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \ - UV_LINK_MODE=copy uv pip install -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \ - ./pyspark_jars.sh && \ +RUN --mount=from=full-deps-prebuild,source=$HOME/.venv,target=/venv-full \ + rm -r .venv && \ + cp -r /venv-full .venv +# INLINE-END + +FROM base-${APP_ENV} AS add-code + +COPY --chown=datahub ./metadata-ingestion /metadata-ingestion +COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /airflow-plugin + +ARG RELEASE_VERSION +RUN test -n "$RELEASE_VERSION" # RELEASE_VERSION is a required build arg +RUN test -d /metadata-ingestion/src/datahub/metadata # codegen must be run prior to building the image +RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$RELEASE_VERSION\"/" /metadata-ingestion/src/datahub/_version.py && \ + sed -i.bak "s/__version__ = .*$/__version__ = \"$RELEASE_VERSION\"/" /airflow-plugin/src/datahub_airflow_plugin/_version.py && \ + cat /metadata-ingestion/src/datahub/_version.py | grep __version__ && \ + cat /airflow-plugin/src/datahub_airflow_plugin/_version.py | grep __version__ + +FROM add-code AS install-slim + +RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \ + UV_LINK_MODE=copy uv pip install -e "/metadata-ingestion/[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \ datahub --version -FROM base AS full-install +FROM add-code AS install-full -COPY --from=full-install-build ${VIRTUAL_ENV} ${VIRTUAL_ENV} +RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \ + UV_LINK_MODE=copy uv pip install \ + -e "/metadata-ingestion/[base,all]" \ + -e "/airflow-plugin/[plugin-v2]" && \ + datahub --version -FROM base AS dev-install -# Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
-# See this excellent thread https://github.com/docker/cli/issues/1134 +FROM install-${APP_ENV} AS final -FROM ${APP_ENV}-install AS final - -WORKDIR /datahub-ingestion - -USER datahub +ENTRYPOINT [ "datahub" ] diff --git a/docker/datahub-ingestion/Dockerfile-slim-only b/docker/datahub-ingestion/Dockerfile-slim-only deleted file mode 100644 index 80abff204d..0000000000 --- a/docker/datahub-ingestion/Dockerfile-slim-only +++ /dev/null @@ -1,31 +0,0 @@ -# Defining environment -ARG BASE_IMAGE=acryldata/datahub-ingestion-base -ARG DOCKER_VERSION=head-slim -ARG PIP_MIRROR_URL=https://pypi.python.org/simple - -FROM $BASE_IMAGE:$DOCKER_VERSION as base -USER datahub - -# Optionally set corporate mirror for apk and pip -ARG PIP_MIRROR_URL -RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi -ENV UV_INDEX_URL=${PIP_MIRROR_URL} - -COPY --chown=datahub ./metadata-ingestion /metadata-ingestion - -ARG RELEASE_VERSION -WORKDIR /metadata-ingestion -RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/_version.py && \ - cat src/datahub/_version.py - -FROM base as slim-install - -RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \ - UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \ - datahub --version - -FROM slim-install as final - -WORKDIR /datahub-ingestion - -USER datahub diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 4bb7b739f7..22ec6a0bfe 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -16,14 +16,13 @@ ext { } dependencies { - project(':docker:datahub-ingestion-base') project(':metadata-ingestion') } docker { - dependsOn 'build', 
':docker:datahub-ingestion-base:docker', ':metadata-ingestion:codegen' + dependsOn 'build', ':metadata-ingestion:codegen' name "${docker_registry}/${docker_repo}:${docker_version}" - dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}") + dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include '.dockerignore' include "docker/${docker_dir}/*" @@ -34,7 +33,7 @@ docker { } version "v${docker_version}" - def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", ''), BASE_IMAGE: "${docker_registry}/datahub-ingestion-base"] + def dockerBuildArgs = [RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')] // Add build args if they are defined (needed for some CI or enterprise environments) if (project.hasProperty('pipMirrorUrl')) { diff --git a/docker/snippets/.gitignore b/docker/snippets/.gitignore new file mode 100644 index 0000000000..2cbb917ab6 --- /dev/null +++ b/docker/snippets/.gitignore @@ -0,0 +1,2 @@ +# rendered from the template file +/ingestion_base diff --git a/docker/snippets/ingestion_base.template b/docker/snippets/ingestion_base.template new file mode 100644 index 0000000000..8955e7676c --- /dev/null +++ b/docker/snippets/ingestion_base.template @@ -0,0 +1,46 @@ +# This is the "base" image workflow. +# While it has a bunch of intermediate stages, it "exports" a couple +# stages for consumption. +# - base-empty: A basic stage, with basic deps, Python, and a venv. +# - base-slim: Currently the same as base-empty. +# - base-full: Adds a JRE and Oracle client. 
+ +FROM ubuntu:24.04 AS base-empty + +ARG PYTHON_VERSION +RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set + +# INLINE-BEGIN @/docker/snippets/ubuntu_mirror_setup +# INLINE-END + +# INLINE-BEGIN @/docker/snippets/ubuntu_python_base +# INLINE-END + +FROM base-empty AS full-deps-prebuild + +USER 0 +RUN apt-get update && apt-get install --no-install-recommends -y -qq \ + build-essential \ + maven \ + && rm -rf /var/lib/apt/lists/* +USER datahub + +RUN uv pip install python-ldap==3.4.4 +RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \ + --mount=type=bind,source=./docker/datahub-ingestion/pyspark_jars.sh,target=/pyspark_jars.sh \ + uv pip install python-ldap==3.4.4 pyspark~=3.5.0 && \ + /pyspark_jars.sh + +FROM base-empty AS base-slim +# Nothing to do here. + +FROM base-slim AS base-full + +USER 0 +# INLINE-BEGIN @/docker/snippets/ingestion_full_deps +# INLINE-END +USER datahub + +RUN --mount=from=full-deps-prebuild,source=$HOME/.venv,target=/venv-full \ + rm -r .venv && \ + cp -r /venv-full .venv diff --git a/docker/snippets/ingestion_full_deps b/docker/snippets/ingestion_full_deps new file mode 100644 index 0000000000..92f6b8cf73 --- /dev/null +++ b/docker/snippets/ingestion_full_deps @@ -0,0 +1,21 @@ +RUN apt-get update && apt-get install --no-install-recommends -y -qq \ + default-jre-headless \ + && rm -rf /var/lib/apt/lists/* + +RUN if [ $(arch) = "x86_64" ]; then \ + mkdir /opt/oracle && \ + cd /opt/oracle && \ + wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/2115000/instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + unzip instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + rm instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + sh -c "echo /opt/oracle/instantclient_21_15 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ + ldconfig; \ + else \ + mkdir /opt/oracle && \ + cd /opt/oracle && \ + wget --no-verbose -c 
https://download.oracle.com/otn_software/linux/instantclient/1923000/instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + unzip instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + rm instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + sh -c "echo /opt/oracle/instantclient_19_23 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ + ldconfig; \ + fi; diff --git a/docker/snippets/ubuntu_mirror_setup b/docker/snippets/ubuntu_mirror_setup new file mode 100644 index 0000000000..7e1fb4fb1a --- /dev/null +++ b/docker/snippets/ubuntu_mirror_setup @@ -0,0 +1,3 @@ +# TODO: This may not work on Ubuntu 24.04 due to the new deb822 package format. +ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports +RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" /etc/apt/sources.list ; fi diff --git a/docker/snippets/ubuntu_python_base b/docker/snippets/ubuntu_python_base new file mode 100644 index 0000000000..e28ee692fc --- /dev/null +++ b/docker/snippets/ubuntu_python_base @@ -0,0 +1,80 @@ + +ENV HOME=/home/datahub +RUN existing_group=$(getent group 1000 | cut -d: -f1) && \ + if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \ + echo "Renaming existing group $existing_group to datahub"; \ + groupmod -n datahub "$existing_group"; \ + elif [ -z "$existing_group" ]; then \ + echo "Creating new group datahub with GID 1000"; \ + addgroup --gid 1000 datahub; \ + fi && \ + existing_user=$(id -nu 1000 2>/dev/null || echo "") && \ + if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \ + echo "Renaming existing user $existing_user to datahub"; \ + usermod -l datahub -d $HOME "$existing_user"; \ + usermod -g datahub datahub; \ + elif [ -z "$existing_user" ]; then \ + echo "Creating new user datahub with UID 1000"; \ + adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \ + fi && \ + # Create and set proper 
permissions for datahub directories + mkdir -p $HOME && \ + chown -R datahub:datahub $HOME + + +# Setup the PPA for alternative Python versions. +# TODO: Eventually we should switch to using uv's support for python-build-standalone. +RUN apt-get update && apt-get install -y \ + software-properties-common \ + lsb-release \ + gnupg \ + ca-certificates \ + && add-apt-repository --no-update ppa:deadsnakes/ppa \ + && rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-dev \ + python${PYTHON_VERSION}-distutils \ + python3-pip \ + python3-ldap \ + python-is-python3 \ + libldap2-dev \ + libsasl2-dev \ + libsasl2-modules \ + libaio-dev \ + libaio1t64 \ + libsasl2-modules-gssapi-mit \ + krb5-user \ + krb5-config \ + libkrb5-dev \ + librdkafka-dev \ + git \ + wget \ + curl \ + zip \ + unzip \ + ldap-utils \ + unixodbc \ + libodbc2 \ + && apt-get clean \ + && rm -rf /var/lib/{apt,dpkg,cache,log}/ + +# Set the default python version. +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --install /usr/bin/python python /usr/bin/python3 1 + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +ARG PIP_MIRROR_URL=https://pypi.python.org/simple +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url ${PIP_MIRROR_URL} ; fi +ENV UV_INDEX_URL=${PIP_MIRROR_URL} + +USER datahub +WORKDIR $HOME +RUN uv venv --python "$PYTHON_VERSION" +ENV VIRTUAL_ENV=$HOME/.venv +ENV PATH="${VIRTUAL_ENV}/bin:$PATH" +# Requests comes with its own CA bundle, but we want to always use the system CA bundle.
+ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt diff --git a/python-build/generate_ingestion_docker.py b/python-build/generate_ingestion_docker.py new file mode 100644 index 0000000000..8dd3b90880 --- /dev/null +++ b/python-build/generate_ingestion_docker.py @@ -0,0 +1,87 @@ +import re +import sys +from pathlib import Path +from typing import Optional + +_repo_root = Path(__file__).parent.parent +assert (_repo_root / ".git").exists(), "Unable to find git repo root" + + +def _load_file(path: str, context_dir: Path) -> str: + if path.startswith("@/"): + resolved_path = Path(_repo_root / path[2:]) + return resolved_path.read_text() + else: + raise ValueError( + f"Only repo-rooted paths, which have the '@/' prefix, are supported: got {path}" + ) + + +def update_template( + template_file: Path, + outfile: Optional[Path] = None, + check_only: bool = False, +) -> None: + """ + Update a template file in-place, injecting content from files referenced in inline directives. + + Args: + template_file: Path to the template file that will be modified + """ + + render_mode = bool(outfile) + + subs = 0 + content = template_file.read_text() + + def handle_multiline(match: re.Match) -> str: + nonlocal subs + subs += 1 + + path = match.group(2) + replacement = _load_file(path, template_file.parent).strip() + replacement = replacement.strip() + "\n" + + if render_mode: + return f"{replacement}" + else: + return f"{match.group(1)}{replacement}{match.group(3)}" + + # Handle multiline inline directives + content = re.sub( + r"^([ \t]*# INLINE-BEGIN (.*?)\n).*?^([ \t]*# INLINE-END)$", + handle_multiline, + content, + flags=re.DOTALL | re.MULTILINE, + ) + + # if subs == 0: + # raise ValueError(f"No templates found in {template_file}") + + output = outfile or template_file + if check_only: + if output.read_text() != content: + print(f"ERROR: {template_file} is out of date") + sys.exit(1) + else: + print(f"Applied {subs} substitutions while processing {template_file}") + 
output.write_text(content) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "--check": + check_only = True + else: + check_only = False + + update_template( + Path(_repo_root / "docker/snippets/ingestion_base.template"), + outfile=Path(_repo_root / "docker/snippets/ingestion_base"), + check_only=check_only, + ) + + for file in [ + "docker/datahub-ingestion-base/Dockerfile", + "docker/datahub-ingestion/Dockerfile", + ]: + update_template(Path(_repo_root / file), check_only=check_only)