mirror of https://github.com/datahub-project/datahub.git
synced 2025-06-27 05:03:31 +00:00

feat: start removing ingestion-base image (#13146)

commit 275535d4d3 (parent bbbeab8467)
.dockerignore
@@ -5,6 +5,7 @@
 **/.tox/
 **/.mypy_cache/
 **/.pytest_cache/
+**/.ruff_cache/
 **/__pycache__/
 out
 **/*.class
@@ -16,3 +17,6 @@ out
 .git/COMMIT_*
 .git/index
 .gradle
+
+/metadata-ingestion/tests
+/metadata-ingestion/examples
.github/workflows/docker-unified.yml (vendored, 24 changed lines)
@@ -754,7 +754,7 @@ jobs:
         if: ${{ needs.setup.outputs.ingestion_base_change == 'true' }}
         uses: ./.github/actions/docker-custom-build-and-push
         with:
-          target: base
+          target: base-empty
           images: |
             ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
           image_tag: ${{ needs.setup.outputs.tag }}
@@ -798,7 +798,6 @@ jobs:
         if: ${{ needs.setup.outputs.ingestion_base_change == 'true' }}
         uses: ./.github/actions/docker-custom-build-and-push
         with:
-          target: slim-install
           images: |
             ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
           image_tag: ${{ needs.setup.outputs.slim_tag }}
@@ -841,7 +840,6 @@ jobs:
         if: ${{ needs.setup.outputs.ingestion_base_change == 'true' }}
         uses: ./.github/actions/docker-custom-build-and-push
         with:
-          target: full-install
           images: |
             ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
           image_tag: ${{ needs.setup.outputs.full_tag }}
@@ -864,7 +862,7 @@ jobs:
     outputs:
       tag: ${{ steps.tag.outputs.tag }}
       needs_artifact_download: ${{ needs.setup.outputs.ingestion_change == 'true' && ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true') }}
-    needs: [setup, smoke_test_lint,datahub_ingestion_base_slim_build]
+    needs: [setup, smoke_test_lint]
     if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
     steps:
       - name: Check out the repo
@@ -881,11 +879,6 @@ jobs:
       - name: Build codegen
         if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish =='true' }}
         run: ./gradlew :metadata-ingestion:codegen
-      - name: Download Base Image
-        uses: ishworkh/container-image-artifact-download@v2.0.0
-        if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }}
-        with:
-          image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}
       - name: Login to DockerHub
         uses: docker/login-action@v3
         if: ${{ needs.setup.outputs.docker-login == 'true' && needs.setup.outputs.publish == 'false' && needs.setup.outputs.pr-publish == 'false' && needs.setup.outputs.ingestion_base_change == 'false' }}
@@ -896,12 +889,9 @@ jobs:
         if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
         uses: ./.github/actions/docker-custom-build-and-push
         with:
-          target: final
           images: |
             ${{ env.DATAHUB_INGESTION_IMAGE }}
           build-args: |
-            BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
-            DOCKER_VERSION=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}
             RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }}
             APP_ENV=slim
           image_tag: ${{ needs.setup.outputs.slim_tag }}
@@ -960,7 +950,7 @@ jobs:
     outputs:
       tag: ${{ steps.tag.outputs.tag }}
       needs_artifact_download: ${{ needs.setup.outputs.ingestion_change == 'true' && ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) }}
-    needs: [setup, smoke_test_lint,datahub_ingestion_base_full_build]
+    needs: [setup, smoke_test_lint]
     if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
     steps:
       - name: Check out the repo
@@ -978,11 +968,6 @@ jobs:
       - name: Build codegen
         if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
         run: ./gradlew :metadata-ingestion:codegen
-      - name: Download Base Image
-        uses: ishworkh/container-image-artifact-download@v2.0.0
-        if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' && needs.setup.outputs.ingestion_base_change == 'true' }}
-        with:
-          image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}
       - name: Login to DockerHub
         uses: docker/login-action@v3
         if: ${{ needs.setup.outputs.docker-login == 'true' && needs.setup.outputs.publish == 'false' && needs.setup.outputs.pr-publish == 'false' && needs.setup.outputs.ingestion_base_change == 'false' }}
@@ -993,12 +978,9 @@ jobs:
         if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }}
         uses: ./.github/actions/docker-custom-build-and-push
         with:
-          target: final
           images: |
             ${{ env.DATAHUB_INGESTION_IMAGE }}
           build-args: |
-            BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}
-            DOCKER_VERSION=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}
             RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }}
           image_tag: ${{ needs.setup.outputs.tag }}
           username: ${{ secrets.ACRYL_DOCKER_USERNAME }}
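A note on the DOCKER_VERSION expressions deleted above: GitHub Actions has no ternary operator, so the workflow used the idiom "cond && a || b", which resolves to a when cond holds (and a is truthy) and to b otherwise. A minimal Python sketch of that tag selection (function and argument names are illustrative, not part of the workflow):

    # Mirrors the deleted expression:
    #   ingestion_base_change == 'true' && unique_slim_tag || 'head-slim'
    # i.e. use the per-run tag only when the base image changed in this run.
    def base_image_tag(ingestion_base_change: str, unique_slim_tag: str) -> str:
        return unique_slim_tag if ingestion_base_change == "true" else "head-slim"

    assert base_image_tag("true", "pr1234-slim") == "pr1234-slim"
    assert base_image_tag("false", "pr1234-slim") == "head-slim"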
docker/datahub-ingestion-base/Dockerfile
@@ -1,42 +1,63 @@
+# This image has two variants: full and slim.
 ARG APP_ENV=full
-ARG BASE_IMAGE=base
+ARG PYTHON_VERSION=3.10
 
-# Defining custom repo urls for use in enterprise environments. Re-used between stages below.
-ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine
-ARG GITHUB_REPO_URL=https://github.com
-ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
+# INLINE-BEGIN @/docker/snippets/ingestion_base
+# This is the "base" image workflow.
+# While it has a bunch of intermediate stages, it "exports" a couple
+# stages for consumption.
+# - base-empty: A basic stage, with basic deps, Python, and a venv.
+# - base-slim: Currently the same as base-empty.
+# - base-full: Adds a JRE and Oracle client.
+
+FROM ubuntu:24.04 AS base-empty
+
+ARG PYTHON_VERSION
+RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set
+
+# TODO: This may not work on Ubuntu 24.04 due to the new deb822 package format.
 ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports
-ARG PIP_MIRROR_URL=https://pypi.python.org/simple
-
-FROM ubuntu:24.04 AS base
-
-ARG GITHUB_REPO_URL
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-# Optionally set corporate mirror for deb
-ARG DEBIAN_REPO_URL
-ARG UBUNTU_REPO_URL
-RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list ; fi
 RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" /etc/apt/sources.list ; fi
 
-# Install software-properties-common to add PPAs
+ENV HOME=/home/datahub
+RUN existing_group=$(getent group 1000 | cut -d: -f1) && \
+    if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \
+        echo "Renaming existing group $existing_group to datahub"; \
+        groupmod -n datahub "$existing_group"; \
+    elif [ -z "$existing_group" ]; then \
+        echo "Creating new group datahub with GID 1000"; \
+        addgroup --gid 1000 datahub; \
+    fi && \
+    existing_user=$(id -nu 1000 2>/dev/null || echo "") && \
+    if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \
+        echo "Renaming existing user $existing_user to datahub"; \
+        usermod -l datahub -d $HOME "$existing_user"; \
+        usermod -g datahub datahub; \
+    elif [ -z "$existing_user" ]; then \
+        echo "Creating new user datahub with UID 1000"; \
+        adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \
+    fi && \
+    # Create and set proper permissions for datahub directories
+    mkdir -p $HOME && \
+    chown -R datahub:datahub $HOME
+
+# Setup the PPA for alternative Python versions.
+# TODO: Eventually we should switch to using uv's support for python-build-standalone.
 RUN apt-get update && apt-get install -y \
     software-properties-common \
     lsb-release \
     gnupg \
     ca-certificates \
+    && add-apt-repository --no-update ppa:deadsnakes/ppa \
    && rm -rf /var/lib/apt/lists/*
 
-# Add deadsnakes PPA for Python 3.10
-RUN add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt-get update && apt-get upgrade -y \
-    && apt-get install -y -qq \
-    python3.10 \
-    python3.10-venv \
-    python3.10-dev \
-    python3.10-distutils \
+RUN apt-get update && apt-get install -y \
+    python${PYTHON_VERSION} \
+    python${PYTHON_VERSION}-venv \
+    python${PYTHON_VERSION}-dev \
+    python${PYTHON_VERSION}-distutils \
     python3-pip \
     python3-ldap \
     python-is-python3 \
@@ -61,64 +82,49 @@ RUN apt-get update && apt-get upgrade -y \
     && apt-get clean \
     && rm -rf /var/lib/{apt,dpkg,cache,log}/
 
-# Set Python 3.10 as the default python3
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
+# Set the default python version.
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
     && update-alternatives --install /usr/bin/python python /usr/bin/python3 1
 
-# Optionally set corporate mirror for pip
-ARG PIP_MIRROR_URL
-RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ARG PIP_MIRROR_URL=https://pypi.python.org/simple
+RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url ${PIP_MIRROR_URL} ; fi
 ENV UV_INDEX_URL=${PIP_MIRROR_URL}
 
-COPY --from=powerman/dockerize:0.19 /usr/local/bin/dockerize /usr/local/bin
-
-COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh
-
-RUN existing_group=$(getent group 1000 | cut -d: -f1) && \
-    if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \
-        echo "Renaming existing group $existing_group to datahub"; \
-        groupmod -n datahub "$existing_group"; \
-    elif [ -z "$existing_group" ]; then \
-        echo "Creating new group datahub with GID 1000"; \
-        addgroup --gid 1000 datahub; \
-    fi && \
-    existing_user=$(id -nu 1000 2>/dev/null || echo "") && \
-    if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \
-        echo "Renaming existing user $existing_user to datahub"; \
-        usermod -l datahub -d /datahub-ingestion "$existing_user"; \
-        usermod -g datahub datahub; \
-    elif [ -z "$existing_user" ]; then \
-        echo "Creating new user datahub with UID 1000"; \
-        adduser --disabled-password --uid 1000 --gid 1000 --home /datahub-ingestion datahub; \
-    fi && \
-    # Create and set proper permissions for datahub directories
-    mkdir -p /datahub-ingestion && \
-    chown -R datahub:datahub /datahub-ingestion && \
-    chmod +x /entrypoint.sh
-
 USER datahub
-ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
-ENV VIRTUAL_ENV=/datahub-ingestion/.venv
+WORKDIR $HOME
+RUN uv venv --python "$PYTHON_VERSION"
+ENV VIRTUAL_ENV=$HOME/.venv
 ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
-RUN python3 -m venv $VIRTUAL_ENV && \
-    pip install --no-cache --upgrade pip 'uv>=0.1.10' && \
-    uv pip install --no-cache --upgrade setuptools wheel python-ldap
+# Requests comes with it's own CA bundle, but we want to use always use the system CA bundle.
+ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
 
-# Note: Normally uv will create hardlinks from the cache directory to the venv.
-# In our docker files, we normally use `RUN --mount=type=cache,... uv pip install ...`,
-# which means the cache directory is on a separate filesystem. uv will emit a warning:
-# Failed to hardlink files; falling back to full copy. This may lead to degraded performance.
-# If the cache and target directories are on different filesystems, hardlinking may not be supported.
-# If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.
-
-ENTRYPOINT [ "/entrypoint.sh" ]
+FROM base-empty AS full-deps-prebuild
 
-FROM ${BASE_IMAGE} AS full-install
-
 USER 0
-RUN apt-get update && apt-get install -y -qq \
+RUN apt-get update && apt-get install --no-install-recommends -y -qq \
+    build-essential \
+    maven \
+    && rm -rf /var/lib/apt/lists/*
+USER datahub
+
+RUN uv pip install python-ldap==3.4.4
+RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
+    --mount=type=bind,source=./docker/datahub-ingestion/pyspark_jars.sh,target=/pyspark_jars.sh \
+    uv pip install python-ldap==3.4.4 pyspark~=3.5.0 && \
+    /pyspark_jars.sh
+
+FROM base-empty AS base-slim
+# Nothing to do here.
+
+FROM base-slim AS base-full
+
+USER 0
+RUN apt-get update && apt-get install --no-install-recommends -y -qq \
     default-jre-headless \
-    && rm -rf /var/lib/apt/lists/* /var/cache/apk/*
+    && rm -rf /var/lib/apt/lists/*
 
 RUN if [ $(arch) = "x86_64" ]; then \
     mkdir /opt/oracle && \
@@ -140,9 +146,13 @@ RUN if [ $(arch) = "x86_64" ]; then \
 
 USER datahub
 
-FROM ${BASE_IMAGE} AS slim-install
-# Do nothing else on top of base
+RUN --mount=from=full-deps-prebuild,source=$HOME/.venv,target=/venv-full \
+    rm -r .venv && \
+    cp -r /venv-full .venv
+# INLINE-END
 
-FROM ${APP_ENV}-install
+FROM base-${APP_ENV} AS final
 
+COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh
+ENTRYPOINT [ "/entrypoint.sh" ]
 ENV PATH="/datahub-ingestion/.local/bin:$PATH"
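The getent/groupmod/usermod block above exists because Ubuntu 24.04 images ship a default "ubuntu" user and group with UID/GID 1000, so the build adopts any existing UID/GID-1000 principal by renaming it to "datahub" rather than failing on a collision. A rough Python model of that shell logic (illustrative only; the image itself does this in shell):

    from typing import List, Optional

    def adopt_uid_gid_1000(existing_group: Optional[str], existing_user: Optional[str]) -> List[str]:
        # Returns the shell commands the RUN block would execute for a given
        # state of the UID/GID 1000 principal.
        cmds: List[str] = []
        if existing_group and existing_group != "datahub":
            cmds.append(f'groupmod -n datahub "{existing_group}"')
        elif not existing_group:
            cmds.append("addgroup --gid 1000 datahub")
        if existing_user and existing_user != "datahub":
            cmds.append(f'usermod -l datahub -d $HOME "{existing_user}"')
            cmds.append("usermod -g datahub datahub")
        elif not existing_user:
            cmds.append("adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub")
        return cmds

    # On a stock ubuntu:24.04 base, both lookups find the "ubuntu" principal:
    assert adopt_uid_gid_1000("ubuntu", "ubuntu")[0] == 'groupmod -n datahub "ubuntu"'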
docker/datahub-ingestion/Dockerfile
@@ -1,64 +1,184 @@
-# Defining environment
+# This image has two variants: full and slim.
+# The full variant has a larger set of ingestion sources, but is a larger image.
 ARG APP_ENV=full
-ARG BASE_IMAGE=acryldata/datahub-ingestion-base
-ARG DOCKER_VERSION=head-full
-ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
+ARG PYTHON_VERSION=3.10
+
+# INLINE-BEGIN @/docker/snippets/ingestion_base
+# This is the "base" image workflow.
+# While it has a bunch of intermediate stages, it "exports" a couple
+# stages for consumption.
+# - base-empty: A basic stage, with basic deps, Python, and a venv.
+# - base-slim: Currently the same as base-empty.
+# - base-full: Adds a JRE and Oracle client.
+
+FROM ubuntu:24.04 AS base-empty
+
+ARG PYTHON_VERSION
+RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set
+
+# TODO: This may not work on Ubuntu 24.04 due to the new deb822 package format.
 ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports
-ARG PIP_MIRROR_URL=https://pypi.python.org/simple
-
-FROM $BASE_IMAGE:$DOCKER_VERSION AS base
-
-# Optionally set corporate mirror for deb
-USER 0
-ARG DEBIAN_REPO_URL
-ARG UBUNTU_REPO_URL
-RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list ; fi
 RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" /etc/apt/sources.list ; fi
-USER datahub
 
-# Optionally set corporate mirror for pip
-ARG PIP_MIRROR_URL
-RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
+ENV HOME=/home/datahub
+RUN existing_group=$(getent group 1000 | cut -d: -f1) && \
+    if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \
+        echo "Renaming existing group $existing_group to datahub"; \
+        groupmod -n datahub "$existing_group"; \
+    elif [ -z "$existing_group" ]; then \
+        echo "Creating new group datahub with GID 1000"; \
+        addgroup --gid 1000 datahub; \
+    fi && \
+    existing_user=$(id -nu 1000 2>/dev/null || echo "") && \
+    if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \
+        echo "Renaming existing user $existing_user to datahub"; \
+        usermod -l datahub -d $HOME "$existing_user"; \
+        usermod -g datahub datahub; \
+    elif [ -z "$existing_user" ]; then \
+        echo "Creating new user datahub with UID 1000"; \
+        adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \
+    fi && \
+    # Create and set proper permissions for datahub directories
+    mkdir -p $HOME && \
+    chown -R datahub:datahub $HOME
+
+# Setup the PPA for alternative Python versions.
+# TODO: Eventually we should switch to using uv's support for python-build-standalone.
+RUN apt-get update && apt-get install -y \
+    software-properties-common \
+    lsb-release \
+    gnupg \
+    ca-certificates \
+    && add-apt-repository --no-update ppa:deadsnakes/ppa \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && apt-get install -y \
+    python${PYTHON_VERSION} \
+    python${PYTHON_VERSION}-venv \
+    python${PYTHON_VERSION}-dev \
+    python${PYTHON_VERSION}-distutils \
+    python3-pip \
+    python3-ldap \
+    python-is-python3 \
+    libldap2-dev \
+    libsasl2-dev \
+    libsasl2-modules \
+    libaio-dev \
+    libaio1t64 \
+    libsasl2-modules-gssapi-mit \
+    krb5-user \
+    krb5-config \
+    libkrb5-dev \
+    librdkafka-dev \
+    git \
+    wget \
+    curl \
+    zip \
+    unzip \
+    ldap-utils \
+    unixodbc \
+    libodbc2 \
+    && apt-get clean \
+    && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
+# Set the default python version.
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3 1
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ARG PIP_MIRROR_URL=https://pypi.python.org/simple
+RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url ${PIP_MIRROR_URL} ; fi
 ENV UV_INDEX_URL=${PIP_MIRROR_URL}
 
-COPY --chown=datahub ./metadata-ingestion /metadata-ingestion
-COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /metadata-ingestion/airflow-plugin
+USER datahub
+WORKDIR $HOME
+RUN uv venv --python "$PYTHON_VERSION"
+ENV VIRTUAL_ENV=$HOME/.venv
+ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
+# Requests comes with it's own CA bundle, but we want to use always use the system CA bundle.
+ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
 
-ARG RELEASE_VERSION
-WORKDIR /metadata-ingestion
-RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/_version.py && \
-    sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/_version.py && \
-    cat src/datahub/_version.py | grep __version__ && \
-    cat airflow-plugin/src/datahub_airflow_plugin/_version.py | grep __version__
+FROM base-empty AS full-deps-prebuild
 
-FROM base AS slim-install
-
-RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
-    UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]"
-
-FROM base AS full-install-build
-
 USER 0
-RUN apt-get update && apt-get install -y -qq maven
+RUN apt-get update && apt-get install --no-install-recommends -y -qq \
+    build-essential \
+    maven \
+    && rm -rf /var/lib/apt/lists/*
+USER datahub
+
+RUN uv pip install python-ldap==3.4.4
+RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
+    --mount=type=bind,source=./docker/datahub-ingestion/pyspark_jars.sh,target=/pyspark_jars.sh \
+    uv pip install python-ldap==3.4.4 pyspark~=3.5.0 && \
+    /pyspark_jars.sh
+
+FROM base-empty AS base-slim
+# Nothing to do here.
+
+FROM base-slim AS base-full
+
+USER 0
+RUN apt-get update && apt-get install --no-install-recommends -y -qq \
+    default-jre-headless \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN if [ $(arch) = "x86_64" ]; then \
+    mkdir /opt/oracle && \
+    cd /opt/oracle && \
+    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/2115000/instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    unzip instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    rm instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    sh -c "echo /opt/oracle/instantclient_21_15 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+    ldconfig; \
+    else \
+    mkdir /opt/oracle && \
+    cd /opt/oracle && \
+    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/1923000/instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    unzip instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    rm instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    sh -c "echo /opt/oracle/instantclient_19_23 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+    ldconfig; \
+    fi;
 
 USER datahub
-COPY ./docker/datahub-ingestion/pyspark_jars.sh .
 
-RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
-    UV_LINK_MODE=copy uv pip install -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \
-    ./pyspark_jars.sh && \
+RUN --mount=from=full-deps-prebuild,source=$HOME/.venv,target=/venv-full \
+    rm -r .venv && \
+    cp -r /venv-full .venv
+# INLINE-END
+
+FROM base-${APP_ENV} AS add-code
+
+COPY --chown=datahub ./metadata-ingestion /metadata-ingestion
+COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /airflow-plugin
+
+ARG RELEASE_VERSION
+RUN test -n "$RELEASE_VERSION" # RELEASE_VERSION is a required build arg
+RUN test -d /metadata-ingestion/src/datahub/metadata # codegen must be run prior to building the image
+RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$RELEASE_VERSION\"/" /metadata-ingestion/src/datahub/_version.py && \
+    sed -i.bak "s/__version__ = .*$/__version__ = \"$RELEASE_VERSION\"/" /airflow-plugin/src/datahub_airflow_plugin/_version.py && \
+    cat /metadata-ingestion/src/datahub/_version.py | grep __version__ && \
+    cat /airflow-plugin/src/datahub_airflow_plugin/_version.py | grep __version__
+
+FROM add-code AS install-slim
+
+RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
+    UV_LINK_MODE=copy uv pip install -e "/metadata-ingestion/[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \
     datahub --version
 
-FROM base AS full-install
+FROM add-code AS install-full
 
-COPY --from=full-install-build ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
+    UV_LINK_MODE=copy uv pip install \
+    -e "/metadata-ingestion/[base,all]" \
+    -e "/airflow-plugin/[plugin-v2]" && \
+    datahub --version
 
-FROM base AS dev-install
-# Dummy stage for development. Assumes code is built on your machine and mounted to this image.
-# See this excellent thread https://github.com/docker/cli/issues/1134
+FROM install-${APP_ENV} AS final
 
-FROM ${APP_ENV}-install AS final
+ENTRYPOINT [ "datahub" ]
-
-WORKDIR /datahub-ingestion
-
-USER datahub
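Because the final stage is now "FROM install-${APP_ENV} AS final", a single Dockerfile covers both variants, selected at build time via the APP_ENV build arg. A hedged sketch of building both variants locally (not from the commit; the tag names are made up):

    import subprocess

    # RELEASE_VERSION is required by the add-code stage; APP_ENV picks the
    # install-slim or install-full stage as the final image.
    for app_env in ("slim", "full"):
        subprocess.run(
            [
                "docker", "build",
                "-f", "docker/datahub-ingestion/Dockerfile",
                "--build-arg", f"APP_ENV={app_env}",
                "--build-arg", "RELEASE_VERSION=0.0.0.dev0",
                "-t", f"datahub-ingestion:dev-{app_env}",
                ".",
            ],
            check=True,
        )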
docker/datahub-ingestion/Dockerfile-slim-only (deleted)
@@ -1,31 +0,0 @@
-# Defining environment
-ARG BASE_IMAGE=acryldata/datahub-ingestion-base
-ARG DOCKER_VERSION=head-slim
-ARG PIP_MIRROR_URL=https://pypi.python.org/simple
-
-FROM $BASE_IMAGE:$DOCKER_VERSION as base
-USER datahub
-
-# Optionally set corporate mirror for apk and pip
-ARG PIP_MIRROR_URL
-RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
-ENV UV_INDEX_URL=${PIP_MIRROR_URL}
-
-COPY --chown=datahub ./metadata-ingestion /metadata-ingestion
-
-ARG RELEASE_VERSION
-WORKDIR /metadata-ingestion
-RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/_version.py && \
-    cat src/datahub/_version.py
-
-FROM base as slim-install
-
-RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
-    UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \
-    datahub --version
-
-FROM slim-install as final
-
-WORKDIR /datahub-ingestion
-
-USER datahub
docker/datahub-ingestion/build.gradle
@@ -16,14 +16,13 @@ ext {
 }
 
 dependencies {
-    project(':docker:datahub-ingestion-base')
     project(':metadata-ingestion')
 }
 
 docker {
-    dependsOn 'build', ':docker:datahub-ingestion-base:docker', ':metadata-ingestion:codegen'
+    dependsOn 'build', ':metadata-ingestion:codegen'
     name "${docker_registry}/${docker_repo}:${docker_version}"
-    dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}")
+    dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
     files fileTree(rootProject.projectDir) {
         include '.dockerignore'
         include "docker/${docker_dir}/*"
@@ -34,7 +33,7 @@ docker {
     }
 
     version "v${docker_version}"
-    def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", ''), BASE_IMAGE: "${docker_registry}/datahub-ingestion-base"]
+    def dockerBuildArgs = [RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')]
 
     // Add build args if they are defined (needed for some CI or enterprise environments)
     if (project.hasProperty('pipMirrorUrl')) {
docker/snippets/.gitignore (new file, 2 lines)
@@ -0,0 +1,2 @@
+# rendered from the template file
+/ingestion_base
docker/snippets/ingestion_base.template (new file, 46 lines)
@@ -0,0 +1,46 @@
+# This is the "base" image workflow.
+# While it has a bunch of intermediate stages, it "exports" a couple
+# stages for consumption.
+# - base-empty: A basic stage, with basic deps, Python, and a venv.
+# - base-slim: Currently the same as base-empty.
+# - base-full: Adds a JRE and Oracle client.
+
+FROM ubuntu:24.04 AS base-empty
+
+ARG PYTHON_VERSION
+RUN test -n "${PYTHON_VERSION}" # PYTHON_VERSION must be set
+
+# INLINE-BEGIN @/docker/snippets/ubuntu_mirror_setup
+# INLINE-END
+
+# INLINE-BEGIN @/docker/snippets/ubuntu_python_base
+# INLINE-END
+
+FROM base-empty AS full-deps-prebuild
+
+USER 0
+RUN apt-get update && apt-get install --no-install-recommends -y -qq \
+    build-essential \
+    maven \
+    && rm -rf /var/lib/apt/lists/*
+USER datahub
+
+RUN uv pip install python-ldap==3.4.4
+RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
+    --mount=type=bind,source=./docker/datahub-ingestion/pyspark_jars.sh,target=/pyspark_jars.sh \
+    uv pip install python-ldap==3.4.4 pyspark~=3.5.0 && \
+    /pyspark_jars.sh
+
+FROM base-empty AS base-slim
+# Nothing to do here.
+
+FROM base-slim AS base-full
+
+USER 0
+# INLINE-BEGIN @/docker/snippets/ingestion_full_deps
+# INLINE-END
+USER datahub
+
+RUN --mount=from=full-deps-prebuild,source=$HOME/.venv,target=/venv-full \
+    rm -r .venv && \
+    cp -r /venv-full .venv
docker/snippets/ingestion_full_deps (new file, 21 lines)
@@ -0,0 +1,21 @@
+RUN apt-get update && apt-get install --no-install-recommends -y -qq \
+    default-jre-headless \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN if [ $(arch) = "x86_64" ]; then \
+    mkdir /opt/oracle && \
+    cd /opt/oracle && \
+    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/2115000/instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    unzip instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    rm instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    sh -c "echo /opt/oracle/instantclient_21_15 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+    ldconfig; \
+    else \
+    mkdir /opt/oracle && \
+    cd /opt/oracle && \
+    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/1923000/instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    unzip instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    rm instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    sh -c "echo /opt/oracle/instantclient_19_23 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+    ldconfig; \
+    fi;
docker/snippets/ubuntu_mirror_setup (new file, 3 lines)
@@ -0,0 +1,3 @@
+# TODO: This may not work on Ubuntu 24.04 due to the new deb822 package format.
+ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports
+RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" /etc/apt/sources.list ; fi
docker/snippets/ubuntu_python_base (new file, 80 lines)
@@ -0,0 +1,80 @@
+
+ENV HOME=/home/datahub
+RUN existing_group=$(getent group 1000 | cut -d: -f1) && \
+    if [ -n "$existing_group" ] && [ "$existing_group" != "datahub" ]; then \
+        echo "Renaming existing group $existing_group to datahub"; \
+        groupmod -n datahub "$existing_group"; \
+    elif [ -z "$existing_group" ]; then \
+        echo "Creating new group datahub with GID 1000"; \
+        addgroup --gid 1000 datahub; \
+    fi && \
+    existing_user=$(id -nu 1000 2>/dev/null || echo "") && \
+    if [ -n "$existing_user" ] && [ "$existing_user" != "datahub" ]; then \
+        echo "Renaming existing user $existing_user to datahub"; \
+        usermod -l datahub -d $HOME "$existing_user"; \
+        usermod -g datahub datahub; \
+    elif [ -z "$existing_user" ]; then \
+        echo "Creating new user datahub with UID 1000"; \
+        adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \
+    fi && \
+    # Create and set proper permissions for datahub directories
+    mkdir -p $HOME && \
+    chown -R datahub:datahub $HOME
+
+
+# Setup the PPA for alternative Python versions.
+# TODO: Eventually we should switch to using uv's support for python-build-standalone.
+RUN apt-get update && apt-get install -y \
+    software-properties-common \
+    lsb-release \
+    gnupg \
+    ca-certificates \
+    && add-apt-repository --no-update ppa:deadsnakes/ppa \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && apt-get install -y \
+    python${PYTHON_VERSION} \
+    python${PYTHON_VERSION}-venv \
+    python${PYTHON_VERSION}-dev \
+    python${PYTHON_VERSION}-distutils \
+    python3-pip \
+    python3-ldap \
+    python-is-python3 \
+    libldap2-dev \
+    libsasl2-dev \
+    libsasl2-modules \
+    libaio-dev \
+    libaio1t64 \
+    libsasl2-modules-gssapi-mit \
+    krb5-user \
+    krb5-config \
+    libkrb5-dev \
+    librdkafka-dev \
+    git \
+    wget \
+    curl \
+    zip \
+    unzip \
+    ldap-utils \
+    unixodbc \
+    libodbc2 \
+    && apt-get clean \
+    && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
+# Set the default python version.
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3 1
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ARG PIP_MIRROR_URL=https://pypi.python.org/simple
+RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url ${PIP_MIRROR_URL} ; fi
+ENV UV_INDEX_URL=${PIP_MIRROR_URL}
+
+USER datahub
+WORKDIR $HOME
+RUN uv venv --python "$PYTHON_VERSION"
+ENV VIRTUAL_ENV=$HOME/.venv
+ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
+# Requests comes with it's own CA bundle, but we want to use always use the system CA bundle.
+ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
python-build/generate_ingestion_docker.py (new file, 87 lines)
@@ -0,0 +1,87 @@
+import re
+import sys
+from pathlib import Path
+from typing import Optional
+
+_repo_root = Path(__file__).parent.parent
+assert (_repo_root / ".git").exists(), "Unable to find git repo root"
+
+
+def _load_file(path: str, context_dir: Path) -> str:
+    if path.startswith("@/"):
+        resolved_path = Path(_repo_root / path[2:])
+        return resolved_path.read_text()
+    else:
+        raise ValueError(
+            f"Only repo-rooted paths, which have the '@/' prefix, are supported: got {path}"
+        )
+
+
+def update_template(
+    template_file: Path,
+    outfile: Optional[Path] = None,
+    check_only: bool = False,
+) -> None:
+    """
+    Update a template file in-place, injecting content from files referenced in inline directives.
+
+    Args:
+        template_file: Path to the template file that will be modified
+    """
+
+    render_mode = bool(outfile)
+
+    subs = 0
+    content = template_file.read_text()
+
+    def handle_multiline(match: re.Match) -> str:
+        nonlocal subs
+        subs += 1
+
+        path = match.group(2)
+        replacement = _load_file(path, template_file.parent).strip()
+        replacement = replacement.strip() + "\n"
+
+        if render_mode:
+            return f"{replacement}"
+        else:
+            return f"{match.group(1)}{replacement}{match.group(3)}"
+
+    # Handle multiline inline directives
+    content = re.sub(
+        r"^([ \t]*# INLINE-BEGIN (.*?)\n).*?^([ \t]*# INLINE-END)$",
+        handle_multiline,
+        content,
+        flags=re.DOTALL | re.MULTILINE,
+    )
+
+    # if subs == 0:
+    #     raise ValueError(f"No templates found in {template_file}")
+
+    output = outfile or template_file
+    if check_only:
+        if output.read_text() != content:
+            print(f"ERROR: {template_file} is out of date")
+            sys.exit(1)
+    else:
+        print(f"Applied {subs} substitutions while processing {template_file}")
+        output.write_text(content)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1 and sys.argv[1] == "--check":
+        check_only = True
+    else:
+        check_only = False
+
+    update_template(
+        Path(_repo_root / "docker/snippets/ingestion_base.template"),
+        outfile=Path(_repo_root / "docker/snippets/ingestion_base"),
+        check_only=check_only,
+    )
+
+    for file in [
+        "docker/datahub-ingestion-base/Dockerfile",
+        "docker/datahub-ingestion/Dockerfile",
+    ]:
+        update_template(Path(_repo_root / file), check_only=check_only)
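A self-contained sketch of the substitution update_template performs on the INLINE-BEGIN/INLINE-END markers, using the same regex. TEMPLATE and SNIPPET here are toy values; the real script resolves the "@/" path against the repo root and reads the snippet file from disk:

    import re

    TEMPLATE = """\
    FROM ubuntu:24.04 AS base-empty
    # INLINE-BEGIN @/docker/snippets/example
    stale content from a previous render
    # INLINE-END
    """

    SNIPPET = "RUN echo hello"

    def handle_multiline(match: re.Match) -> str:
        # In update-in-place mode the markers are kept and only the body
        # between them is replaced; render mode would drop the markers.
        return f"{match.group(1)}{SNIPPET.strip()}\n{match.group(3)}"

    updated = re.sub(
        r"^([ \t]*# INLINE-BEGIN (.*?)\n).*?^([ \t]*# INLINE-END)$",
        handle_multiline,
        TEMPLATE,
        flags=re.DOTALL | re.MULTILINE,
    )
    print(updated)  # body between the markers is now "RUN echo hello"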