mirror of
https://github.com/datahub-project/datahub.git
synced 2025-06-27 05:03:31 +00:00
185 lines
6.8 KiB
Docker
185 lines
6.8 KiB
Docker
# This image has two variants: full and slim.
|
|
# The full variant has a larger set of ingestion sources, but is a larger image.
|
|
# Selects the image variant: the add-code stage builds on base-${APP_ENV}
# and the final stage on install-${APP_ENV}.
ARG APP_ENV="full"

# Python version to install. It is declared before the first FROM, so a stage
# that needs the value has to re-declare the ARG.
ARG PYTHON_VERSION="3.10"
|
|
|
|
# INLINE-BEGIN @/docker/snippets/ingestion_base
|
|
# This is the "base" image workflow.
|
|
# While it has a bunch of intermediate stages, it "exports" a couple
|
|
# stages for consumption.
|
|
# - base-empty: A basic stage, with basic deps, Python, and a venv.
|
|
# - base-slim: Currently the same as base-empty.
|
|
# - base-full: Adds a JRE and Oracle client.
|
|
|
|
FROM ubuntu:24.04 AS base-empty

# Re-declare the global build arg so its value is visible inside this stage.
ARG PYTHON_VERSION
# Fail the build right away when the value is empty.
RUN [ -n "${PYTHON_VERSION}" ] # PYTHON_VERSION must be set
|
|
|
|
# Optionally point apt at a mirror of the Ubuntu ports archive.
# Ubuntu 24.04 keeps its archive URIs in the deb822 file
# /etc/apt/sources.list.d/ubuntu.sources rather than in /etc/apt/sources.list,
# so the rewrite is applied to whichever of the two files exist.
# Only occurrences of the ports.ubuntu.com URL are replaced.
ARG UBUNTU_REPO_URL=http://ports.ubuntu.com/ubuntu-ports
RUN if [ "${UBUNTU_REPO_URL}" != "http://ports.ubuntu.com/ubuntu-ports" ] ; then \
        for sources_file in /etc/apt/sources.list /etc/apt/sources.list.d/ubuntu.sources ; do \
            if [ -f "${sources_file}" ] ; then \
                sed -i "s#http.*://ports.ubuntu.com/ubuntu-ports#${UBUNTU_REPO_URL}#g" "${sources_file}" ; \
            fi ; \
        done ; \
    fi
|
|
|
|
|
|
# Home directory of the unprivileged "datahub" account.
ENV HOME=/home/datahub

# Make sure GID 1000 and UID 1000 are both named "datahub": create the
# group/user when the id is free, otherwise rename whatever currently holds it.
RUN gid_owner="$(getent group 1000 | cut -d: -f1)" && \
    if [ -z "${gid_owner}" ]; then \
        echo "Creating new group datahub with GID 1000"; \
        addgroup --gid 1000 datahub; \
    elif [ "${gid_owner}" != "datahub" ]; then \
        echo "Renaming existing group ${gid_owner} to datahub"; \
        groupmod -n datahub "${gid_owner}"; \
    fi && \
    uid_owner="$(id -nu 1000 2>/dev/null || true)" && \
    if [ -z "${uid_owner}" ]; then \
        echo "Creating new user datahub with UID 1000"; \
        adduser --disabled-password --uid 1000 --gid 1000 --home $HOME datahub; \
    elif [ "${uid_owner}" != "datahub" ]; then \
        echo "Renaming existing user ${uid_owner} to datahub"; \
        # Change the login name and home path first, then the primary group.
        usermod -l datahub -d $HOME "${uid_owner}"; \
        usermod -g datahub datahub; \
    fi && \
    # The home directory must exist and be owned by datahub.
    mkdir -p $HOME && \
    chown -R datahub:datahub $HOME
|
|
|
|
|
|
# Register the deadsnakes PPA, which provides alternative Python versions.
# The package index is not refreshed after adding it (--no-update), and the
# downloaded apt lists are removed at the end of this layer.
# TODO: Eventually we should switch to using uv's support for python-build-standalone.
RUN apt-get update \
    && apt-get install --yes \
        ca-certificates \
        gnupg \
        lsb-release \
        software-properties-common \
    && add-apt-repository --no-update ppa:deadsnakes/ppa \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Install the requested Python interpreter plus the system libraries and CLI
# tools needed at build/run time (LDAP, SASL, Kerberos, Kafka, ODBC, libaio).
# Cleanup names the apt list directory explicitly: RUN uses /bin/sh, which does
# not perform brace expansion, and the dpkg database must stay in place because
# later stages install more packages.
RUN apt-get update && apt-get install -y \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-dev \
        python${PYTHON_VERSION}-distutils \
        python${PYTHON_VERSION}-venv \
        python-is-python3 \
        python3-ldap \
        python3-pip \
        curl \
        git \
        krb5-config \
        krb5-user \
        ldap-utils \
        libaio-dev \
        libaio1t64 \
        libkrb5-dev \
        libldap2-dev \
        libodbc2 \
        librdkafka-dev \
        libsasl2-dev \
        libsasl2-modules \
        libsasl2-modules-gssapi-mit \
        unixodbc \
        unzip \
        wget \
        zip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Make python3 resolve to the requested interpreter version, and python to python3.
RUN set -e; \
    update-alternatives --install /usr/bin/python3 python3 "/usr/bin/python${PYTHON_VERSION}" 1; \
    update-alternatives --install /usr/bin/python python /usr/bin/python3 1
|
|
|
|
# Copy the uv and uvx binaries from the upstream uv image into /bin.
# NOTE(review): the `latest` tag is not pinned, so the uv version can differ
# between builds. Consider pinning a version tag or digest.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
|
|
|
|
# Optional package index mirror. When it differs from the default index it is
# written to pip's global config; uv always receives it through UV_INDEX_URL.
ARG PIP_MIRROR_URL=https://pypi.python.org/simple
# The URL is quoted so the shell passes it to pip as one argument, without
# word splitting or pathname expansion.
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then uvx --no-cache pip config set global.index-url "${PIP_MIRROR_URL}" ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}
|
|
|
|
# From here on, instructions run as the datahub user inside its home directory.
USER datahub
WORKDIR ${HOME}

# Create a virtualenv with uv for the requested Python version, then expose it
# to later instructions via VIRTUAL_ENV and by putting its bin directory first
# on PATH.
RUN uv venv --python "${PYTHON_VERSION}"
ENV VIRTUAL_ENV=${HOME}/.venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

# Requests comes with its own CA bundle, but we always want to use the system CA bundle.
ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
|
|
|
|
|
|
# Stage that populates $HOME/.venv with python-ldap and pyspark; the base-full
# stage mounts that directory from here.
FROM base-empty AS full-deps-prebuild

# Package installation needs root; switch back to datahub afterwards.
USER 0
RUN apt-get update \
    && apt-get install --no-install-recommends --yes -qq \
        build-essential \
        maven \
    && rm -rf /var/lib/apt/lists/*
USER datahub

RUN uv pip install python-ldap==3.4.4
# Install with the uv cache mounted (owned by UID/GID 1000), then run the
# pyspark_jars.sh script that is bind-mounted from the build context.
RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
    --mount=type=bind,source=./docker/datahub-ingestion/pyspark_jars.sh,target=/pyspark_jars.sh \
    uv pip install python-ldap==3.4.4 pyspark~=3.5.0 \
    && /pyspark_jars.sh
|
|
|
|
# Exported stage for the slim variant; it adds no instructions on top of base-empty.
FROM base-empty AS base-slim
|
|
# Nothing to do here.
|
|
|
|
# Exported stage for the full variant: base-slim plus a headless JRE (and the
# additions made by the instructions that follow).
FROM base-slim AS base-full

# Root is needed for the system-level installs below.
USER 0
RUN apt-get update \
    && apt-get install --no-install-recommends --yes -qq \
        default-jre-headless \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Install the Oracle Instant Client under /opt/oracle and register its library
# directory with the dynamic linker. x86_64 gets release 21.15; every other
# architecture is treated as arm64 and gets release 19.23.
# Only the release number, archive name and unpacked directory differ per
# architecture, so they are chosen first and the download steps are shared.
RUN if [ "$(arch)" = "x86_64" ]; then \
        oracle_build=2115000; \
        oracle_zip=instantclient-basic-linux.x64-21.15.0.0.0dbru.zip; \
        oracle_dir=instantclient_21_15; \
    else \
        oracle_build=1923000; \
        oracle_zip=instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip; \
        oracle_dir=instantclient_19_23; \
    fi && \
    mkdir /opt/oracle && \
    cd /opt/oracle && \
    wget --no-verbose -c "https://download.oracle.com/otn_software/linux/instantclient/${oracle_build}/${oracle_zip}" && \
    unzip "${oracle_zip}" && \
    rm "${oracle_zip}" && \
    echo "/opt/oracle/${oracle_dir}" > /etc/ld.so.conf.d/oracle-instantclient.conf && \
    ldconfig
|
|
|
|
USER datahub

# Replace this stage's .venv with a copy of the one built in full-deps-prebuild,
# which is mounted at /venv-full for the duration of this instruction.
RUN --mount=from=full-deps-prebuild,source=$HOME/.venv,target=/venv-full \
    rm -r .venv \
    && cp -r /venv-full .venv
|
|
# INLINE-END
|
|
|
|
# Adds the ingestion sources on top of the base variant selected by APP_ENV.
FROM base-${APP_ENV} AS add-code

COPY --chown=datahub ./metadata-ingestion /metadata-ingestion
COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /airflow-plugin

ARG RELEASE_VERSION
RUN [ -n "${RELEASE_VERSION}" ] # RELEASE_VERSION is a required build arg
RUN [ -d /metadata-ingestion/src/datahub/metadata ] # codegen must be run prior to building the image

# Stamp RELEASE_VERSION into both packages' _version.py (sed keeps a .bak copy
# of each file), then print the resulting __version__ lines to the build log.
RUN sed -i.bak "s/__version__ = .*$/__version__ = \"$RELEASE_VERSION\"/" \
        /metadata-ingestion/src/datahub/_version.py \
        /airflow-plugin/src/datahub_airflow_plugin/_version.py \
    && grep __version__ /metadata-ingestion/src/datahub/_version.py \
    && grep __version__ /airflow-plugin/src/datahub_airflow_plugin/_version.py
|
|
|
|
# Slim variant: editable install of metadata-ingestion with a fixed list of extras.
FROM add-code AS install-slim

# UV_LINK_MODE=copy is set for the install command only. Running
# `datahub --version` afterwards fails the build if the CLI cannot start.
RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
    UV_LINK_MODE=copy uv pip install \
        --editable "/metadata-ingestion/[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" \
    && datahub --version
|
|
|
|
# Full variant: editable installs of metadata-ingestion with the "base" and
# "all" extras and of the airflow plugin with the "plugin-v2" extra.
FROM add-code AS install-full

# UV_LINK_MODE=copy is set for the install command only. Running
# `datahub --version` afterwards fails the build if the CLI cannot start.
RUN --mount=type=cache,target=$HOME/.cache/uv,uid=1000,gid=1000 \
    UV_LINK_MODE=copy uv pip install \
        --editable "/metadata-ingestion/[base,all]" \
        --editable "/airflow-plugin/[plugin-v2]" \
    && datahub --version
|
|
|
|
# Final image: the install stage that matches APP_ENV.
FROM install-${APP_ENV} AS final

# Exec-form entrypoint: the container runs the datahub CLI directly.
ENTRYPOINT ["datahub"]
|