From e203ece20c4c79c75f104c46e863bbbe9c6198b5 Mon Sep 17 00:00:00 2001 From: Matias Puerta Date: Wed, 2 Aug 2023 14:36:27 +0200 Subject: [PATCH] [CHORE] Reduce docker image size by removing pip cache (#12708) * [CHORE] Reduce docker image size by removing pip cache * [CHORE] Reduce image size for ingestion/operators as well * [CHORE] Reduce image size for CI --- ingestion/Dockerfile | 37 ++++++++++++--------- ingestion/Dockerfile.ci | 32 ++++++++++++------- ingestion/operators/docker/Dockerfile | 35 ++++++++++++-------- ingestion/operators/docker/Dockerfile-dev | 39 +++++++++++++---------- 4 files changed, 87 insertions(+), 56 deletions(-) diff --git a/ingestion/Dockerfile b/ingestion/Dockerfile index 48c216a8e1b..379e2591bb6 100644 --- a/ingestion/Dockerfile +++ b/ingestion/Dockerfile @@ -1,10 +1,12 @@ FROM apache/airflow:2.6.3-python3.9 USER root -RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - -RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list +RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add - +RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list # Install Dependencies (listed in alphabetical order) -RUN apt-get update \ - && apt-get install -y alien \ +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get -qq update \ + && apt-get -qq install -y \ + alien \ build-essential \ default-libmysqlclient-dev \ freetds-bin \ @@ -31,7 +33,6 @@ RUN apt-get update \ unixodbc \ unixodbc-dev \ unzip \ - vim \ wget --no-install-recommends \ # Accept MSSQL ODBC License && ACCEPT_EULA=Y apt-get install -y msodbcsql18 \ @@ -39,11 +40,11 @@ RUN apt-get update \ RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \ then \ - wget https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \ - unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ + wget -q https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \ + unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ else \ - wget https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \ - unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ + wget -q https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \ + unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ fi ENV LD_LIBRARY_PATH=/instantclient @@ -55,25 +56,31 @@ ENV LD_LIBRARY_PATH=/instantclient # https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852 # https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list -RUN apt-get update \ - && apt-get install -t bullseye-backports -y \ +RUN apt-get -qq update \ + && apt-get -qq install -t bullseye-backports -y \ curl \ libpcre2-8-0 \ postgresql-common \ expat \ - bind9 + bind9 \ + && rm -rf /var/lib/apt/lists/* # Required for Starting Ingestion Container in Docker Compose -COPY --chown=airflow:0 ingestion/ingestion_dependency.sh /opt/airflow +COPY --chown=airflow:0 --chmod=775 ingestion/ingestion_dependency.sh /opt/airflow # Required for Ingesting Sample Data COPY --chown=airflow:0 ingestion/examples/sample_data /home/airflow/ingestion/examples/sample_data # Required for Airflow DAGs of Sample Data COPY --chown=airflow:0 ingestion/examples/airflow/dags /opt/airflow/dags -# Provide Execute Permissions to shell script -RUN chmod +x /opt/airflow/ingestion_dependency.sh USER airflow # Argument to provide for Ingestion Dependencies to install. Defaults to all ARG INGESTION_DEPENDENCY="all" + +# Disable pip cache dir +# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching +ENV PIP_NO_CACHE_DIR=1 +# Make pip silent +ENV PIP_QUIET=1 + RUN pip install --upgrade pip RUN pip install "openmetadata-managed-apis~=1.1.0.4" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.9.txt" RUN pip install "openmetadata-ingestion[${INGESTION_DEPENDENCY}]~=1.1.0.4" diff --git a/ingestion/Dockerfile.ci b/ingestion/Dockerfile.ci index 3d15d480a69..6931b899f21 100644 --- a/ingestion/Dockerfile.ci +++ b/ingestion/Dockerfile.ci @@ -1,10 +1,11 @@ FROM apache/airflow:2.6.3-python3.9 USER root -RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - -RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list +RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add - +RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list # Install Dependencies (listed in alphabetical order) -RUN apt-get update \ - && apt-get install -y alien \ +RUN apt-get -qq update \ + && apt-get -qq install -y \ + alien \ build-essential \ default-libmysqlclient-dev \ freetds-bin \ @@ -34,7 +35,7 @@ RUN apt-get update \ vim \ wget --no-install-recommends \ # Accept MSSQL ODBC License - && ACCEPT_EULA=Y apt-get install -y msodbcsql18 \ + && ACCEPT_EULA=Y apt-get -qq install -y msodbcsql18 \ && rm -rf /var/lib/apt/lists/* RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \ @@ -55,8 +56,8 @@ ENV LD_LIBRARY_PATH=/instantclient # https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852 # https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list -RUN apt-get update \ - && apt-get install -t bullseye-backports -y \ +RUN apt-get -qq update \ + && apt-get -qq install -t bullseye-backports -y \ curl \ libpcre2-8-0 \ postgresql-common \ @@ -64,24 +65,32 @@ RUN apt-get update \ bind9 # Required for Starting Ingestion Container in Docker Compose -COPY --chown=airflow:0 ingestion/ingestion_dependency.sh /opt/airflow +# Provide Execute Permissions to shell script +COPY --chown=airflow:0 --chmod=775 ingestion/ingestion_dependency.sh /opt/airflow # Required for Ingesting Sample Data COPY --chown=airflow:0 ingestion /home/airflow/ingestion COPY --chown=airflow:0 openmetadata-airflow-apis /home/airflow/openmetadata-airflow-apis # Required for Airflow DAGs of Sample Data COPY --chown=airflow:0 ingestion/examples/airflow/dags /opt/airflow/dags -# Provide Execute Permissions to shell script -RUN chmod +x /opt/airflow/ingestion_dependency.sh + USER airflow ARG AIRFLOW_CONSTRAINTS_LOCATION="https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.9.txt" -# Argument to provide for Ingestion Dependencies to install. Defaults to all + +# Disable pip cache dir +# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching +ENV PIP_NO_CACHE_DIR=1 +# Make pip silent +ENV PIP_QUIET=1 + RUN pip install --upgrade pip WORKDIR /home/airflow/openmetadata-airflow-apis RUN pip install "." WORKDIR /home/airflow/ingestion + +# Argument to provide for Ingestion Dependencies to install. Defaults to all ARG INGESTION_DEPENDENCY="all" RUN pip install ".[${INGESTION_DEPENDENCY}]" @@ -105,6 +114,7 @@ RUN pip uninstall psycopg2-binary -y RUN pip install psycopg2 mysqlclient==2.1.1 # Make required folders for openmetadata-airflow-apis RUN mkdir -p /opt/airflow/dag_generated_configs + EXPOSE 8080 # This is required as it's responsible to create airflow.cfg file RUN airflow db init && rm -f /opt/airflow/airflow.db diff --git a/ingestion/operators/docker/Dockerfile b/ingestion/operators/docker/Dockerfile index 68da6a96805..1bdc6580702 100644 --- a/ingestion/operators/docker/Dockerfile +++ b/ingestion/operators/docker/Dockerfile @@ -1,11 +1,12 @@ FROM python:3.9-bullseye -RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - -RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list +RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add - +RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list # Install Dependencies (listed in alphabetical order) -RUN apt-get update \ - && apt-get install -y alien \ +RUN apt-get -qq update \ + && apt-get -qq install -y \ + alien \ build-essential \ default-libmysqlclient-dev \ freetds-bin \ @@ -34,24 +35,24 @@ RUN apt-get update \ unzip \ wget --no-install-recommends \ # Accept MSSQL ODBC License - && ACCEPT_EULA=Y apt-get install -y msodbcsql18 \ + && ACCEPT_EULA=Y apt-get -qq install -y msodbcsql18 \ && rm -rf /var/lib/apt/lists/* # Add updated postgres/redshift dependencies based on libq RUN curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - RUN echo "deb https://apt.postgresql.org/pub/repos/apt/ buster-pgdg main" > /etc/apt/sources.list.d/pgdg.list; \ - apt-get update; \ - apt-get install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \ - apt-get autoremove -yqq --purge; \ - apt-get clean && rm -rf /var/lib/apt/lists/* + apt-get -qq update; \ + apt-get -qq install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \ + apt-get -qq autoremove -yqq --purge; \ + apt-get -qq clean && rm -rf /var/lib/apt/lists/* RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \ then \ - wget https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \ - unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ + wget -q https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \ + unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ else \ - wget https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \ - unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ + wget -q https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \ + unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ fi ENV LD_LIBRARY_PATH=/instantclient @@ -63,7 +64,7 @@ ENV LD_LIBRARY_PATH=/instantclient # https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852 # https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list -RUN apt-get update \ +RUN apt-get -qq update \ && apt-get install -t bullseye-backports -y \ curl \ libpcre2-8-0 \ @@ -76,6 +77,12 @@ WORKDIR ingestion/ # Required for Airflow DockerOperator, as we need to run the workflows from a `python main.py` command in the container. COPY ingestion/operators/docker/*.py . +# Disable pip cache dir +# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching +ENV PIP_NO_CACHE_DIR=1 +# Make pip silent +ENV PIP_QUIET=1 + RUN pip install --upgrade pip ARG INGESTION_DEPENDENCY="all" diff --git a/ingestion/operators/docker/Dockerfile-dev b/ingestion/operators/docker/Dockerfile-dev index ccd12475435..d4f069405d5 100644 --- a/ingestion/operators/docker/Dockerfile-dev +++ b/ingestion/operators/docker/Dockerfile-dev @@ -1,11 +1,12 @@ FROM python:3.9-bullseye -RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - -RUN curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list +RUN curl -sS https://packages.microsoft.com/keys/microsoft.asc | apt-key add - +RUN curl -sS https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list # Install Dependencies (listed in alphabetical order) -RUN apt-get update \ - && apt-get install -y alien \ +RUN apt-get -qq update \ + && apt-get -qq install -y \ + alien \ build-essential \ default-libmysqlclient-dev \ freetds-bin \ @@ -35,24 +36,24 @@ RUN apt-get update \ vim \ wget --no-install-recommends \ # Accept MSSQL ODBC License - && ACCEPT_EULA=Y apt-get install -y msodbcsql18 \ + && ACCEPT_EULA=Y apt-get -qq install -y msodbcsql18 \ && rm -rf /var/lib/apt/lists/* # Add updated postgres/redshift dependencies based on libq -RUN curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - +RUN curl -sS https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - RUN echo "deb https://apt.postgresql.org/pub/repos/apt/ buster-pgdg main" > /etc/apt/sources.list.d/pgdg.list; \ - apt-get update; \ - apt-get install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \ - apt-get autoremove -yqq --purge; \ - apt-get clean && rm -rf /var/lib/apt/lists/* + apt-get -qq update; \ + apt-get -qq install --no-install-recommends -y libpq-dev postgresql-client postgresql-common postgresql postgresql-contrib; \ + apt-get -qq autoremove -yqq --purge; \ + apt-get -qq clean && rm -rf /var/lib/apt/lists/* RUN if [[ $(uname -m) == "arm64" || $(uname -m) == "aarch64" ]]; \ then \ - wget https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \ - unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ + wget -q https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip -O /oracle-instantclient.zip && \ + unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ else \ - wget https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \ - unzip -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ + wget -q https://download.oracle.com/otn_software/linux/instantclient/1917000/instantclient-basic-linux.x64-19.17.0.0.0dbru.zip -O /oracle-instantclient.zip && \ + unzip -qq -d /instantclient -j /oracle-instantclient.zip && rm -f /oracle-instantclient.zip; \ fi ENV LD_LIBRARY_PATH=/instantclient @@ -64,8 +65,8 @@ ENV LD_LIBRARY_PATH=/instantclient # https://security.snyk.io/vuln/SNYK-DEBIAN11-BIND9-3027852 # https://security.snyk.io/vuln/SNYK-DEBIAN11-EXPAT-3023031 we are already installed the latest RUN echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/backports.list -RUN apt-get update \ - && apt-get install -t bullseye-backports -y \ +RUN apt-get -qq update \ + && apt-get -qq install -t bullseye-backports -y \ curl \ libpcre2-8-0 \ postgresql-common \ @@ -77,6 +78,12 @@ WORKDIR ingestion/ # For the dev build, we copy all files COPY ingestion/ . +# Disable pip cache dir +# https://pip.pypa.io/en/stable/topics/caching/#avoiding-caching +ENV PIP_NO_CACHE_DIR=1 +# Make pip silent +ENV PIP_QUIET=1 + RUN pip install --upgrade pip setuptools ARG INGESTION_DEPENDENCY="all"