mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-07 01:00:41 +00:00
42 lines
1.3 KiB
Docker
42 lines
1.3 KiB
Docker
# Base image: CPython 3.9 on Debian bookworm. The Debian release is pinned
# explicitly because the apt steps further down use apt-key and
# software-properties-common, whose availability depends on the release.
FROM python:3.9-bookworm
|
|
|
|
# Location of the shared workspace inside the image; override at build time
# with --build-arg shared_workspace=<path>.
ARG shared_workspace=/opt/workspace

# Re-export the build argument as an environment variable so the same path is
# also set in containers started from this image.
ENV SHARED_WORKSPACE="${shared_workspace}"
|
|
|
|
# -- Layer: Apache Spark
#
# Installs the Azul Zulu 17 JRE from Azul's apt repository and unpacks the
# requested Spark distribution into
# /usr/bin/spark-<spark_version>-bin-hadoop<hadoop_version>.
# Both versions can be overridden with --build-arg.

ARG spark_version=3.2.0
ARG hadoop_version=2.7

# Everything runs in a single layer so the apt lists, the downloaded .deb and
# the Spark tarball are deleted in the same layer that created them.
# curl is called with -f so an HTTP error aborts the build instead of saving
# the error page as the .deb/.tgz, and with -L so redirects are followed.
# The local .deb is installed with -y so apt never waits for confirmation.
# NOTE(review): apt-key is deprecated upstream; revisit the key import when the
# base image moves to a newer Debian release.
# Debug aid: run `apt-cache search zulu` after the second `apt-get update` to
# list the packages offered by the Azul repository.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        curl \
        gnupg \
        software-properties-common && \
    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 0xB1998361219BD9C9 && \
    curl -fsSL https://cdn.azul.com/zulu/bin/zulu-repo_1.0.0-3_all.deb -o /tmp/zulu-repo_1.0.0-3_all.deb && \
    apt-get install -y /tmp/zulu-repo_1.0.0-3_all.deb && \
    apt-get update && \
    apt-get install -y --no-install-recommends zulu17-jre && \
    apt-get clean && \
    curl -fsSL "https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz" -o spark.tgz && \
    tar -xf spark.tgz && \
    mv "spark-${spark_version}-bin-hadoop${hadoop_version}" /usr/bin/ && \
    mkdir "/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs" && \
    rm spark.tgz && \
    rm -rf /var/tmp/* /tmp/* /var/lib/apt/lists/*
|
|
|
|
# Install JPype1 with pip. --no-cache-dir keeps pip's download cache out of
# the image layer.
# NOTE(review): JPype1 is unpinned, so rebuilds may pick up a newer release;
# consider pinning a version once a known-good one is agreed on.
RUN pip install --no-cache-dir JPype1
|
|
|
|
# Spark runtime configuration, written in key=value form (the space-separated
# `ENV key value` form is deprecated legacy syntax).
# SPARK_HOME is the directory the unpacked Spark distribution was moved to.
ENV SPARK_HOME=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version} \
    SPARK_MASTER_HOST=spark-master \
    SPARK_MASTER_PORT=7077 \
    PYSPARK_PYTHON=python3.9

# Kept as a separate instruction: a variable set by an ENV instruction is only
# substituted in later instructions, so SPARK_HOME must already be defined here.
ENV PATH=$PATH:$SPARK_HOME/bin
|
|
|
|
# Copy the build context's `workspace` path to the shared workspace location.
COPY workspace ${SHARED_WORKSPACE}

# Later instructions and container processes start in the Spark install directory.
WORKDIR $SPARK_HOME
|
|
|