datahub/docker/datahub-ingestion/pyspark_jars.sh

35 lines
1.5 KiB
Bash
Executable File

#!/bin/bash
set -ex
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"
function replace_jar {
JAR_PREFIX=$1
TRANSITIVE=$2
DEPENDENCY=$3
echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
rm -r "$HOME/.m2" || true
if [ ! -z "$DEPENDENCY" ]; then
echo "Resolving $DEPENDENCY"
mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null
echo "Moving jars to $PYSPARK_JARS"
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
fi
}
replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"