mirror of
https://github.com/datahub-project/datahub.git
synced 2025-06-27 05:03:31 +00:00
fix(datahub-ingestion): prevent transitive deps, bump additional pyspa… (#9233)
This commit is contained in:
parent
3844b78fa2
commit
ff90fb633d
@ -1,6 +1,7 @@
|
||||
**/node_modules/
|
||||
datahub-frontend/build/
|
||||
metadata-ingestion/venv/
|
||||
*/build/
|
||||
*/*/build/
|
||||
*/venv/
|
||||
out
|
||||
**/*.class
|
||||
# Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars
|
||||
|
@ -77,10 +77,11 @@ docker {
|
||||
version "v${version}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include 'docker/monitoring/*'
|
||||
include "docker/${docker_dir}/*"
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -88,10 +88,11 @@ docker {
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||
files bootJar.outputs.files
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_repo}/*"
|
||||
include 'metadata-models/src/main/resources/*'
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -10,18 +10,20 @@ ext {
|
||||
docker_repo = 'datahub-ingestion-base'
|
||||
docker_dir = 'datahub-ingestion-base'
|
||||
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
|
||||
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
|
||||
|
||||
revision = 2 // increment to trigger rebuild
|
||||
}
|
||||
|
||||
docker {
|
||||
name "${docker_registry}/${docker_repo}:v${version}-${docker_target}"
|
||||
version "v${version}-${docker_target}"
|
||||
name "${docker_registry}/${docker_repo}:v${docker_version}"
|
||||
version "v${docker_version}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_dir}/*"
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
buildArgs([APP_ENV: docker_target])
|
||||
}
|
||||
|
@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh .
|
||||
|
||||
RUN pip install --no-cache --user ".[base]" && \
|
||||
pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
|
||||
pip install --no-cache --user ".[all]" && \
|
||||
./pyspark_jars.sh
|
||||
pip install --no-cache --user ".[all]"
|
||||
RUN ./pyspark_jars.sh
|
||||
|
||||
FROM base as full-install
|
||||
|
||||
|
@ -2,3 +2,10 @@
|
||||
[datahub-ingestion Docker build workflow](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml)
|
||||
|
||||
Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service.
|
||||
|
||||
## Slim vs Full Image Build
|
||||
|
||||
There are two versions of this image: `slim` and `full`. The `full` image includes pyspark and Oracle dependencies and is larger due to the Java dependencies.
|
||||
|
||||
By default, the standard build produces the `slim` image, which does not include pyspark. To build the full
|
||||
image with pyspark, pass the project property `-PdockerTarget=full` to the Gradle build.
|
||||
|
@ -9,6 +9,8 @@ ext {
|
||||
docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry
|
||||
docker_repo = 'datahub-ingestion'
|
||||
docker_dir = 'datahub-ingestion'
|
||||
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
|
||||
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
|
||||
|
||||
revision = 2 // increment to trigger rebuild
|
||||
}
|
||||
@ -19,21 +21,19 @@ dependencies {
|
||||
}
|
||||
|
||||
docker {
|
||||
name "${docker_registry}/${docker_repo}:v${version}-slim"
|
||||
version "v${version}-slim"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only")
|
||||
name "${docker_registry}/${docker_repo}:v${docker_version}"
|
||||
version "v${docker_version}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_dir}/*"
|
||||
include "metadata-ingestion/**"
|
||||
include "metadata-ingestion-modules/**"
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() ||
|
||||
i.file == buildDir ||
|
||||
i.file == project(':metadata-ingestion').buildDir ||
|
||||
i.file == project(':metadata-ingestion-modules').buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
buildArgs([DOCKER_VERSION: version,
|
||||
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')])
|
||||
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')])
|
||||
}
|
||||
tasks.getByName('docker').dependsOn(['build',
|
||||
':docker:datahub-ingestion-base:docker',
|
||||
|
@ -2,21 +2,33 @@
|
||||
|
||||
set -ex
|
||||
|
||||
HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}"
|
||||
ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
|
||||
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"
|
||||
|
||||
# Remove conflicting versions
|
||||
echo "Removing version conflicts from $PYSPARK_JARS"
|
||||
CONFLICTS="zookeeper hadoop- slf4j-"
|
||||
for jar in $CONFLICTS; do
|
||||
rm "$PYSPARK_JARS/$jar"*.jar
|
||||
done
|
||||
function replace_jar {
|
||||
JAR_PREFIX=$1
|
||||
TRANSITIVE=$2
|
||||
DEPENDENCY=$3
|
||||
|
||||
# Fetch dependencies
|
||||
mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY"
|
||||
mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY"
|
||||
echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
|
||||
ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
|
||||
rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
|
||||
rm -r "$HOME/.m2" || true
|
||||
|
||||
# Move to pyspark location
|
||||
echo "Moving jars to $PYSPARK_JARS"
|
||||
find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \;
|
||||
if [ ! -z "$DEPENDENCY" ]; then
|
||||
echo "Resolving $DEPENDENCY"
|
||||
mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null
|
||||
|
||||
echo "Moving jars to $PYSPARK_JARS"
|
||||
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
|
||||
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
|
||||
fi
|
||||
}
|
||||
|
||||
replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
|
||||
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
|
||||
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
|
||||
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
|
||||
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
|
||||
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
|
||||
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
|
||||
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"
|
||||
|
@ -15,10 +15,11 @@ docker {
|
||||
version "v${version}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_dir}/*"
|
||||
include "metadata-service/restli-servlet-impl/src/main/resources/index/**"
|
||||
include 'metadata-service/restli-servlet-impl/src/main/resources/index/**'
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -15,9 +15,10 @@ docker {
|
||||
version "v${version}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_dir}/*"
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -16,9 +16,10 @@ docker {
|
||||
version "v${version}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_dir}/*"
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -16,9 +16,10 @@ docker {
|
||||
version "v${version}"
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include "docker/${docker_dir}/*"
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -45,11 +45,12 @@ docker {
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||
files bootJar.outputs.files
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include 'docker/monitoring/*'
|
||||
include "docker/${docker_repo}/*"
|
||||
include 'metadata-models/src/main/resources/*'
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -56,11 +56,12 @@ docker {
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||
files bootJar.outputs.files
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include 'docker/monitoring/*'
|
||||
include "docker/${docker_repo}/*"
|
||||
include 'metadata-models/src/main/resources/*'
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
@ -70,11 +70,12 @@ docker {
|
||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||
files war.outputs.files
|
||||
files fileTree(rootProject.projectDir) {
|
||||
include '.dockerignore'
|
||||
include 'docker/monitoring/*'
|
||||
include "docker/${docker_repo}/*"
|
||||
include 'metadata-models/src/main/resources/*'
|
||||
}.exclude {
|
||||
i -> i.file.isHidden() || i.file == buildDir
|
||||
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||
}
|
||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user