fix(datahub-ingestion): prevent transitive deps, bump additional pyspa… (#9233)

This commit is contained in:
david-leifker 2023-11-13 16:26:53 -06:00 committed by GitHub
parent 3844b78fa2
commit ff90fb633d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 70 additions and 39 deletions

View File

@ -1,6 +1,7 @@
**/node_modules/ **/node_modules/
datahub-frontend/build/ */build/
metadata-ingestion/venv/ */*/build/
*/venv/
out out
**/*.class **/*.class
# Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars # Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars

View File

@ -77,10 +77,11 @@ docker {
version "v${version}" version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*' include 'docker/monitoring/*'
include "docker/${docker_dir}/*" include "docker/${docker_dir}/*"
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -88,10 +88,11 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files files bootJar.outputs.files
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_repo}/*" include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*' include 'metadata-models/src/main/resources/*'
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -10,18 +10,20 @@ ext {
docker_repo = 'datahub-ingestion-base' docker_repo = 'datahub-ingestion-base'
docker_dir = 'datahub-ingestion-base' docker_dir = 'datahub-ingestion-base'
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
revision = 2 // increment to trigger rebuild revision = 2 // increment to trigger rebuild
} }
docker { docker {
name "${docker_registry}/${docker_repo}:v${version}-${docker_target}" name "${docker_registry}/${docker_repo}:v${docker_version}"
version "v${version}-${docker_target}" version "v${docker_version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*" include "docker/${docker_dir}/*"
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
buildArgs([APP_ENV: docker_target]) buildArgs([APP_ENV: docker_target])
} }

View File

@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh .
RUN pip install --no-cache --user ".[base]" && \ RUN pip install --no-cache --user ".[base]" && \
pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \ pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
pip install --no-cache --user ".[all]" && \ pip install --no-cache --user ".[all]"
./pyspark_jars.sh RUN ./pyspark_jars.sh
FROM base as full-install FROM base as full-install

View File

@ -2,3 +2,10 @@
[![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml) [![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml)
Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service. Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service.
## Slim vs Full Image Build
There are two versions of this image. One (the full image) includes pyspark and Oracle dependencies and is larger due to the Java dependencies.
By default, running the standard build generates the `slim` image, which does not include pyspark. To build the full
image with pyspark, pass the project property `-PdockerTarget=full`.

View File

@ -9,6 +9,8 @@ ext {
docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry
docker_repo = 'datahub-ingestion' docker_repo = 'datahub-ingestion'
docker_dir = 'datahub-ingestion' docker_dir = 'datahub-ingestion'
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
revision = 2 // increment to trigger rebuild revision = 2 // increment to trigger rebuild
} }
@ -19,21 +21,19 @@ dependencies {
} }
docker { docker {
name "${docker_registry}/${docker_repo}:v${version}-slim" name "${docker_registry}/${docker_repo}:v${docker_version}"
version "v${version}-slim" version "v${docker_version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only") dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}")
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*" include "docker/${docker_dir}/*"
include "metadata-ingestion/**" include "metadata-ingestion/**"
include "metadata-ingestion-modules/**" include "metadata-ingestion-modules/**"
}.exclude { }.exclude {
i -> i.file.isHidden() || i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
i.file == buildDir ||
i.file == project(':metadata-ingestion').buildDir ||
i.file == project(':metadata-ingestion-modules').buildDir
} }
buildArgs([DOCKER_VERSION: version, buildArgs([DOCKER_VERSION: version,
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')]) RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')])
} }
tasks.getByName('docker').dependsOn(['build', tasks.getByName('docker').dependsOn(['build',
':docker:datahub-ingestion-base:docker', ':docker:datahub-ingestion-base:docker',

View File

@ -2,21 +2,33 @@
set -ex set -ex
HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}"
ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars" PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"
# Remove conflicting versions function replace_jar {
echo "Removing version conflicts from $PYSPARK_JARS" JAR_PREFIX=$1
CONFLICTS="zookeeper hadoop- slf4j-" TRANSITIVE=$2
for jar in $CONFLICTS; do DEPENDENCY=$3
rm "$PYSPARK_JARS/$jar"*.jar
done
# Fetch dependencies echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY" ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY" rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
rm -r "$HOME/.m2" || true
# Move to pyspark location if [ ! -z "$DEPENDENCY" ]; then
echo "Moving jars to $PYSPARK_JARS" echo "Resolving $DEPENDENCY"
find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \; mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null
echo "Moving jars to $PYSPARK_JARS"
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
fi
}
replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"

View File

@ -15,10 +15,11 @@ docker {
version "v${version}" version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*" include "docker/${docker_dir}/*"
include "metadata-service/restli-servlet-impl/src/main/resources/index/**" include 'metadata-service/restli-servlet-impl/src/main/resources/index/**'
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -15,9 +15,10 @@ docker {
version "v${version}" version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*" include "docker/${docker_dir}/*"
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -16,9 +16,10 @@ docker {
version "v${version}" version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*" include "docker/${docker_dir}/*"
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -16,9 +16,10 @@ docker {
version "v${version}" version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*" include "docker/${docker_dir}/*"
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -45,11 +45,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files files bootJar.outputs.files
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*' include 'docker/monitoring/*'
include "docker/${docker_repo}/*" include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*' include 'metadata-models/src/main/resources/*'
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -56,11 +56,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files files bootJar.outputs.files
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*' include 'docker/monitoring/*'
include "docker/${docker_repo}/*" include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*' include 'metadata-models/src/main/resources/*'
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -70,11 +70,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files war.outputs.files files war.outputs.files
files fileTree(rootProject.projectDir) { files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*' include 'docker/monitoring/*'
include "docker/${docker_repo}/*" include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*' include 'metadata-models/src/main/resources/*'
}.exclude { }.exclude {
i -> i.file.isHidden() || i.file == buildDir i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
} }
tag("Debug", "${docker_registry}/${docker_repo}:debug") tag("Debug", "${docker_registry}/${docker_repo}:debug")