fix(datahub-ingestion): prevent transitive deps, bump additional pyspa… (#9233)

This commit is contained in:
david-leifker 2023-11-13 16:26:53 -06:00 committed by GitHub
parent 3844b78fa2
commit ff90fb633d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 70 additions and 39 deletions

View File

@ -1,6 +1,7 @@
**/node_modules/
datahub-frontend/build/
metadata-ingestion/venv/
*/build/
*/*/build/
*/venv/
out
**/*.class
# Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars

View File

@ -77,10 +77,11 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -88,10 +88,11 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -10,18 +10,20 @@ ext {
docker_repo = 'datahub-ingestion-base'
docker_dir = 'datahub-ingestion-base'
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
revision = 2 // increment to trigger rebuild
}
docker {
name "${docker_registry}/${docker_repo}:v${version}-${docker_target}"
version "v${version}-${docker_target}"
name "${docker_registry}/${docker_repo}:v${docker_version}"
version "v${docker_version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
buildArgs([APP_ENV: docker_target])
}

View File

@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh .
RUN pip install --no-cache --user ".[base]" && \
pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
pip install --no-cache --user ".[all]" && \
./pyspark_jars.sh
pip install --no-cache --user ".[all]"
RUN ./pyspark_jars.sh
FROM base as full-install

View File

@ -2,3 +2,10 @@
[![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml)
Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service.
## Slim vs Full Image Build
There are two versions of this image: `slim` and `full`. The `full` image includes PySpark and Oracle dependencies and is larger due to the Java dependencies.
The standard build produces the `slim` image, which does not include PySpark. To build the `full`
image with PySpark, pass the project property `-PdockerTarget=full` to the build.

View File

@ -9,6 +9,8 @@ ext {
docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry
docker_repo = 'datahub-ingestion'
docker_dir = 'datahub-ingestion'
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
revision = 2 // increment to trigger rebuild
}
@ -19,21 +21,19 @@ dependencies {
}
docker {
name "${docker_registry}/${docker_repo}:v${version}-slim"
version "v${version}-slim"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only")
name "${docker_registry}/${docker_repo}:v${docker_version}"
version "v${docker_version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
include "metadata-ingestion/**"
include "metadata-ingestion-modules/**"
}.exclude {
i -> i.file.isHidden() ||
i.file == buildDir ||
i.file == project(':metadata-ingestion').buildDir ||
i.file == project(':metadata-ingestion-modules').buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
buildArgs([DOCKER_VERSION: version,
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')])
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')])
}
tasks.getByName('docker').dependsOn(['build',
':docker:datahub-ingestion-base:docker',

View File

@ -2,21 +2,33 @@
set -ex
HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}"
ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"
# Remove conflicting versions
echo "Removing version conflicts from $PYSPARK_JARS"
CONFLICTS="zookeeper hadoop- slf4j-"
for jar in $CONFLICTS; do
rm "$PYSPARK_JARS/$jar"*.jar
done
function replace_jar {
JAR_PREFIX=$1
TRANSITIVE=$2
DEPENDENCY=$3
# Fetch dependencies
mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY"
mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY"
echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
rm -r "$HOME/.m2" || true
if [ ! -z "$DEPENDENCY" ]; then
echo "Resolving $DEPENDENCY"
mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null
# Move to pyspark location
echo "Moving jars to $PYSPARK_JARS"
find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \;
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
fi
}
replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"

View File

@ -15,10 +15,11 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
include "metadata-service/restli-servlet-impl/src/main/resources/index/**"
include 'metadata-service/restli-servlet-impl/src/main/resources/index/**'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -15,9 +15,10 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -16,9 +16,10 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -16,9 +16,10 @@ docker {
version "v${version}"
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include "docker/${docker_dir}/*"
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -45,11 +45,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -56,11 +56,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files bootJar.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")

View File

@ -70,11 +70,12 @@ docker {
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
files war.outputs.files
files fileTree(rootProject.projectDir) {
include '.dockerignore'
include 'docker/monitoring/*'
include "docker/${docker_repo}/*"
include 'metadata-models/src/main/resources/*'
}.exclude {
i -> i.file.isHidden() || i.file == buildDir
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
}
tag("Debug", "${docker_registry}/${docker_repo}:debug")