mirror of
https://github.com/datahub-project/datahub.git
synced 2025-06-27 05:03:31 +00:00
fix(datahub-ingestion): prevent transitive deps, bump additional pyspa… (#9233)
This commit is contained in:
parent
3844b78fa2
commit
ff90fb633d
@ -1,6 +1,7 @@
|
|||||||
**/node_modules/
|
**/node_modules/
|
||||||
datahub-frontend/build/
|
*/build/
|
||||||
metadata-ingestion/venv/
|
*/*/build/
|
||||||
|
*/venv/
|
||||||
out
|
out
|
||||||
**/*.class
|
**/*.class
|
||||||
# Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars
|
# Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars
|
||||||
|
@ -77,10 +77,11 @@ docker {
|
|||||||
version "v${version}"
|
version "v${version}"
|
||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include 'docker/monitoring/*'
|
include 'docker/monitoring/*'
|
||||||
include "docker/${docker_dir}/*"
|
include "docker/${docker_dir}/*"
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -88,10 +88,11 @@ docker {
|
|||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||||
files bootJar.outputs.files
|
files bootJar.outputs.files
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include "docker/${docker_repo}/*"
|
include "docker/${docker_repo}/*"
|
||||||
include 'metadata-models/src/main/resources/*'
|
include 'metadata-models/src/main/resources/*'
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -10,18 +10,20 @@ ext {
|
|||||||
docker_repo = 'datahub-ingestion-base'
|
docker_repo = 'datahub-ingestion-base'
|
||||||
docker_dir = 'datahub-ingestion-base'
|
docker_dir = 'datahub-ingestion-base'
|
||||||
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
|
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
|
||||||
|
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
|
||||||
|
|
||||||
revision = 2 // increment to trigger rebuild
|
revision = 2 // increment to trigger rebuild
|
||||||
}
|
}
|
||||||
|
|
||||||
docker {
|
docker {
|
||||||
name "${docker_registry}/${docker_repo}:v${version}-${docker_target}"
|
name "${docker_registry}/${docker_repo}:v${docker_version}"
|
||||||
version "v${version}-${docker_target}"
|
version "v${docker_version}"
|
||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include "docker/${docker_dir}/*"
|
include "docker/${docker_dir}/*"
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
buildArgs([APP_ENV: docker_target])
|
buildArgs([APP_ENV: docker_target])
|
||||||
}
|
}
|
||||||
|
@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh .
|
|||||||
|
|
||||||
RUN pip install --no-cache --user ".[base]" && \
|
RUN pip install --no-cache --user ".[base]" && \
|
||||||
pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
|
pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
|
||||||
pip install --no-cache --user ".[all]" && \
|
pip install --no-cache --user ".[all]"
|
||||||
./pyspark_jars.sh
|
RUN ./pyspark_jars.sh
|
||||||
|
|
||||||
FROM base as full-install
|
FROM base as full-install
|
||||||
|
|
||||||
|
@ -2,3 +2,10 @@
|
|||||||
[](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml)
|
[](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml)
|
||||||
|
|
||||||
Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service.
|
Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service.
|
||||||
|
|
||||||
|
## Slim vs Full Image Build
|
||||||
|
|
||||||
|
There are two versions of this image. The full version includes PySpark and Oracle dependencies and is larger due to the Java dependencies.
|
||||||
|
|
||||||
|
By default, running the standard build produces the `slim` image, which does not include pyspark. In order to build the full
|
||||||
|
image with pyspark, pass the project property `-PdockerTarget=full`.
|
||||||
|
@ -9,6 +9,8 @@ ext {
|
|||||||
docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry
|
docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry
|
||||||
docker_repo = 'datahub-ingestion'
|
docker_repo = 'datahub-ingestion'
|
||||||
docker_dir = 'datahub-ingestion'
|
docker_dir = 'datahub-ingestion'
|
||||||
|
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
|
||||||
|
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
|
||||||
|
|
||||||
revision = 2 // increment to trigger rebuild
|
revision = 2 // increment to trigger rebuild
|
||||||
}
|
}
|
||||||
@ -19,21 +21,19 @@ dependencies {
|
|||||||
}
|
}
|
||||||
|
|
||||||
docker {
|
docker {
|
||||||
name "${docker_registry}/${docker_repo}:v${version}-slim"
|
name "${docker_registry}/${docker_repo}:v${docker_version}"
|
||||||
version "v${version}-slim"
|
version "v${docker_version}"
|
||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? "-slim-only" : ""}")
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include "docker/${docker_dir}/*"
|
include "docker/${docker_dir}/*"
|
||||||
include "metadata-ingestion/**"
|
include "metadata-ingestion/**"
|
||||||
include "metadata-ingestion-modules/**"
|
include "metadata-ingestion-modules/**"
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() ||
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
i.file == buildDir ||
|
|
||||||
i.file == project(':metadata-ingestion').buildDir ||
|
|
||||||
i.file == project(':metadata-ingestion-modules').buildDir
|
|
||||||
}
|
}
|
||||||
buildArgs([DOCKER_VERSION: version,
|
buildArgs([DOCKER_VERSION: version,
|
||||||
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')])
|
RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')])
|
||||||
}
|
}
|
||||||
tasks.getByName('docker').dependsOn(['build',
|
tasks.getByName('docker').dependsOn(['build',
|
||||||
':docker:datahub-ingestion-base:docker',
|
':docker:datahub-ingestion-base:docker',
|
||||||
|
@ -2,21 +2,33 @@
|
|||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}"
|
|
||||||
ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
|
|
||||||
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"
|
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"
|
||||||
|
|
||||||
# Remove conflicting versions
|
function replace_jar {
|
||||||
echo "Removing version conflicts from $PYSPARK_JARS"
|
JAR_PREFIX=$1
|
||||||
CONFLICTS="zookeeper hadoop- slf4j-"
|
TRANSITIVE=$2
|
||||||
for jar in $CONFLICTS; do
|
DEPENDENCY=$3
|
||||||
rm "$PYSPARK_JARS/$jar"*.jar
|
|
||||||
done
|
|
||||||
|
|
||||||
# Fetch dependencies
|
echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar"
|
||||||
mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY"
|
ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
|
||||||
mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY"
|
rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true
|
||||||
|
rm -r "$HOME/.m2" || true
|
||||||
|
|
||||||
# Move to pyspark location
|
if [ ! -z "$DEPENDENCY" ]; then
|
||||||
echo "Moving jars to $PYSPARK_JARS"
|
echo "Resolving $DEPENDENCY"
|
||||||
find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \;
|
mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null
|
||||||
|
|
||||||
|
echo "Moving jars to $PYSPARK_JARS"
|
||||||
|
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \;
|
||||||
|
find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \;
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}"
|
||||||
|
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}"
|
||||||
|
replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}"
|
||||||
|
replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}"
|
||||||
|
replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}"
|
||||||
|
replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}"
|
||||||
|
replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}"
|
||||||
|
replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}"
|
||||||
|
@ -15,10 +15,11 @@ docker {
|
|||||||
version "v${version}"
|
version "v${version}"
|
||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include "docker/${docker_dir}/*"
|
include "docker/${docker_dir}/*"
|
||||||
include "metadata-service/restli-servlet-impl/src/main/resources/index/**"
|
include 'metadata-service/restli-servlet-impl/src/main/resources/index/**'
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -15,9 +15,10 @@ docker {
|
|||||||
version "v${version}"
|
version "v${version}"
|
||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include "docker/${docker_dir}/*"
|
include "docker/${docker_dir}/*"
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -16,9 +16,10 @@ docker {
|
|||||||
version "v${version}"
|
version "v${version}"
|
||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include "docker/${docker_dir}/*"
|
include "docker/${docker_dir}/*"
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -16,9 +16,10 @@ docker {
|
|||||||
version "v${version}"
|
version "v${version}"
|
||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile")
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include "docker/${docker_dir}/*"
|
include "docker/${docker_dir}/*"
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -45,11 +45,12 @@ docker {
|
|||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||||
files bootJar.outputs.files
|
files bootJar.outputs.files
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include 'docker/monitoring/*'
|
include 'docker/monitoring/*'
|
||||||
include "docker/${docker_repo}/*"
|
include "docker/${docker_repo}/*"
|
||||||
include 'metadata-models/src/main/resources/*'
|
include 'metadata-models/src/main/resources/*'
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -56,11 +56,12 @@ docker {
|
|||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||||
files bootJar.outputs.files
|
files bootJar.outputs.files
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include 'docker/monitoring/*'
|
include 'docker/monitoring/*'
|
||||||
include "docker/${docker_repo}/*"
|
include "docker/${docker_repo}/*"
|
||||||
include 'metadata-models/src/main/resources/*'
|
include 'metadata-models/src/main/resources/*'
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
@ -70,11 +70,12 @@ docker {
|
|||||||
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")
|
||||||
files war.outputs.files
|
files war.outputs.files
|
||||||
files fileTree(rootProject.projectDir) {
|
files fileTree(rootProject.projectDir) {
|
||||||
|
include '.dockerignore'
|
||||||
include 'docker/monitoring/*'
|
include 'docker/monitoring/*'
|
||||||
include "docker/${docker_repo}/*"
|
include "docker/${docker_repo}/*"
|
||||||
include 'metadata-models/src/main/resources/*'
|
include 'metadata-models/src/main/resources/*'
|
||||||
}.exclude {
|
}.exclude {
|
||||||
i -> i.file.isHidden() || i.file == buildDir
|
i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
|
||||||
}
|
}
|
||||||
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
tag("Debug", "${docker_registry}/${docker_repo}:debug")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user