diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 17f5c9dee1..c8ca5f9c46 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -33,8 +33,7 @@ jobs:
         # running build first without datahub-web-react:yarnBuild and then with it is 100% stable
         # datahub-frontend:unzipAssets depends on datahub-web-react:yarnBuild but gradle does not know about it
         run: |
-          ./gradlew :metadata-integration:java:spark-lineage:build
-          ./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x datahub-web-react:yarnBuild -x datahub-frontend:unzipAssets -x :metadata-integration:java:spark-lineage:test
+          ./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x datahub-web-react:yarnBuild -x datahub-frontend:unzipAssets
           ./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x :metadata-integration:java:spark-lineage:test
       - uses: actions/upload-artifact@v2
         if: always()
diff --git a/build.gradle b/build.gradle
index aff9b30306..ec1bc3c491 100644
--- a/build.gradle
+++ b/build.gradle
@@ -121,9 +121,9 @@ project.ext.externalDependency = [
     'springKafka': 'org.springframework.kafka:spring-kafka:2.2.14.RELEASE',
     'springActuator': 'org.springframework.boot:spring-boot-starter-actuator:2.1.4.RELEASE',
     'testng': 'org.testng:testng:7.3.0',
-    'testContainers': 'org.testcontainers:testcontainers:1.15.1',
-    'testContainersJunit': 'org.testcontainers:junit-jupiter:1.15.1',
-    'testContainersPostgresql':'org.testcontainers:postgresql:1.2.0',
+    'testContainers': 'org.testcontainers:testcontainers:1.15.3',
+    'testContainersJunit': 'org.testcontainers:junit-jupiter:1.15.3',
+    'testContainersPostgresql':'org.testcontainers:postgresql:1.15.3',
     'testContainersElasticsearch': 'org.testcontainers:elasticsearch:1.15.3',
     'wiremock':'com.github.tomakehurst:wiremock:2.10.0',
     'zookeeper': 'org.apache.zookeeper:zookeeper:3.4.14'
@@ -135,14 +135,19 @@ allprojects {
   apply plugin: 'checkstyle'
 }
 
-subprojects {
-  apply plugin: 'maven'
+configure(subprojects.findAll {it.name != 'spark-lineage'}) {
 
   configurations.all {
     exclude group: "io.netty", module: "netty"
     exclude group: "log4j", module: "log4j"
   }
+}
+
+subprojects {
+
+  apply plugin: 'maven'
+
   plugins.withType(JavaPlugin) {
     dependencies {
       testCompile externalDependency.testng
diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle
index 4968ced847..754402f836 100644
--- a/metadata-integration/java/datahub-client/build.gradle
+++ b/metadata-integration/java/datahub-client/build.gradle
@@ -2,13 +2,16 @@ apply plugin: 'java'
 apply plugin: 'com.github.johnrengelman.shadow'
 apply plugin: 'jacoco'
 
+jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation
+
 dependencies {
-  compile project(':metadata-models')
-  compile externalDependency.httpAsyncClient
-  compile externalDependency.jacksonDataBind
+  implementation project(':metadata-models')
+  shadow externalDependency.httpAsyncClient // we want our clients to provide this
+  implementation externalDependency.jacksonDataBind
   compileOnly externalDependency.lombok
   annotationProcessor externalDependency.lombok
 
+  testCompile externalDependency.httpAsyncClient // needed as shadow excludes it
   testCompile externalDependency.mockito
   testCompile externalDependency.mockServer
   testCompile externalDependency.mockServerClient
@@ -23,9 +26,39 @@ test {
   finalizedBy jacocoTestReport
 }
 
+task checkShadowJar(type: Exec) {
+  commandLine 'sh', '-c', 'scripts/check_jar.sh'
+}
+
+
 shadowJar {
   zip64=true
-  classifier=''
+  archiveClassifier = ''
+  dependencies {
+    exclude(dependency('org.apache.httpcomponents:httpasyncclient'))
+  }
+  mergeServiceFiles()
+  // we relocate namespaces manually, because we want to know exactly which libs we are exposing and why
+  // we can move to automatic relocation using ConfigureShadowRelocation after we get to a good place on these first
+  relocate 'com.fasterxml.jackson', 'datahub.shaded.jackson'
+  relocate 'net.jcip.annotations', 'datahub.shaded.annotations'
+  relocate 'javassist', 'datahub.shaded.javassist'
+  relocate 'edu.umd.cs.findbugs', 'datahub.shaded.findbugs'
+  relocate 'org.antlr', 'datahub.shaded.org.antlr'
+  relocate 'antlr', 'datahub.shaded.antlr'
+  relocate 'com.google.common', 'datahub.shaded.com.google.common'
+  relocate 'org.apache.commons', 'datahub.shaded.org.apache.commons'
+  relocate 'org.reflections', 'datahub.shaded.org.reflections'
+  relocate 'st4hidden', 'datahub.shaded.st4hidden'
+  relocate 'org.stringtemplate', 'datahub.shaded.org.stringtemplate'
+  relocate 'org.abego.treelayout', 'datahub.shaded.treelayout'
+  relocate 'org.slf4j', 'datahub.shaded.slf4j'
+  relocate 'javax.annotation', 'datahub.shaded.javax.annotation'
+  finalizedBy checkShadowJar
+}
+
+checkShadowJar {
+  dependsOn shadowJar
 }
 
 assemble {
diff --git a/metadata-integration/java/datahub-client/scripts/check_jar.sh b/metadata-integration/java/datahub-client/scripts/check_jar.sh
new file mode 100755
index 0000000000..4051e26f63
--- /dev/null
+++ b/metadata-integration/java/datahub-client/scripts/check_jar.sh
@@ -0,0 +1,9 @@
+# This script checks the shadow jar to ensure that we only have allowed classes being exposed through the jar
+jar -tvf build/libs/datahub-client.jar | grep -v "datahub/shaded" | grep -v "META-INF" | grep -v "com/linkedin" | grep -v "com/datahub" | grep -v "datahub" | grep -v "entity-registry" | grep -v "pegasus" | grep -v "legacyPegasusSchemas/" | grep -v " com/$"
+if [ $? -ne 0 ]; then
+  echo "No other packages found. Great"
+  exit 0
+else
+  echo "Found packages other than what we were expecting"
+  exit 1
+fi
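
The relocate rules above move every third-party class bundled into datahub-client.jar under the datahub.shaded.* prefix, which is exactly what scripts/check_jar.sh asserts about the jar contents; only httpasyncclient is kept out of the jar and surfaced through the shadow configuration, so consumers are expected to supply their own copy (and the test configuration adds it back explicitly for the same reason). The probe below is an illustrative sketch only: it assumes the shadow jar is on the classpath, and the relocated class name is derived from the relocate 'com.fasterxml.jackson', 'datahub.shaded.jackson' rule; ShadedJarProbe is a hypothetical helper, not part of the repository.

    // Illustrative sketch, assuming datahub-client.jar (the shadow jar) is on the classpath.
    // The class name follows the relocate rule 'com.fasterxml.jackson' -> 'datahub.shaded.jackson';
    // ShadedJarProbe is a hypothetical name, not part of the codebase.
    public class ShadedJarProbe {
      public static void main(String[] args) {
        try {
          // Jackson bundled inside the shadow jar resolves under the datahub.shaded.* prefix,
          // so it cannot clash with a consumer's own com.fasterxml.jackson classes.
          Class<?> relocated = Class.forName("datahub.shaded.jackson.databind.ObjectMapper");
          System.out.println("Found relocated Jackson: " + relocated.getName());
        } catch (ClassNotFoundException e) {
          System.out.println("Shadow jar not on the classpath, or the relocation prefix changed");
        }
      }
    }
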
diff --git a/metadata-integration/java/spark-lineage/build.gradle b/metadata-integration/java/spark-lineage/build.gradle
index 1d04d534d8..14a7b84948 100644
--- a/metadata-integration/java/spark-lineage/build.gradle
+++ b/metadata-integration/java/spark-lineage/build.gradle
@@ -22,18 +22,16 @@ dependencies {
   compileOnly externalDependency.lombok
   annotationProcessor externalDependency.lombok
 
-  implementation(project(':metadata-integration:java:datahub-client')) {
-    exclude group: "org.antlr"
-    exclude group: "com.google.guava" // causes issues with Guava Stopwatch constructor
-    exclude group: "com.fasterxml.jackson.core"
-  }
+  implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow')
+
+
+  implementation(externalDependency.sparkSql)
+  implementation(externalDependency.sparkHive)
+
+  // Tests need a concrete log4j available. Providing it here
+  testImplementation 'org.apache.logging.log4j:log4j-api:2.17.1'
+  testImplementation 'org.apache.logging.log4j:log4j-core:2.17.1'
 
-  implementation(externalDependency.sparkSql){
-    exclude group: "org.apache.hadoop"
-  }
-  implementation(externalDependency.sparkHive){
-    exclude group: "org.apache.hadoop"
-  }
 
   testImplementation(externalDependency.postgresql){
     exclude group: "com.fasterxml.jackson.core"
@@ -48,7 +46,7 @@ dependencies {
     exclude group: "com.fasterxml.jackson.core"
   }
   // older version to allow older guava
-  testImplementation(externalDependency.testContainersPostgresql) // older version to allow older jackson
+  testImplementation(externalDependency.testContainersPostgresql)
 }
 
diff --git a/metadata-integration/java/spark-lineage/src/main/java/com/linkedin/datahub/lineage/spark/model/dataset/JdbcDataset.java b/metadata-integration/java/spark-lineage/src/main/java/com/linkedin/datahub/lineage/spark/model/dataset/JdbcDataset.java
index 6d33578743..9582b0a4f2 100644
--- a/metadata-integration/java/spark-lineage/src/main/java/com/linkedin/datahub/lineage/spark/model/dataset/JdbcDataset.java
+++ b/metadata-integration/java/spark-lineage/src/main/java/com/linkedin/datahub/lineage/spark/model/dataset/JdbcDataset.java
@@ -32,6 +32,7 @@ public class JdbcDataset implements SparkDataset {
     url = url.replaceFirst("jdbc:", "");
     if (url.contains("postgres")) {
       url = url.substring(url.lastIndexOf('/') + 1);
+      url = url.substring(0, url.indexOf('?'));
     }
     // TODO different DBs have different formats. TBD mapping to data source names
     return url + "." + tbl;
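
The new substring call in JdbcDataset trims connection options (for example ?sslmode=require) off postgres JDBC URLs so they do not leak into the dataset name. One caveat worth noting: String.indexOf('?') returns -1 when the URL carries no query string, and substring(0, -1) then throws StringIndexOutOfBoundsException. The sketch below is a minimal guarded variant under that assumption; JdbcUrlTrim and stripQueryString are illustrative names, not the committed change.

    // Minimal sketch of a guarded variant: trims the query string only when one is present,
    // so a plain value such as "mydb" (no '?') is returned unchanged instead of throwing.
    // JdbcUrlTrim and stripQueryString are illustrative names, not part of JdbcDataset.
    public class JdbcUrlTrim {
      static String stripQueryString(String url) {
        int queryStart = url.indexOf('?');
        return queryStart >= 0 ? url.substring(0, queryStart) : url;
      }

      public static void main(String[] args) {
        System.out.println(stripQueryString("mydb?sslmode=require")); // prints "mydb"
        System.out.println(stripQueryString("mydb"));                 // prints "mydb", no exception
      }
    }
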