2021-06-03 13:24:33 -07:00
plugins {
id 'org.springframework.boot'
id 'java'
2022-12-26 10:09:08 -06:00
}
2023-01-31 18:44:37 -06:00
apply from: "../gradle/versioning/versioning.gradle"
2024-12-03 06:57:43 +05:30
apply from: "../gradle/coverage/java-coverage.gradle"
2025-04-02 11:51:10 +05:30
apply from: "../gradle/docker/docker.gradle"
2023-01-31 18:44:37 -06:00
2022-12-26 10:09:08 -06:00
// Docker image coordinates for this module.
ext {
    // When the root project's registry is 'linkedin', images are published under 'acryldata' instead;
    // any other value is taken from the inherited docker_registry property unchanged.
    def rootRegistry = rootProject.ext.docker_registry
    docker_registry = (rootRegistry == 'linkedin') ? 'acryldata' : docker_registry
    docker_repo = 'datahub-upgrade'
}
// Dependency declarations for the datahub-upgrade module.
// Declaration order is kept as-is; only formatting and comments differ.
dependencies {
    // Sibling project modules
    implementation(project(':metadata-io'))
    implementation(project(':metadata-service:factories'))
    implementation(project(':metadata-service:restli-client-api'))
    implementation(project(':metadata-service:configuration'))
    implementation(project(':metadata-dao-impl:kafka-producer'))
    implementation(project(':metadata-utils'))

    // Third-party libraries resolved through the shared externalDependency catalog
    implementation(externalDependency.charle)
    implementation(externalDependency.mustache)
    implementation(externalDependency.javaxInject)
    implementation(externalDependency.springActuator)

    // Hadoop client with selected transitive modules excluded
    implementation(externalDependency.hadoopClient) {
        exclude(group: 'net.minidev', module: 'json-smart')
        exclude(group: 'com.nimbusds', module: 'nimbus-jose-jwt')
        exclude(group: "org.apache.htrace", module: "htrace-core4")
        exclude(group: "org.eclipse.jetty")
        exclude(group: "org.apache.hadoop.thirdparty", module: "hadoop-shaded-protobuf_3_7")
        exclude(group: "com.charleskorn.kaml", module: "kaml")
        exclude(group: "org.apache.kerby", module: "kerb-simplekdc")
    }

    // Version constraints; each one records its reason via because(...)
    constraints {
        implementation(externalDependency.hadoopCommon3) {
            because("previous versions are vulnerable to CVE-2021-37404")
        }
        implementation(externalDependency.snakeYaml) {
            because("previous versions are vulnerable to CVE-2022-25857")
        }
        implementation(externalDependency.woodstoxCore) {
            because("previous versions are vulnerable to CVE-2022-40151-2")
        }
        implementation(externalDependency.jettison) {
            because("previous versions are vulnerable")
        }
        implementation(externalDependency.guava) {
            because("CVE-2023-2976")
        }
        implementation('io.airlift:aircompressor:0.27') {
            because("CVE-2024-36114")
        }
        implementation('dnsjava:dnsjava:3.6.1') {
            because("CVE-2024-25638")
        }
        implementation('commons-beanutils:commons-beanutils:1.11.0') {
            because("CVE-2025-48734")
        }
        implementation('io.netty:netty-codec-smtp:4.1.128.Final') {
            because("CVE: GHSA-jq43-27x9-3v86 - Netty SMTP Command Injection")
        }
    }

    // mock internal schema registry
    implementation(externalDependency.kafkaAvroSerde)
    implementation(externalDependency.kafkaAvroSerializer)

    implementation(externalDependency.slf4jApi)
    compileOnly(externalDependency.lombok)
    implementation(externalDependency.picocli)
    implementation(externalDependency.resilience4j)
    implementation(externalDependency.parquet)
    implementation(externalDependency.protobuf)
    implementation(externalDependency.springBeans)
    implementation(externalDependency.springBootAutoconfigure)
    implementation(externalDependency.springCore)
    implementation(externalDependency.springKafka)
    implementation(externalDependency.kafkaClients)

    // OpenTelemetry exporters (runtime classpath only)
    runtimeOnly(externalDependency.opentelemetryExporter)
    runtimeOnly(externalDependency.openTelemetryExporterLogging)
    runtimeOnly(externalDependency.openTelemetryExporterCommon)

    // Logging backend and JDBC drivers (runtime classpath only)
    runtimeOnly(externalDependency.logbackClassic)
    runtimeOnly(externalDependency.mariadbConnector)
    runtimeOnly(externalDependency.mysqlConnector)
    runtimeOnly(externalDependency.postgresql)

    // Cloud authentication libraries
    implementation(externalDependency.awsMskIamAuth)
    implementation(externalDependency.azureIdentityExtensions)
    implementation(externalDependency.azureIdentity)

    // Jackson: versions come from the BOM platform
    implementation(platform(externalDependency.jacksonBom))
    implementation(externalDependency.jacksonJsr310)

    annotationProcessor(externalDependency.lombok)
    annotationProcessor(externalDependency.picocli)

    // Test-only dependencies
    testImplementation(externalDependency.springBootTest)
    testImplementation(externalDependency.mockito)
    testImplementation(externalDependency.testng)
    testImplementation('uk.org.webcompere:system-stubs-testng:2.1.7')
    testRuntimeOnly(externalDependency.logbackClassic)

    testImplementation(externalDependency.h2)
    testImplementation(testFixtures(project(':metadata-io')))

    constraints {
        implementation(externalDependency.parquetHadoop) {
            because("CVE-2022-42003")
        }
    }
}
// Executable jar: sets the application entry point and names the archive after the project.
bootJar {
    mainClass = 'com.linkedin.datahub.upgrade.UpgradeCliApplication'
    // Produces "<project name>.jar" with no version suffix.
    archiveFileName = project.name + ".jar"
}
2024-06-17 19:53:54 -05:00
// Runs the application via the bootRun task with the SystemUpdate upgrade selected ("-u SystemUpdate").
bootRun {
    environment([
        "ENTITY_REGISTRY_CONFIG_PATH"                 : "../metadata-models/src/main/resources/entity-registry.yml",
        "ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE"  : "true",
        "ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX": "true",
        "SERVER_PORT"                                 : "8083",
    ])
    // Append to (rather than replace) any arguments already configured on the task.
    args += ["-u", "SystemUpdate"]
}
/**
 * Runs SystemUpdate on locally running system.
 *
 * Launches the boot jar with `java -jar` and a JDWP debug agent on port 5003
 * (server=y, suspend=n), with the server port set to 8083.
 */
task run(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the datahub-upgrade SystemUpdate process locally."

    environment([
        "ENTITY_REGISTRY_CONFIG_PATH"                 : "../metadata-models/src/main/resources/entity-registry.yml",
        "ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE"  : "true",
        "ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX": "true",
        "SCHEMA_REGISTRY_TYPE"                        : "INTERNAL",
        "SQL_SETUP_ENABLED"                           : "true",
    ])

    def upgradeJar = bootJar.archiveFile.get()
    commandLine([
        "java",
        "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
        "-jar",
        "-Dserver.port=8083",
        upgradeJar,
        "-u", "SystemUpdate",
    ])
}
2025-07-28 19:58:36 +05:30
/**
 * Runs the non-blocking system updates locally (includes lineage index fields backfill).
 * Sets up environment variables for local execution and launches the boot jar with
 * "-u SystemUpdateNonBlocking".
 */
task runNonBlocking(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the non-blocking system updates locally (includes lineage index fields backfill)."

    environment([
        "ENTITY_REGISTRY_CONFIG_PATH"                   : "../metadata-models/src/main/resources/entity-registry.yml",
        "ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE"    : "true",
        "ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX"  : "true",
        "BOOTSTRAP_SYSTEM_UPDATE_EXECUTOR_POOLS_ENABLED": "false",
        "SCHEMA_REGISTRY_TYPE"                          : "INTERNAL",
    ])

    def upgradeJar = bootJar.archiveFile.get()
    commandLine([
        "java",
        // JDWP debug agent: port 5003, does not wait for a debugger to attach.
        "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
        "-jar",
        "-Dserver.port=8083",
        upgradeJar,
        "-u", "SystemUpdateNonBlocking",
    ])
}
2025-05-16 19:59:46 +02:00
// Launches the boot jar with "-u SystemUpdateCron" and a JDWP debug agent on port 5003.
task runCron(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the datahub-upgrade SystemUpdate CRON process locally."

    environment([
        "ENTITY_REGISTRY_CONFIG_PATH"                 : "../metadata-models/src/main/resources/entity-registry.yml",
        "ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE"  : "true",
        "ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX": "true",
    ])

    def upgradeJar = bootJar.archiveFile.get()
    commandLine([
        "java",
        "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
        "-jar",
        "-Dserver.port=8083",
        upgradeJar,
        "-u", "SystemUpdateCron",
    ])
}
// Same launch as the SystemUpdateCron run, but passes "-a dryRun=true" to the upgrade.
task runCronDryRun(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    // The description names the dry-run mode so this task is distinguishable in `gradle tasks` output.
    description = "Run the datahub-upgrade SystemUpdate CRON process locally in dry-run mode (dryRun=true)."
    environment "ENTITY_REGISTRY_CONFIG_PATH", "../metadata-models/src/main/resources/entity-registry.yml"
    environment "ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE", "true"
    environment "ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX", "true"
    commandLine "java",
            // JDWP debug agent: port 5003, does not wait for a debugger to attach.
            "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
            "-jar",
            "-Dserver.port=8083", bootJar.getArchiveFile().get(), "-u", "SystemUpdateCron", "-a", "dryRun=true"
}
2025-06-04 19:56:10 +02:00
/**
 * Runs the ReindexDebug upgrade on locally running system.
 * This is useful for debugging or special situations to reindex. The index to reindex is
 * passed to the upgrade as the "-a index=<name>" argument.
 *
 * Optional parameters:
 * - index: Name of the index to reindex (default: datahubpolicyindex_v2)
 *
 * Usage examples:
 *   ./gradlew runReindexDebug
 *   ./gradlew runReindexDebug -Pindex=datahubpolicyindex_v2
 */
task runReindexDebug(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the datahub-upgrade SystemUpdate ReindexDebug"
    environment "ENTITY_REGISTRY_CONFIG_PATH", "../metadata-models/src/main/resources/entity-registry.yml"
    environment "ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE", "true"
    environment "ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX", "true"

    // Allow the target index to be chosen per invocation; without -Pindex the previous fixed value is used.
    def targetIndex = project.hasProperty('index') ? project.getProperty('index') : 'datahubpolicyindex_v2'

    commandLine "java",
            // JDWP debug agent: port 5003, does not wait for a debugger to attach.
            "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
            "-jar",
            "-Dserver.port=8083", bootJar.getArchiveFile().get(), "-u", "ReindexDebug", "-a", "index=${targetIndex}"
}
2024-09-20 10:33:37 -05:00
/**
 * Runs RestoreIndices on locally running system. The batchSize is set to
 * test the process with pagination and is not designed for optimal performance.
 */
task runRestoreIndices(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the restore indices process locally."

    environment([
        "ENTITY_REGISTRY_CONFIG_PATH": "../metadata-models/src/main/resources/entity-registry.yml",
    ])

    def upgradeJar = bootJar.archiveFile.get()
    commandLine([
        "java",
        "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
        "-jar",
        "-Dkafka.schemaRegistry.url=http://localhost:8080/schema-registry/api",
        "-Dserver.port=8083",
        upgradeJar,
        "-u", "RestoreIndices",
        "-a", "batchSize=100",
        "-a", "createDefaultAspects=true",
    ])
}
// Runs RestoreIndices locally with "-a urnBasedPagination=true" and a batch size of 100.
task runRestoreIndicesUrn(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    // The description names the pagination mode so this task is distinguishable in `gradle tasks` output.
    description = "Run the restore indices process locally using URN-based pagination."
    environment "ENTITY_REGISTRY_CONFIG_PATH", "../metadata-models/src/main/resources/entity-registry.yml"
    commandLine "java", "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
            "-jar",
            "-Dkafka.schemaRegistry.url=http://localhost:8080/schema-registry/api",
            "-Dserver.port=8083",
            bootJar.getArchiveFile().get(), "-u", "RestoreIndices", "-a", "batchSize=100", "-a", "urnBasedPagination=true"
}
2025-10-10 09:53:42 -05:00
/**
 * Runs LoadIndices on locally running system. This ensures indices have the correct mappings
 * and then loads data from database to Elasticsearch.
 *
 * The process includes two steps:
 * 1. BuildIndicesStep: Ensures indices have the correct mappings (creates/updates index structure)
 * 2. LoadIndicesStep: Loads data from database to Elasticsearch
 *
 * The task sets ES_BULK_REQUESTS_LIMIT to the batch size, ES_BULK_FLUSH_PERIOD to "300"
 * (intended as a 5 minute flush period; presumably seconds - confirm against the consumer of
 * this variable) and ELASTICSEARCH_THREAD_COUNT to the configured thread count.
 *
 * Optional parameters:
 * - limit: Maximum number of records to process (default: no limit)
 * - batchSize: Number of records per batch (default: 10000)
 * - esThreadCount: Elasticsearch I/O thread count (default: 4, enables async bulk processing)
 * - urnLike: URN pattern filter (e.g., "urn:li:dataset:%")
 * - aspectNames: Comma-separated aspect names to filter
 * - lePitEpochMs: Process records created before this timestamp
 * - gePitEpochMs: Process records created after this timestamp
 *
 * Usage examples:
 *   ./gradlew runLoadIndices
 *   ./gradlew runLoadIndices -Plimit=5000
 *   ./gradlew runLoadIndices -Plimit=1000 -PbatchSize=2500
 *   ./gradlew runLoadIndices -PurnLike="urn:li:dataset:%"
 *   ./gradlew runLoadIndices -PesThreadCount=3
 */
task runLoadIndices(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the load indices process locally - ensures correct mappings and loads data from database to Elasticsearch."

    // Batch size (default: 10000) and Elasticsearch I/O thread count (default: 4).
    def batchSize = project.hasProperty('batchSize') ? project.getProperty('batchSize') : '10000'
    // Increase from default 2 to allow more concurrent HTTP connections
    def threadCount = project.hasProperty('esThreadCount') ? project.getProperty('esThreadCount') : '4'

    environment([
        "ENTITY_REGISTRY_CONFIG_PATH": "../metadata-models/src/main/resources/entity-registry.yml",
        // Optimize bulk request limit for sustained load
        "ES_BULK_REQUESTS_LIMIT"     : batchSize,
        // Set flush period to 5 minutes to allow proper batching
        "ES_BULK_FLUSH_PERIOD"       : "300",
        // Configure Elasticsearch I/O threads for better BulkProcessor performance
        "ELASTICSEARCH_THREAD_COUNT" : threadCount,
    ])

    // Full command line; named `launchCommand` so it does not shadow the Exec task's own `args` property.
    def launchCommand = [
        "java",
        "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
        "-jar",
        "-Dserver.port=8083",
        bootJar.archiveFile.get(),
        "-u", "LoadIndices",
        "-a", "batchSize=${batchSize}",
    ]

    // Forward each optional -P property, when supplied, as "-a <property>=<value>" (in this fixed order).
    ['limit', 'urnLike', 'aspectNames', 'lePitEpochMs', 'gePitEpochMs'].each { optionName ->
        if (project.hasProperty(optionName)) {
            launchCommand.addAll(["-a", "${optionName}=${project.getProperty(optionName)}"])
        }
    }

    commandLine launchCommand
}
2025-10-21 15:07:52 -05:00
/**
 * Runs SqlSetup for MySQL on locally running system.
 * Configured for quickstart MySQL service running on localhost:3306
 * Uses environment variables for backwards compatibility with docker/mysql-setup
 *
 * CREATE_USER, CDC_MCL_PROCESSING_ENABLED, CDC_USER and CDC_PASSWORD are taken from the invoking
 * environment when set to a non-empty value; otherwise the local defaults below apply.
 *
 * Usage:
 *   ./gradlew runSqlSetupMysql
 *   MYSQL_USERNAME=datahub MYSQL_PASSWORD=datahub CREATE_USER=true ./gradlew runSqlSetupMysql
 *   CDC_MCL_PROCESSING_ENABLED=true CDC_USER=datahub_cdc CDC_PASSWORD=datahub_cdc ./gradlew runSqlSetupMysql
 *
 * NOTE(review): MYSQL_USERNAME / MYSQL_PASSWORD are only inherited by the child process; this task
 * always sets EBEAN_DATASOURCE_USERNAME / EBEAN_DATASOURCE_PASSWORD to "datahub". Whether SqlSetup
 * reads the MYSQL_* variables is not visible from this file - confirm.
 */
task runSqlSetupMysql(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the SqlSetup process for MySQL locally - creates tables and optionally users/CDC. Uses environment variables for backwards compatibility."
    environment "ENTITY_REGISTRY_CONFIG_PATH", "../metadata-models/src/main/resources/entity-registry.yml"

    // Set standard Ebean environment variables for MySQL
    environment "EBEAN_DATASOURCE_USERNAME", "datahub"
    environment "EBEAN_DATASOURCE_PASSWORD", "datahub"
    environment "EBEAN_DATASOURCE_URL", "jdbc:mysql://localhost:3306/datahub"
    environment "EBEAN_DATASOURCE_DRIVER", "com.mysql.cj.jdbc.Driver"
    environment "EBEAN_USE_IAM_AUTH", "false"

    // Exec's `environment name, value` overwrites whatever the task inherited from the caller, so
    // unconditional values would silently discard the overrides shown in the usage examples.
    // Resolve each overridable variable from the caller first and fall back to the local default.
    def envOrDefault = { String key, String fallback -> System.getenv(key) ?: fallback }

    // Set SqlSetup-specific environment variables
    environment "CREATE_TABLES", "true"
    environment "CREATE_USER", envOrDefault("CREATE_USER", "false")
    environment "CDC_MCL_PROCESSING_ENABLED", envOrDefault("CDC_MCL_PROCESSING_ENABLED", "false")
    environment "CDC_USER", envOrDefault("CDC_USER", "datahub_cdc")
    environment "CDC_PASSWORD", envOrDefault("CDC_PASSWORD", "datahub_cdc")

    def args = ["java", "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
                "-jar", "-Dserver.port=8083", bootJar.getArchiveFile().get(), "-u", "SqlSetup"]
    commandLine args
}
/**
 * Runs SqlSetup for PostgreSQL on locally running system.
 * Configured for quickstart PostgreSQL service running on localhost:5432
 * Uses environment variables for backwards compatibility with docker/postgres-setup
 *
 * CREATE_USER, CDC_MCL_PROCESSING_ENABLED, CDC_USER and CDC_PASSWORD are taken from the invoking
 * environment when set to a non-empty value; otherwise the local defaults below apply.
 *
 * Usage:
 *   ./gradlew runSqlSetupPostgres
 *   POSTGRES_USERNAME=datahub POSTGRES_PASSWORD=datahub CREATE_USER=true ./gradlew runSqlSetupPostgres
 *   CDC_MCL_PROCESSING_ENABLED=true CDC_USER=datahub_cdc CDC_PASSWORD=datahub_cdc ./gradlew runSqlSetupPostgres
 *
 * NOTE(review): POSTGRES_USERNAME / POSTGRES_PASSWORD are only inherited by the child process; this
 * task always sets EBEAN_DATASOURCE_USERNAME / EBEAN_DATASOURCE_PASSWORD to "datahub". Whether
 * SqlSetup reads the POSTGRES_* variables is not visible from this file - confirm.
 */
task runSqlSetupPostgres(type: Exec) {
    dependsOn bootJar
    group = "Execution"
    description = "Run the SqlSetup process for PostgreSQL locally - creates database, tables and optionally users/CDC. Uses environment variables for backwards compatibility."
    environment "ENTITY_REGISTRY_CONFIG_PATH", "../metadata-models/src/main/resources/entity-registry.yml"

    // Set standard Ebean environment variables for PostgreSQL
    environment "EBEAN_DATASOURCE_USERNAME", "datahub"
    environment "EBEAN_DATASOURCE_PASSWORD", "datahub"
    environment "EBEAN_DATASOURCE_URL", "jdbc:postgresql://localhost:5432/datahub"
    environment "EBEAN_DATASOURCE_DRIVER", "org.postgresql.Driver"
    environment "EBEAN_USE_IAM_AUTH", "false"

    // Exec's `environment name, value` overwrites whatever the task inherited from the caller, so
    // unconditional values would silently discard the overrides shown in the usage examples.
    // Resolve each overridable variable from the caller first and fall back to the local default.
    def envOrDefault = { String key, String fallback -> System.getenv(key) ?: fallback }

    // Set SqlSetup-specific environment variables
    environment "CREATE_TABLES", "true"
    environment "CREATE_USER", envOrDefault("CREATE_USER", "false")
    environment "CDC_MCL_PROCESSING_ENABLED", envOrDefault("CDC_MCL_PROCESSING_ENABLED", "false")
    environment "CDC_USER", envOrDefault("CDC_USER", "datahub_cdc")
    environment "CDC_PASSWORD", envOrDefault("CDC_PASSWORD", "datahub_cdc")

    def args = ["java", "-agentlib:jdwp=transport=dt_socket,address=5003,server=y,suspend=n",
                "-jar", "-Dserver.port=8083", bootJar.getArchiveFile().get(), "-u", "SqlSetup"]
    commandLine args
}
2022-12-26 10:09:08 -06:00
// Docker image build configuration for this module.
docker {
    dependsOn(bootJar)
    name "${docker_registry}/${docker_repo}:${versionTag}"
    dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile")

    // Build context: the boot jar plus selected files from the repository root.
    files bootJar.outputs.files
    files fileTree(rootProject.projectDir) {
        include '.dockerignore'
        include 'docker/monitoring/*'
        include "docker/${docker_repo}/*"
        include 'metadata-models/src/main/resources/*'
    }.exclude { element ->
        // Drop hidden files, except those whose name ends with ".dockerignore".
        element.file.isHidden() && !element.file.name.endsWith(".dockerignore")
    }

    additionalTag("Debug", "${docker_registry}/${docker_repo}:debug")

    // Add build args if they are defined (needed for some CI or enterprise environments).
    // Maps Gradle project property name -> Docker build argument name.
    def buildArgByProperty = [
        alpineApkRepositoryUrl   : 'ALPINE_REPO_URL',
        githubMirrorUrl          : 'GITHUB_REPO_URL',
        mavenCentralRepositoryUrl: 'MAVEN_CENTRAL_REPO_URL',
    ]
    def dockerBuildArgs = [:]
    buildArgByProperty.each { propertyName, buildArgName ->
        if (project.hasProperty(propertyName)) {
            dockerBuildArgs[buildArgName] = project.getProperty(propertyName)
        }
    }
    if (!dockerBuildArgs.isEmpty()) {
        buildArgs(dockerBuildArgs)
    }
}