diff --git a/data-model/DDL/ETL_DDL/etl_configure_tables.sql b/data-model/DDL/ETL_DDL/etl_configure_tables.sql
index 3965de9abf..3673023f4b 100644
--- a/data-model/DDL/ETL_DDL/etl_configure_tables.sql
+++ b/data-model/DDL/ETL_DDL/etl_configure_tables.sql
@@ -142,14 +142,15 @@ CREATE TABLE cfg_database (
     jdbc_url varchar(1000) NULL,
     uri varchar(1000) NULL,
     short_connection_string varchar(50) COMMENT 'Oracle TNS Name, ODBC DSN, TDPID...' NULL,
-    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
     PRIMARY KEY(db_id),
-  UNIQUE KEY `uix_cfg_database__dbcode` (db_code) USING HASH
+    UNIQUE KEY `uix_cfg_database__dbcode` (db_code) USING HASH
 )
 ENGINE = InnoDB
 DEFAULT CHARSET = utf8
 COMMENT = 'Abstract different storage instances as databases' ;
+

 CREATE TABLE stg_cfg_object_name_map (
     object_type varchar(100) NOT NULL,
     object_sub_type varchar(100) NULL,
@@ -164,17 +165,15 @@ CREATE TABLE stg_cfg_object_name_map (
     mapped_object_urn varchar(350) NULL,
     mapped_object_dataset_id int(11) UNSIGNED NULL,
     description varchar(500) NULL,
-    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    PRIMARY KEY(object_name,mapped_object_name)
+    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    PRIMARY KEY(object_name, mapped_object_name),
+    KEY idx_stg_cfg_object_name_map__mappedobjectname (mapped_object_name) USING BTREE
 )
 ENGINE = InnoDB
 CHARACTER SET latin1
 COLLATE latin1_swedish_ci
 COMMENT = 'Map alias (when is_identical_map=Y) and view dependency' ;

-CREATE INDEX idx_stg_cfg_object_name_map__mappedobjectname USING BTREE
-    ON stg_cfg_object_name_map(mapped_object_name);
-
 CREATE TABLE cfg_object_name_map (
     obj_name_map_id int(11) AUTO_INCREMENT NOT NULL,
     object_type varchar(100) NOT NULL,
@@ -188,36 +187,30 @@ CREATE TABLE cfg_object_name_map (
     mapped_object_name varchar(350) NOT NULL COMMENT 'this is the original/parent object',
     mapped_object_dataset_id int(11) UNSIGNED NULL COMMENT 'can be the abstract dataset id for versioned objects',
     description varchar(500) NULL,
-    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
-    PRIMARY KEY(obj_name_map_id)
+    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    PRIMARY KEY(obj_name_map_id),
+    KEY idx_cfg_object_name_map__mappedobjectname (mapped_object_name) USING BTREE,
+    CONSTRAINT uix_cfg_object_name_map__objectname_mappedobjectname UNIQUE (object_name, mapped_object_name)
 )
 ENGINE = InnoDB
 CHARACTER SET latin1
 AUTO_INCREMENT = 1
 COMMENT = 'Map alias (when is_identical_map=Y) and view dependency. Always map from Derived/Child (object) back to its Original/Parent (mapped_object)' ;

-ALTER TABLE cfg_object_name_map
-    ADD CONSTRAINT uix_cfg_object_name_map__objectname_mappedobjectname
-    UNIQUE (object_name, mapped_object_name);
-
-CREATE INDEX idx_cfg_object_name_map__mappedobjectname USING BTREE
-    ON cfg_object_name_map(mapped_object_name);
-
 CREATE TABLE cfg_deployment_tier (
     tier_id tinyint(4) NOT NULL,
     tier_code varchar(25) COMMENT 'local,dev,test,qa,stg,prod' NOT NULL,
     tier_label varchar(50) COMMENT 'display full name' NULL,
     sort_id smallint(6) COMMENT '3-digit for group, 3-digit within group' NOT NULL,
-    last_modified timestamp NOT NULL,
-    PRIMARY KEY(tier_id)
+    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    PRIMARY KEY(tier_id),
+    UNIQUE KEY uix_cfg_deployment_tier__tiercode (tier_code)
 )
 ENGINE = InnoDB
 AUTO_INCREMENT = 0
 COMMENT = 'http://en.wikipedia.org/wiki/Deployment_environment';


-CREATE UNIQUE INDEX uix_cfg_deployment_tier__tiercode
-    ON cfg_deployment_tier(tier_code);

 CREATE TABLE cfg_data_center (
     data_center_id smallint(6) NOT NULL DEFAULT '0',
@@ -230,16 +223,14 @@ CREATE TABLE cfg_data_center (
     longtitude decimal(10,6) NULL,
     latitude decimal(10,6) NULL,
     data_center_status char(1) COMMENT 'A,D,U' NULL,
-    last_modified timestamp NULL,
-    PRIMARY KEY(data_center_id)
+    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    PRIMARY KEY(data_center_id),
+    UNIQUE KEY uix_cfg_data_center__datacentercode (data_center_code)
 )
 ENGINE = InnoDB
 AUTO_INCREMENT = 0
 COMMENT = 'https://en.wikipedia.org/wiki/Data_center' ;

-CREATE UNIQUE INDEX uix_cfg_data_center__datacentercode
-    ON cfg_data_center(data_center_code);
-

 CREATE TABLE cfg_cluster (
     cluster_id smallint(6) NOT NULL DEFAULT '0',
@@ -249,16 +240,15 @@ CREATE TABLE cfg_cluster (
     deployment_tier_code varchar(25) NOT NULL,
     data_center_code varchar(30) NULL,
     description varchar(200) NULL,
-    last_modified timestamp NULL,
-    PRIMARY KEY(cluster_id)
+    last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    PRIMARY KEY(cluster_id),
+    UNIQUE KEY uix_cfg_cluster__clustercode (cluster_code)
 )
 COMMENT = 'https://en.wikipedia.org/wiki/Computer_cluster' ;

-CREATE UNIQUE INDEX uix_cfg_cluster__clustercode
-    ON cfg_cluster(cluster_code);

 CREATE TABLE IF NOT EXISTS cfg_search_score_boost (
     `id` INT COMMENT 'dataset id',
     `static_boosting_score` INT COMMENT 'static boosting score for elastic search',
     PRIMARY KEY (`id`)
-) ENGINE = InnoDB DEFAULT CHARSET = latin1;
\ No newline at end of file
+) ENGINE = InnoDB DEFAULT CHARSET = latin1;
diff --git a/data-model/DDL/ETL_DDL/git_metadata.sql b/data-model/DDL/ETL_DDL/git_metadata.sql
index 5360f34b2f..16e932323c 100644
--- a/data-model/DDL/ETL_DDL/git_metadata.sql
+++ b/data-model/DDL/ETL_DDL/git_metadata.sql
@@ -73,6 +73,7 @@ CREATE TABLE `stg_product_repo` (
   `scm_type` VARCHAR(20) NOT NULL,
   `repo_id` INT UNSIGNED DEFAULT NULL,
   `project` VARCHAR(100) DEFAULT NULL,
+  `dataset_group` VARCHAR(200) DEFAULT NULL COMMENT 'dataset group name, database name, etc',
   `owner_type` VARCHAR(50) DEFAULT NULL,
   `owner_name` VARCHAR(300) DEFAULT NULL COMMENT 'owner names in comma separated list',
   `multiproduct_name` VARCHAR(100) DEFAULT NULL,
@@ -88,6 +89,7 @@ CREATE TABLE `stg_repo_owner` (
   `scm_repo_fullname` VARCHAR(100) NOT NULL,
   `scm_type` VARCHAR(20) NOT NULL,
   `repo_id` INT DEFAULT NULL,
+  `dataset_group` VARCHAR(200) DEFAULT NULL COMMENT 'dataset group name, database name, etc',
   `owner_type` VARCHAR(50) DEFAULT NULL COMMENT 'which acl file this owner is in',
   `owner_name` VARCHAR(50) DEFAULT NULL COMMENT 'one owner name',
   `sort_id` INT UNSIGNED DEFAULT NULL,
@@ -96,7 +98,6 @@ CREATE TABLE `stg_repo_owner` (
 )
   ENGINE = InnoDB
   DEFAULT CHARSET = latin1;
-
 CREATE TABLE stg_database_scm_map (
   `database_name` VARCHAR(100) COMMENT 'database name',
   `database_type` VARCHAR(50) COMMENT 'database type',
diff --git a/metadata-etl/src/main/resources/jython/CodeSearchLoad.py b/metadata-etl/src/main/resources/jython/CodeSearchLoad.py
index 31ae606a8f..0c93d3c26c 100644
--- a/metadata-etl/src/main/resources/jython/CodeSearchLoad.py
+++ b/metadata-etl/src/main/resources/jython/CodeSearchLoad.py
@@ -101,8 +101,7 @@ class CodeSearchLoad:
         ON DUPLICATE KEY UPDATE
         dataset_urn = n.urn,
         sort_id = COALESCE(n.n_sort_id, sort_id),
-        owner_type = CASE WHEN n.n_owner_type IS NULL OR owner_type >= n.n_owner_type
-                        THEN owner_type ELSE n.n_owner_type END,
+        owner_type = n.n_owner_type,
         owner_sub_type = COALESCE(owner_sub_type, n.n_owner_sub_type),
         owner_id_type = COALESCE(owner_id_type, n.n_owner_id_type),
         owner_source = CASE WHEN owner_source is null THEN 'SCM'
diff --git a/metadata-etl/src/main/resources/jython/MultiproductLoad.py b/metadata-etl/src/main/resources/jython/MultiproductLoad.py
index 03d4052aee..4475260c45 100644
--- a/metadata-etl/src/main/resources/jython/MultiproductLoad.py
+++ b/metadata-etl/src/main/resources/jython/MultiproductLoad.py
@@ -70,7 +70,30 @@ class MultiproductLoad:
         FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
         LINES TERMINATED BY '\n'
         (`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `project`, `owner_type`, `owner_name`,
-        `multiproduct_name`, `product_type`, `product_version`, `namespace`)
+        `multiproduct_name`, `product_type`, `product_version`, `namespace`);
+
+        -- map repo to oracle or espresso database
+        UPDATE stg_product_repo r
+        INNER JOIN
+          (select database_type, substring_index(substring_index(scm_url, '/', 5), '/', -2) repo,
+            GROUP_CONCAT(database_name SEPARATOR ', ') dataset_groups
+          from stg_database_scm_map
+          where scm_type = 'git' and database_type in ('espresso', 'oracle')
+          group by repo, database_type) d
+        ON d.repo = r.scm_repo_fullname
+          AND r.app_id = {app_id}
+        SET r.dataset_group = d.dataset_groups;
+
+        -- map dali repo to dali dataset group
+        UPDATE stg_product_repo
+        SET dataset_group = REPLACE(substring_index(LEFT(scm_repo_fullname, LENGTH(scm_repo_fullname) - 9), '/', -1), '-', '_')
+        WHERE app_id = {app_id}
+          AND project IN ('dali-datasets', 'dali-base-datasets');
+
+        UPDATE stg_product_repo
+        SET dataset_group = concat(dataset_group, '_mp, ', dataset_group, '_mp_versioned')
+        WHERE app_id = {app_id}
+          AND project IN ('dali-datasets', 'dali-base-datasets')
         '''.format(source_file=self.product_repo_file, app_id=self.app_id)

     self.executeCommands(load_product_repos_cmd)
@@ -86,7 +109,15 @@ class MultiproductLoad:
         INTO TABLE stg_repo_owner
         FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
         LINES TERMINATED BY '\n'
-        (`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `owner_type`, `owner_name`, `sort_id`, `paths`)
+        (`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `owner_type`, `owner_name`, `sort_id`, `paths`);
+
+        -- update dataset_group from repo
+        UPDATE stg_repo_owner ro
+        INNER JOIN stg_product_repo pr
+        ON ro.app_id = {app_id} AND pr.app_id = {app_id}
+          AND ro.scm_repo_fullname = pr.scm_repo_fullname
+          AND pr.dataset_group IS NOT NULL
+        SET ro.dataset_group = pr.dataset_group
         '''.format(source_file=self.product_repo_owner_file, app_id=self.app_id)

     self.executeCommands(load_product_repo_owners_cmd)
@@ -109,40 +140,39 @@ class MultiproductLoad:
         -- INSERT/UPDATE into dataset_owner
         INSERT INTO dataset_owner (
           dataset_id, dataset_urn, owner_id, sort_id, namespace, app_id, owner_type, owner_sub_type, owner_id_type,
-          owner_source, db_ids, is_group, is_active, source_time, created_time, wh_etl_exec_id
+          owner_source, db_ids, is_group, is_active, source_time, created_time, wh_etl_exec_id, confirmed_by, confirmed_on
         )
         SELECT * FROM (
           SELECT ds.id, ds.urn, r.owner_name n_owner_id, r.sort_id n_sort_id,
             'urn:li:corpuser' n_namespace, r.app_id,
-            IF(r.owner_type = 'main', 'Producer', r.owner_type) n_owner_type,
-            null n_owner_sub_type,
+            'Owner' n_owner_type, r.owner_type n_owner_sub_type,
             case when r.app_id = 300 then 'USER' when r.app_id = 301 then 'GROUP' else null end n_owner_id_type,
             'SCM' n_owner_source, null db_ids, IF(r.app_id = 301, 'Y', 'N') is_group,
-            'Y' is_active, 0 source_time, unix_timestamp(NOW()) created_time, r.wh_etl_exec_id
-          FROM (SELECT id, urn FROM dict_dataset WHERE urn like 'dalids:///%') ds
-            JOIN (SELECT object_name, mapped_object_name FROM cfg_object_name_map WHERE mapped_object_type = 'scm') m
-              ON m.object_name = concat('/', substring_index(substring_index(ds.urn, '/', 4), '/', -1))
+            'Y' is_active, 0 source_time, unix_timestamp(NOW()) created_time, r.wh_etl_exec_id,
+            'system' confirmed_by, unix_timestamp(NOW()) confirmed_on
+          FROM (SELECT id, urn, substring_index(substring_index(urn, '/', 4), '/', -1) ds_group
+                FROM dict_dataset WHERE urn regexp '^(dalids|espresso|oracle)\:\/\/\/.*$') ds
             JOIN stg_repo_owner r
-              ON r.scm_repo_fullname = m.mapped_object_name
+              ON r.owner_type in ('main', 'espresso_avsc', 'producer', 'consumer', 'global', 'public', 'private', 'database', 'root')
+              -- dataset_group entries are joined with comma plus space and FIND_IN_SET splits on bare commas, so drop the spaces first
+              AND FIND_IN_SET(ds.ds_group, REPLACE(r.dataset_group, ', ', ',')) > 0
         ) n
         ON DUPLICATE KEY UPDATE
         dataset_urn = n.urn,
         sort_id = COALESCE(n.n_sort_id, sort_id),
         -- the Owner_type precedence (from high to low) is: OWNER, PRODUCER, DELEGATE, STAKEHOLDER
-        owner_type = CASE WHEN (
-          case owner_type when 'OWNER' then 20 when 'PRODUCER' then 40 when 'DELEGATE' then 60 when 'STACKHOLDER' then 80 else 100 end
-        ) <= (
-          case n.n_owner_type when 'OWNER' then 20 when 'PRODUCER' then 40 when 'DELEGATE' then 60 when 'STACKHOLDER' then 80 else 100 end
-        )
-        THEN owner_type ELSE n.n_owner_type END,
+        -- n.n_owner_type = Owner, highest priority
+        owner_type = n.n_owner_type,
         owner_sub_type = COALESCE(owner_sub_type, n.n_owner_sub_type),
         owner_id_type = COALESCE(owner_id_type, n.n_owner_id_type),
         owner_source = CASE WHEN owner_source is null THEN 'SCM'
                         WHEN owner_source LIKE '%SCM%' THEN owner_source
                         ELSE CONCAT(owner_source, ',SCM') END,
         namespace = COALESCE(namespace, n.n_namespace),
         wh_etl_exec_id = n.wh_etl_exec_id,
-        modified_time = unix_timestamp(NOW());
+        modified_time = unix_timestamp(NOW()),
+        confirmed_by = 'system',
+        confirmed_on = unix_timestamp(NOW());

         -- reset dataset owner sort id
         UPDATE dataset_owner d
@@ -151,7 +181,7 @@ class MultiproductLoad:
               @owner_rank := IF(@current_dataset_id = dataset_id, @owner_rank + 1, 0) rank,
               @current_dataset_id := dataset_id
             from dataset_owner, (select @current_dataset_id := 0, @owner_rank := 0) t
-            where dataset_urn like 'dalids:///%'
+            where dataset_urn regexp '^(dalids|espresso|oracle)\:\/\/\/.*$'
             order by dataset_id asc, owner_type desc, sort_id asc, owner_id asc
           ) s
         ON d.dataset_id = s.dataset_id AND d.owner_id = s.owner_id