Merge pull request #277 from alyiwang/master

Map git repo and owners to Oracle/espresso/dali datasets
This commit is contained in:
Yi (Alan) Wang 2016-11-30 17:40:05 -08:00 committed by GitHub
commit 20db44df20
4 changed files with 70 additions and 51 deletions

View File

@ -142,7 +142,7 @@ CREATE TABLE cfg_database (
jdbc_url varchar(1000) NULL, jdbc_url varchar(1000) NULL,
uri varchar(1000) NULL, uri varchar(1000) NULL,
short_connection_string varchar(50) COMMENT 'Oracle TNS Name, ODBC DSN, TDPID...' NULL, short_connection_string varchar(50) COMMENT 'Oracle TNS Name, ODBC DSN, TDPID...' NULL,
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(db_id), PRIMARY KEY(db_id),
UNIQUE KEY `uix_cfg_database__dbcode` (db_code) USING HASH UNIQUE KEY `uix_cfg_database__dbcode` (db_code) USING HASH
) )
@ -150,6 +150,7 @@ ENGINE = InnoDB
DEFAULT CHARSET = utf8 DEFAULT CHARSET = utf8
COMMENT = 'Abstract different storage instances as databases' ; COMMENT = 'Abstract different storage instances as databases' ;
CREATE TABLE stg_cfg_object_name_map ( CREATE TABLE stg_cfg_object_name_map (
object_type varchar(100) NOT NULL, object_type varchar(100) NOT NULL,
object_sub_type varchar(100) NULL, object_sub_type varchar(100) NULL,
@ -164,17 +165,15 @@ CREATE TABLE stg_cfg_object_name_map (
mapped_object_urn varchar(350) NULL, mapped_object_urn varchar(350) NULL,
mapped_object_dataset_id int(11) UNSIGNED NULL, mapped_object_dataset_id int(11) UNSIGNED NULL,
description varchar(500) NULL, description varchar(500) NULL,
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(object_name,mapped_object_name) PRIMARY KEY(object_name, mapped_object_name),
KEY idx_stg_cfg_object_name_map__mappedobjectname (mapped_object_name) USING BTREE
) )
ENGINE = InnoDB ENGINE = InnoDB
CHARACTER SET latin1 CHARACTER SET latin1
COLLATE latin1_swedish_ci COLLATE latin1_swedish_ci
COMMENT = 'Map alias (when is_identical_map=Y) and view dependency' ; COMMENT = 'Map alias (when is_identical_map=Y) and view dependency' ;
CREATE INDEX idx_stg_cfg_object_name_map__mappedobjectname USING BTREE
ON stg_cfg_object_name_map(mapped_object_name);
CREATE TABLE cfg_object_name_map ( CREATE TABLE cfg_object_name_map (
obj_name_map_id int(11) AUTO_INCREMENT NOT NULL, obj_name_map_id int(11) AUTO_INCREMENT NOT NULL,
object_type varchar(100) NOT NULL, object_type varchar(100) NOT NULL,
@ -188,36 +187,30 @@ CREATE TABLE cfg_object_name_map (
mapped_object_name varchar(350) NOT NULL COMMENT 'this is the original/parent object', mapped_object_name varchar(350) NOT NULL COMMENT 'this is the original/parent object',
mapped_object_dataset_id int(11) UNSIGNED NULL COMMENT 'can be the abstract dataset id for versioned objects', mapped_object_dataset_id int(11) UNSIGNED NULL COMMENT 'can be the abstract dataset id for versioned objects',
description varchar(500) NULL, description varchar(500) NULL,
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(obj_name_map_id) PRIMARY KEY(obj_name_map_id),
KEY idx_cfg_object_name_map__mappedobjectname (mapped_object_name) USING BTREE,
CONSTRAINT uix_cfg_object_name_map__objectname_mappedobjectname UNIQUE (object_name, mapped_object_name)
) )
ENGINE = InnoDB ENGINE = InnoDB
CHARACTER SET latin1 CHARACTER SET latin1
AUTO_INCREMENT = 1 AUTO_INCREMENT = 1
COMMENT = 'Map alias (when is_identical_map=Y) and view dependency. Always map from Derived/Child (object) back to its Original/Parent (mapped_object)' ; COMMENT = 'Map alias (when is_identical_map=Y) and view dependency. Always map from Derived/Child (object) back to its Original/Parent (mapped_object)' ;
ALTER TABLE cfg_object_name_map
ADD CONSTRAINT uix_cfg_object_name_map__objectname_mappedobjectname
UNIQUE (object_name, mapped_object_name);
CREATE INDEX idx_cfg_object_name_map__mappedobjectname USING BTREE
ON cfg_object_name_map(mapped_object_name);
-- Lookup table of deployment tiers, one row per tier_id.
-- tier_code is unique. The uniqueness constraint is declared inline, so no separate
-- CREATE UNIQUE INDEX statement is needed (a second index with the same name would fail).
-- last_modified is maintained by MySQL on insert and on every update.
-- NOTE(review): the sort_id comment describes a 3-digit + 3-digit value, but a signed
-- smallint tops out at 32767 -- confirm the intended range before relying on six digits.
CREATE TABLE cfg_deployment_tier (
  tier_id       tinyint(4) NOT NULL,
  tier_code     varchar(25) COMMENT 'local,dev,test,qa,stg,prod' NOT NULL,
  tier_label    varchar(50) COMMENT 'display full name' NULL,
  sort_id       smallint(6) COMMENT '3-digit for group, 3-digit within group' NOT NULL,
  last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY(tier_id),
  UNIQUE KEY uix_cfg_deployment_tier__tiercode (tier_code)
)
ENGINE = InnoDB
AUTO_INCREMENT = 0
COMMENT = 'http://en.wikipedia.org/wiki/Deployment_environment';
CREATE TABLE cfg_data_center ( CREATE TABLE cfg_data_center (
data_center_id smallint(6) NOT NULL DEFAULT '0', data_center_id smallint(6) NOT NULL DEFAULT '0',
@ -230,16 +223,14 @@ CREATE TABLE cfg_data_center (
longtitude decimal(10,6) NULL, longtitude decimal(10,6) NULL,
latitude decimal(10,6) NULL, latitude decimal(10,6) NULL,
data_center_status char(1) COMMENT 'A,D,U' NULL, data_center_status char(1) COMMENT 'A,D,U' NULL,
last_modified timestamp NULL, last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(data_center_id) PRIMARY KEY(data_center_id),
UNIQUE KEY uix_cfg_data_center__datacentercode (data_center_code)
) )
ENGINE = InnoDB ENGINE = InnoDB
AUTO_INCREMENT = 0 AUTO_INCREMENT = 0
COMMENT = 'https://en.wikipedia.org/wiki/Data_center' ; COMMENT = 'https://en.wikipedia.org/wiki/Data_center' ;
CREATE UNIQUE INDEX uix_cfg_data_center__datacentercode
ON cfg_data_center(data_center_code);
CREATE TABLE cfg_cluster ( CREATE TABLE cfg_cluster (
cluster_id smallint(6) NOT NULL DEFAULT '0', cluster_id smallint(6) NOT NULL DEFAULT '0',
@ -249,13 +240,12 @@ CREATE TABLE cfg_cluster (
deployment_tier_code varchar(25) NOT NULL, deployment_tier_code varchar(25) NOT NULL,
data_center_code varchar(30) NULL, data_center_code varchar(30) NULL,
description varchar(200) NULL, description varchar(200) NULL,
last_modified timestamp NULL, last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(cluster_id) PRIMARY KEY(cluster_id),
UNIQUE KEY uix_cfg_cluster__clustercode (cluster_code)
) )
COMMENT = 'https://en.wikipedia.org/wiki/Computer_cluster' ; COMMENT = 'https://en.wikipedia.org/wiki/Computer_cluster' ;
CREATE UNIQUE INDEX uix_cfg_cluster__clustercode
ON cfg_cluster(cluster_code);
CREATE TABLE IF NOT EXISTS cfg_search_score_boost ( CREATE TABLE IF NOT EXISTS cfg_search_score_boost (
`id` INT COMMENT 'dataset id', `id` INT COMMENT 'dataset id',

View File

@ -73,6 +73,7 @@ CREATE TABLE `stg_product_repo` (
`scm_type` VARCHAR(20) NOT NULL, `scm_type` VARCHAR(20) NOT NULL,
`repo_id` INT UNSIGNED DEFAULT NULL, `repo_id` INT UNSIGNED DEFAULT NULL,
`project` VARCHAR(100) DEFAULT NULL, `project` VARCHAR(100) DEFAULT NULL,
`dataset_group` VARCHAR(200) DEFAULT NULL COMMENT 'dataset group name, database name, etc',
`owner_type` VARCHAR(50) DEFAULT NULL, `owner_type` VARCHAR(50) DEFAULT NULL,
`owner_name` VARCHAR(300) DEFAULT NULL COMMENT 'owner names in comma separated list', `owner_name` VARCHAR(300) DEFAULT NULL COMMENT 'owner names in comma separated list',
`multiproduct_name` VARCHAR(100) DEFAULT NULL, `multiproduct_name` VARCHAR(100) DEFAULT NULL,
@ -88,6 +89,7 @@ CREATE TABLE `stg_repo_owner` (
`scm_repo_fullname` VARCHAR(100) NOT NULL, `scm_repo_fullname` VARCHAR(100) NOT NULL,
`scm_type` VARCHAR(20) NOT NULL, `scm_type` VARCHAR(20) NOT NULL,
`repo_id` INT DEFAULT NULL, `repo_id` INT DEFAULT NULL,
`dataset_group` VARCHAR(200) DEFAULT NULL COMMENT 'dataset group name, database name, etc',
`owner_type` VARCHAR(50) DEFAULT NULL COMMENT 'which acl file this owner is in', `owner_type` VARCHAR(50) DEFAULT NULL COMMENT 'which acl file this owner is in',
`owner_name` VARCHAR(50) DEFAULT NULL COMMENT 'one owner name', `owner_name` VARCHAR(50) DEFAULT NULL COMMENT 'one owner name',
`sort_id` INT UNSIGNED DEFAULT NULL, `sort_id` INT UNSIGNED DEFAULT NULL,
@ -96,7 +98,6 @@ CREATE TABLE `stg_repo_owner` (
) ENGINE = InnoDB DEFAULT CHARSET = latin1; ) ENGINE = InnoDB DEFAULT CHARSET = latin1;
CREATE TABLE stg_database_scm_map ( CREATE TABLE stg_database_scm_map (
`database_name` VARCHAR(100) COMMENT 'database name', `database_name` VARCHAR(100) COMMENT 'database name',
`database_type` VARCHAR(50) COMMENT 'database type', `database_type` VARCHAR(50) COMMENT 'database type',

View File

@ -101,8 +101,7 @@ class CodeSearchLoad:
ON DUPLICATE KEY UPDATE ON DUPLICATE KEY UPDATE
dataset_urn = n.urn, dataset_urn = n.urn,
sort_id = COALESCE(n.n_sort_id, sort_id), sort_id = COALESCE(n.n_sort_id, sort_id),
owner_type = CASE WHEN n.n_owner_type IS NULL OR owner_type >= n.n_owner_type owner_type = n.n_owner_type,
THEN owner_type ELSE n.n_owner_type END,
owner_sub_type = COALESCE(owner_sub_type, n.n_owner_sub_type), owner_sub_type = COALESCE(owner_sub_type, n.n_owner_sub_type),
owner_id_type = COALESCE(owner_id_type, n.n_owner_id_type), owner_id_type = COALESCE(owner_id_type, n.n_owner_id_type),
owner_source = CASE WHEN owner_source is null THEN 'SCM' owner_source = CASE WHEN owner_source is null THEN 'SCM'

View File

@ -70,7 +70,30 @@ class MultiproductLoad:
FIELDS TERMINATED BY '\Z' ESCAPED BY '\0' FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
LINES TERMINATED BY '\n' LINES TERMINATED BY '\n'
(`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `project`, `owner_type`, `owner_name`, (`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `project`, `owner_type`, `owner_name`,
`multiproduct_name`, `product_type`, `product_version`, `namespace`) `multiproduct_name`, `product_type`, `product_version`, `namespace`);
-- map repo to oracle or espresso database
-- The derived table holds exactly one row per repo, so the joined UPDATE never has to
-- choose between several candidate rows for the same stg_product_repo row.
-- The repo key is the last two of the first five slash-separated pieces of scm_url
-- (for a URL shaped like scheme://host/org/name that is org/name).
-- The list is joined with a bare comma and no space, because FIND_IN_SET compares list
-- elements verbatim and would not match an element that starts with a space.
-- NOTE(review): dataset_group is VARCHAR(200) -- confirm long lists cannot overflow it.
UPDATE stg_product_repo r
INNER JOIN
  (SELECT substring_index(substring_index(scm_url, '/', 5), '/', -2) repo,
          GROUP_CONCAT(DISTINCT database_name ORDER BY database_name SEPARATOR ',') dataset_groups
   FROM stg_database_scm_map
   WHERE scm_type = 'git' AND database_type IN ('espresso', 'oracle')
   GROUP BY repo) d
  ON d.repo = r.scm_repo_fullname
  AND r.app_id = {app_id}
SET r.dataset_group = d.dataset_groups;
-- map dali repo to dali dataset group
-- Step 1: drop the trailing 9 characters of scm_repo_fullname, keep the last path segment,
-- and turn dashes into underscores.
-- NOTE(review): presumably the 9 characters are a fixed repo-name suffix -- confirm.
UPDATE stg_product_repo
SET dataset_group = REPLACE(substring_index(LEFT(scm_repo_fullname, LENGTH(scm_repo_fullname) - 9), '/', -1), '-', '_')
WHERE app_id = {app_id}
  AND project IN ('dali-datasets', 'dali-base-datasets');

-- Step 2: expand the base name from step 1 into the pair base_mp and base_mp_versioned.
-- The two names are joined with a bare comma and no space, because FIND_IN_SET compares
-- list elements verbatim and would never match a second element that starts with a space.
UPDATE stg_product_repo
SET dataset_group = concat(dataset_group, '_mp,', dataset_group, '_mp_versioned')
WHERE app_id = {app_id}
  AND project IN ('dali-datasets', 'dali-base-datasets')
'''.format(source_file=self.product_repo_file, app_id=self.app_id) '''.format(source_file=self.product_repo_file, app_id=self.app_id)
self.executeCommands(load_product_repos_cmd) self.executeCommands(load_product_repos_cmd)
@ -86,7 +109,15 @@ class MultiproductLoad:
INTO TABLE stg_repo_owner INTO TABLE stg_repo_owner
FIELDS TERMINATED BY '\Z' ESCAPED BY '\0' FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
LINES TERMINATED BY '\n' LINES TERMINATED BY '\n'
(`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `owner_type`, `owner_name`, `sort_id`, `paths`) (`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `owner_type`, `owner_name`, `sort_id`, `paths`);
-- update dataset_group from repo
-- Copies dataset_group from stg_product_repo onto the stg_repo_owner rows that share the
-- same scm_repo_fullname, limited to the current app_id on both sides. Repos that have no
-- dataset_group are left out, so their owner rows are not touched.
UPDATE stg_repo_owner ro
INNER JOIN stg_product_repo pr
  ON ro.scm_repo_fullname = pr.scm_repo_fullname
SET ro.dataset_group = pr.dataset_group
WHERE ro.app_id = {app_id}
  AND pr.app_id = {app_id}
  AND pr.dataset_group IS NOT NULL
'''.format(source_file=self.product_repo_owner_file, app_id=self.app_id) '''.format(source_file=self.product_repo_owner_file, app_id=self.app_id)
self.executeCommands(load_product_repo_owners_cmd) self.executeCommands(load_product_repo_owners_cmd)
@ -109,40 +140,38 @@ class MultiproductLoad:
-- INSERT/UPDATE into dataset_owner
-- Builds one candidate owner row for every pair of dataset and stg_repo_owner row where
-- the dataset group taken from the urn appears in the owner row's dataset_group list, then
-- upserts it into dataset_owner.
-- The derived table n carries n_ prefixed aliases so the ON DUPLICATE KEY UPDATE clause
-- can tell incoming values apart from the columns already stored in dataset_owner.
INSERT INTO dataset_owner (
  dataset_id, dataset_urn, owner_id, sort_id, namespace, app_id, owner_type, owner_sub_type, owner_id_type,
  owner_source, db_ids, is_group, is_active, source_time, created_time, wh_etl_exec_id, confirmed_by, confirmed_on
)
SELECT n.id, n.urn, n.n_owner_id, n.n_sort_id, n.n_namespace, n.app_id, n.n_owner_type, n.n_owner_sub_type,
       n.n_owner_id_type, n.n_owner_source, n.db_ids, n.is_group, n.is_active, n.source_time, n.created_time,
       n.wh_etl_exec_id, n.confirmed_by, n.confirmed_on
FROM (
  SELECT ds.id, ds.urn, r.owner_name n_owner_id, r.sort_id n_sort_id,
    'urn:li:corpuser' n_namespace, r.app_id,
    'Owner' n_owner_type, r.owner_type n_owner_sub_type,
    case when r.app_id = 300 then 'USER' when r.app_id = 301 then 'GROUP' else null end n_owner_id_type,
    'SCM' n_owner_source, null db_ids,
    IF(r.app_id = 301, 'Y', 'N') is_group,
    'Y' is_active, 0 source_time, unix_timestamp(NOW()) created_time, r.wh_etl_exec_id,
    'system' confirmed_by, unix_timestamp(NOW()) confirmed_on
  -- ds_group is the fourth slash-separated piece of the urn
  FROM (SELECT id, urn, substring_index(substring_index(urn, '/', 4), '/', -1) ds_group
        FROM dict_dataset WHERE urn regexp '^(dalids|espresso|oracle)\:\/\/\/.*$') ds
  JOIN stg_repo_owner r
    ON r.owner_type in ('main', 'espresso_avsc', 'producer', 'consumer', 'global', 'public', 'private', 'database', 'root')
    -- FIND_IN_SET compares list elements verbatim, so strip the space that follows each
    -- comma first. Without this only the first name of a comma-space separated list matches.
    AND FIND_IN_SET(ds.ds_group, REPLACE(r.dataset_group, ', ', ',')) > 0
  ) n
ON DUPLICATE KEY UPDATE
  dataset_urn = n.urn,
  sort_id = COALESCE(n.n_sort_id, sort_id),
  -- n.n_owner_type is always the literal Owner (highest priority), so it overwrites the
  -- stored owner_type unconditionally instead of being ranked against it
  owner_type = n.n_owner_type,
  -- stored values win for these three columns, the incoming value only fills a NULL
  owner_sub_type = COALESCE(owner_sub_type, n.n_owner_sub_type),
  owner_id_type = COALESCE(owner_id_type, n.n_owner_id_type),
  -- append SCM to the stored source list unless it is already mentioned there
  owner_source = CASE WHEN owner_source is null THEN 'SCM'
    WHEN owner_source LIKE '%SCM%' THEN owner_source ELSE CONCAT(owner_source, ',SCM') END,
  namespace = COALESCE(namespace, n.n_namespace),
  wh_etl_exec_id = n.wh_etl_exec_id,
  modified_time = unix_timestamp(NOW()),
  confirmed_by = 'system',
  confirmed_on = unix_timestamp(NOW());
-- reset dataset owner sort id -- reset dataset owner sort id
UPDATE dataset_owner d UPDATE dataset_owner d
@ -151,7 +180,7 @@ class MultiproductLoad:
@owner_rank := IF(@current_dataset_id = dataset_id, @owner_rank + 1, 0) rank, @owner_rank := IF(@current_dataset_id = dataset_id, @owner_rank + 1, 0) rank,
@current_dataset_id := dataset_id @current_dataset_id := dataset_id
from dataset_owner, (select @current_dataset_id := 0, @owner_rank := 0) t from dataset_owner, (select @current_dataset_id := 0, @owner_rank := 0) t
where dataset_urn like 'dalids:///%' where dataset_urn regexp '^(dalids|espresso|oracle)\:\/\/\/.*$'
order by dataset_id asc, owner_type desc, sort_id asc, owner_id asc order by dataset_id asc, owner_type desc, sort_id asc, owner_id asc
) s ) s
ON d.dataset_id = s.dataset_id AND d.owner_id = s.owner_id ON d.dataset_id = s.dataset_id AND d.owner_id = s.owner_id