Merge pull request #277 from alyiwang/master

Map git repo and owners to Oracle/espresso/dali datasets
This commit is contained in:
Yi (Alan) Wang 2016-11-30 17:40:05 -08:00 committed by GitHub
commit 20db44df20
4 changed files with 70 additions and 51 deletions

View File

@ -142,14 +142,15 @@ CREATE TABLE cfg_database (
jdbc_url varchar(1000) NULL,
uri varchar(1000) NULL,
short_connection_string varchar(50) COMMENT 'Oracle TNS Name, ODBC DSN, TDPID...' NULL,
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(db_id),
UNIQUE KEY `uix_cfg_database__dbcode` (db_code) USING HASH
UNIQUE KEY `uix_cfg_database__dbcode` (db_code) USING HASH
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8
COMMENT = 'Abstract different storage instances as databases' ;
CREATE TABLE stg_cfg_object_name_map (
object_type varchar(100) NOT NULL,
object_sub_type varchar(100) NULL,
@ -164,17 +165,15 @@ CREATE TABLE stg_cfg_object_name_map (
mapped_object_urn varchar(350) NULL,
mapped_object_dataset_id int(11) UNSIGNED NULL,
description varchar(500) NULL,
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY(object_name,mapped_object_name)
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(object_name, mapped_object_name),
KEY idx_stg_cfg_object_name_map__mappedobjectname (mapped_object_name) USING BTREE
)
ENGINE = InnoDB
CHARACTER SET latin1
COLLATE latin1_swedish_ci
COMMENT = 'Map alias (when is_identical_map=Y) and view dependency' ;
CREATE INDEX idx_stg_cfg_object_name_map__mappedobjectname USING BTREE
ON stg_cfg_object_name_map(mapped_object_name);
CREATE TABLE cfg_object_name_map (
obj_name_map_id int(11) AUTO_INCREMENT NOT NULL,
object_type varchar(100) NOT NULL,
@ -188,36 +187,30 @@ CREATE TABLE cfg_object_name_map (
mapped_object_name varchar(350) NOT NULL COMMENT 'this is the original/parent object',
mapped_object_dataset_id int(11) UNSIGNED NULL COMMENT 'can be the abstract dataset id for versioned objects',
description varchar(500) NULL,
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY(obj_name_map_id)
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(obj_name_map_id),
KEY idx_cfg_object_name_map__mappedobjectname (mapped_object_name) USING BTREE,
CONSTRAINT uix_cfg_object_name_map__objectname_mappedobjectname UNIQUE (object_name, mapped_object_name)
)
ENGINE = InnoDB
CHARACTER SET latin1
AUTO_INCREMENT = 1
COMMENT = 'Map alias (when is_identical_map=Y) and view dependency. Always map from Derived/Child (object) back to its Original/Parent (mapped_object)' ;
ALTER TABLE cfg_object_name_map
ADD CONSTRAINT uix_cfg_object_name_map__objectname_mappedobjectname
UNIQUE (object_name, mapped_object_name);
CREATE INDEX idx_cfg_object_name_map__mappedobjectname USING BTREE
ON cfg_object_name_map(mapped_object_name);
CREATE TABLE cfg_deployment_tier (
tier_id tinyint(4) NOT NULL,
tier_code varchar(25) COMMENT 'local,dev,test,qa,stg,prod' NOT NULL,
tier_label varchar(50) COMMENT 'display full name' NULL,
sort_id smallint(6) COMMENT '3-digit for group, 3-digit within group' NOT NULL,
last_modified timestamp NOT NULL,
PRIMARY KEY(tier_id)
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(tier_id),
UNIQUE KEY uix_cfg_deployment_tier__tiercode (tier_code)
)
ENGINE = InnoDB
AUTO_INCREMENT = 0
COMMENT = 'http://en.wikipedia.org/wiki/Deployment_environment';
CREATE UNIQUE INDEX uix_cfg_deployment_tier__tiercode
ON cfg_deployment_tier(tier_code);
CREATE TABLE cfg_data_center (
data_center_id smallint(6) NOT NULL DEFAULT '0',
@ -230,16 +223,14 @@ CREATE TABLE cfg_data_center (
longtitude decimal(10,6) NULL,
latitude decimal(10,6) NULL,
data_center_status char(1) COMMENT 'A,D,U' NULL,
last_modified timestamp NULL,
PRIMARY KEY(data_center_id)
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(data_center_id),
UNIQUE KEY uix_cfg_data_center__datacentercode (data_center_code)
)
ENGINE = InnoDB
AUTO_INCREMENT = 0
COMMENT = 'https://en.wikipedia.org/wiki/Data_center' ;
CREATE UNIQUE INDEX uix_cfg_data_center__datacentercode
ON cfg_data_center(data_center_code);
CREATE TABLE cfg_cluster (
cluster_id smallint(6) NOT NULL DEFAULT '0',
@ -249,13 +240,12 @@ CREATE TABLE cfg_cluster (
deployment_tier_code varchar(25) NOT NULL,
data_center_code varchar(30) NULL,
description varchar(200) NULL,
last_modified timestamp NULL,
PRIMARY KEY(cluster_id)
last_modified timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY(cluster_id),
UNIQUE KEY uix_cfg_cluster__clustercode (cluster_code)
)
COMMENT = 'https://en.wikipedia.org/wiki/Computer_cluster' ;
CREATE UNIQUE INDEX uix_cfg_cluster__clustercode
ON cfg_cluster(cluster_code);
CREATE TABLE IF NOT EXISTS cfg_search_score_boost (
`id` INT COMMENT 'dataset id',

View File

@ -73,6 +73,7 @@ CREATE TABLE `stg_product_repo` (
`scm_type` VARCHAR(20) NOT NULL,
`repo_id` INT UNSIGNED DEFAULT NULL,
`project` VARCHAR(100) DEFAULT NULL,
`dataset_group` VARCHAR(200) DEFAULT NULL COMMENT 'dataset group name, database name, etc',
`owner_type` VARCHAR(50) DEFAULT NULL,
`owner_name` VARCHAR(300) DEFAULT NULL COMMENT 'owner names in comma separated list',
`multiproduct_name` VARCHAR(100) DEFAULT NULL,
@ -88,6 +89,7 @@ CREATE TABLE `stg_repo_owner` (
`scm_repo_fullname` VARCHAR(100) NOT NULL,
`scm_type` VARCHAR(20) NOT NULL,
`repo_id` INT DEFAULT NULL,
`dataset_group` VARCHAR(200) DEFAULT NULL COMMENT 'dataset group name, database name, etc',
`owner_type` VARCHAR(50) DEFAULT NULL COMMENT 'which acl file this owner is in',
`owner_name` VARCHAR(50) DEFAULT NULL COMMENT 'one owner name',
`sort_id` INT UNSIGNED DEFAULT NULL,
@ -96,7 +98,6 @@ CREATE TABLE `stg_repo_owner` (
) ENGINE = InnoDB DEFAULT CHARSET = latin1;
CREATE TABLE stg_database_scm_map (
`database_name` VARCHAR(100) COMMENT 'database name',
`database_type` VARCHAR(50) COMMENT 'database type',

View File

@ -101,8 +101,7 @@ class CodeSearchLoad:
ON DUPLICATE KEY UPDATE
dataset_urn = n.urn,
sort_id = COALESCE(n.n_sort_id, sort_id),
owner_type = CASE WHEN n.n_owner_type IS NULL OR owner_type >= n.n_owner_type
THEN owner_type ELSE n.n_owner_type END,
owner_type = n.n_owner_type,
owner_sub_type = COALESCE(owner_sub_type, n.n_owner_sub_type),
owner_id_type = COALESCE(owner_id_type, n.n_owner_id_type),
owner_source = CASE WHEN owner_source is null THEN 'SCM'

View File

@ -70,7 +70,30 @@ class MultiproductLoad:
FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
LINES TERMINATED BY '\n'
(`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `project`, `owner_type`, `owner_name`,
`multiproduct_name`, `product_type`, `product_version`, `namespace`)
`multiproduct_name`, `product_type`, `product_version`, `namespace`);
-- map repo to oracle or espresso database
UPDATE stg_product_repo r
INNER JOIN
(select database_type, substring_index(substring_index(scm_url, '/', 5), '/', -2) repo,
GROUP_CONCAT(database_name SEPARATOR ', ') dataset_groups
from stg_database_scm_map
where scm_type = 'git' and database_type in ('espresso', 'oracle')
group by repo, database_type) d
ON d.repo = r.scm_repo_fullname
AND r.app_id = {app_id}
SET r.dataset_group = d.dataset_groups;
-- map dali repo to dali dataset group
UPDATE stg_product_repo
SET dataset_group = REPLACE(substring_index(LEFT(scm_repo_fullname, LENGTH(scm_repo_fullname) - 9), '/', -1), '-', '_')
WHERE app_id = {app_id}
AND project IN ('dali-datasets', 'dali-base-datasets');
UPDATE stg_product_repo
SET dataset_group = concat(dataset_group, '_mp, ', dataset_group, '_mp_versioned')
WHERE app_id = {app_id}
AND project IN ('dali-datasets', 'dali-base-datasets')
'''.format(source_file=self.product_repo_file, app_id=self.app_id)
self.executeCommands(load_product_repos_cmd)
@ -86,7 +109,15 @@ class MultiproductLoad:
INTO TABLE stg_repo_owner
FIELDS TERMINATED BY '\Z' ESCAPED BY '\0'
LINES TERMINATED BY '\n'
(`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `owner_type`, `owner_name`, `sort_id`, `paths`)
(`app_id`, `wh_etl_exec_id`, `scm_repo_fullname`, `scm_type`, `repo_id`, `owner_type`, `owner_name`, `sort_id`, `paths`);
-- update dataset_group from repo
UPDATE stg_repo_owner ro
INNER JOIN stg_product_repo pr
ON ro.app_id = {app_id} AND pr.app_id = {app_id}
AND ro.scm_repo_fullname = pr.scm_repo_fullname
AND pr.dataset_group IS NOT NULL
SET ro.dataset_group = pr.dataset_group
'''.format(source_file=self.product_repo_owner_file, app_id=self.app_id)
self.executeCommands(load_product_repo_owners_cmd)
@ -109,40 +140,38 @@ class MultiproductLoad:
-- INSERT/UPDATE into dataset_owner
INSERT INTO dataset_owner (
dataset_id, dataset_urn, owner_id, sort_id, namespace, app_id, owner_type, owner_sub_type, owner_id_type,
owner_source, db_ids, is_group, is_active, source_time, created_time, wh_etl_exec_id
owner_source, db_ids, is_group, is_active, source_time, created_time, wh_etl_exec_id, confirmed_by, confirmed_on
)
SELECT * FROM (
SELECT ds.id, ds.urn, r.owner_name n_owner_id, r.sort_id n_sort_id,
'urn:li:corpuser' n_namespace, r.app_id,
IF(r.owner_type = 'main', 'Producer', r.owner_type) n_owner_type,
null n_owner_sub_type,
'Owner' n_owner_type, r.owner_type n_owner_sub_type,
case when r.app_id = 300 then 'USER' when r.app_id = 301 then 'GROUP' else null end n_owner_id_type,
'SCM' n_owner_source, null db_ids,
IF(r.app_id = 301, 'Y', 'N') is_group,
'Y' is_active, 0 source_time, unix_timestamp(NOW()) created_time, r.wh_etl_exec_id
FROM (SELECT id, urn FROM dict_dataset WHERE urn like 'dalids:///%') ds
JOIN (SELECT object_name, mapped_object_name FROM cfg_object_name_map WHERE mapped_object_type = 'scm') m
ON m.object_name = concat('/', substring_index(substring_index(ds.urn, '/', 4), '/', -1))
'Y' is_active, 0 source_time, unix_timestamp(NOW()) created_time, r.wh_etl_exec_id,
'system' confirmed_by, unix_timestamp(NOW()) confirmed_on
FROM (SELECT id, urn, substring_index(substring_index(urn, '/', 4), '/', -1) ds_group
FROM dict_dataset WHERE urn regexp '^(dalids|espresso|oracle)\:\/\/\/.*$') ds
JOIN stg_repo_owner r
ON r.scm_repo_fullname = m.mapped_object_name
ON r.owner_type in ('main', 'espresso_avsc', 'producer', 'consumer', 'global', 'public', 'private', 'database', 'root')
AND FIND_IN_SET(ds.ds_group, r.dataset_group) > 0
) n
ON DUPLICATE KEY UPDATE
dataset_urn = n.urn,
sort_id = COALESCE(n.n_sort_id, sort_id),
-- the Owner_type precedence (from high to low) is: OWNER, PRODUCER, DELEGATE, STAKEHOLDER
owner_type = CASE WHEN (
case owner_type when 'OWNER' then 20 when 'PRODUCER' then 40 when 'DELEGATE' then 60 when 'STACKHOLDER' then 80 else 100 end
) <= (
case n.n_owner_type when 'OWNER' then 20 when 'PRODUCER' then 40 when 'DELEGATE' then 60 when 'STACKHOLDER' then 80 else 100 end
)
THEN owner_type ELSE n.n_owner_type END,
-- n.n_owner_type = Owner, highest priority
owner_type = n.n_owner_type,
owner_sub_type = COALESCE(owner_sub_type, n.n_owner_sub_type),
owner_id_type = COALESCE(owner_id_type, n.n_owner_id_type),
owner_source = CASE WHEN owner_source is null THEN 'SCM'
WHEN owner_source LIKE '%SCM%' THEN owner_source ELSE CONCAT(owner_source, ',SCM') END,
namespace = COALESCE(namespace, n.n_namespace),
wh_etl_exec_id = n.wh_etl_exec_id,
modified_time = unix_timestamp(NOW());
modified_time = unix_timestamp(NOW()),
confirmed_by = 'system',
confirmed_on = unix_timestamp(NOW());
-- reset dataset owner sort id
UPDATE dataset_owner d
@ -151,7 +180,7 @@ class MultiproductLoad:
@owner_rank := IF(@current_dataset_id = dataset_id, @owner_rank + 1, 0) rank,
@current_dataset_id := dataset_id
from dataset_owner, (select @current_dataset_id := 0, @owner_rank := 0) t
where dataset_urn like 'dalids:///%'
where dataset_urn regexp '^(dalids|espresso|oracle)\:\/\/\/.*$'
order by dataset_id asc, owner_type desc, sort_id asc, owner_id asc
) s
ON d.dataset_id = s.dataset_id AND d.owner_id = s.owner_id