
--
-- Copyright 2015 LinkedIn Corp. All rights reserved.
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--
-- creation statement for Kafka event related tables
-- Gobblin:
-- + GobblinTrackingEvent: compaction
-- + GobblinTrackingEvent_Distcp_Ng: distcp
-- + GobblinTrackingEvent_Lumos: rdbms/nosql
-- Hive Metastore
-- + MetastoreTableAudit
-- + MetastorePartitionAudit
-- Mapping {Kafka topic => stg table} is loaded in
-- wherehows-backend/app/actors/KafkaConsumerMaster.java
-- Avro schemas of the Kafka event are available in
-- wherehows-data-model/avro
-- Staging table for Gobblin tracking event: compaction.
-- One row per (dataset, cluster, event time), loaded from the
-- GobblinTrackingEvent Kafka topic (topic => table mapping lives in
-- wherehows-backend/app/actors/KafkaConsumerMaster.java).
CREATE TABLE IF NOT EXISTS `stg_kafka_gobblin_compaction` (
  `cluster`           VARCHAR(20)  NOT NULL,      -- source cluster identifier
  `dataset`           VARCHAR(100) NOT NULL,      -- dataset being compacted
  `partition_type`    VARCHAR(20)  DEFAULT NULL,
  `partition_name`    VARCHAR(50)  DEFAULT NULL,
  `record_count`      BIGINT(20)   DEFAULT NULL,  -- records in compacted output
  `late_record_count` BIGINT(20)   DEFAULT NULL,  -- records arriving after the window
  `dedupe_status`     VARCHAR(20)  DEFAULT NULL,
  `job_context`       VARCHAR(50)  DEFAULT NULL,
  `project_name`      VARCHAR(50)  DEFAULT NULL,
  `flow_name`         VARCHAR(100) DEFAULT NULL,
  `job_name`          VARCHAR(100) DEFAULT NULL,
  -- NOTE(review): flow execution ids are often epoch-based and can exceed
  -- INT range; confirm INT(11) is wide enough for the producing system.
  `flow_exec_id`      INT(11)      DEFAULT NULL,
  `log_event_time`    BIGINT(20)   NOT NULL,      -- event timestamp (presumably epoch millis -- TODO confirm)
  PRIMARY KEY (`dataset`, `cluster`, `log_event_time`)
)
ENGINE=InnoDB
DEFAULT CHARSET=latin1;
-- Staging table for Gobblin tracking event: lumos (RDBMS/NoSQL ingestion).
-- One row per (dataset, cluster, event time); tracks the high-water mark
-- (max_data_date_epoch3 / max_data_key) of each loaded partition.
CREATE TABLE IF NOT EXISTS `stg_kafka_gobblin_lumos` (
  `cluster`               VARCHAR(20)  NOT NULL,
  `dataset`               VARCHAR(100) NOT NULL,
  `location`              VARCHAR(200) NOT NULL,      -- physical location of the loaded data
  `partition_type`        VARCHAR(20)  DEFAULT NULL,
  `partition_name`        VARCHAR(50)  DEFAULT NULL,
  `subpartition_type`     VARCHAR(20)  DEFAULT NULL,
  `subpartition_name`     VARCHAR(50)  DEFAULT NULL,
  `max_data_date_epoch3`  BIGINT(20)   DEFAULT NULL,  -- suffix "epoch3" suggests epoch millis -- TODO confirm
  `max_data_key`          BIGINT(20)   DEFAULT NULL,  -- max key value seen in the load
  `record_count`          BIGINT(20)   DEFAULT NULL,
  `source_datacenter`     VARCHAR(10)  DEFAULT NULL,
  `source_deployment_env` VARCHAR(10)  DEFAULT NULL,
  `source_database`       VARCHAR(50)  DEFAULT NULL,
  `source_table`          VARCHAR(50)  DEFAULT NULL,
  `job_context`           VARCHAR(50)  DEFAULT NULL,
  `project_name`          VARCHAR(100) DEFAULT NULL,
  `flow_name`             VARCHAR(100) DEFAULT NULL,
  `job_name`              VARCHAR(100) DEFAULT NULL,
  -- NOTE(review): flow execution ids are often epoch-based and can exceed
  -- INT range; confirm INT(11) is wide enough for the producing system.
  `flow_exec_id`          INT(11)      DEFAULT NULL,
  `log_event_time`        BIGINT(20)   NOT NULL,
  PRIMARY KEY (`dataset`, `cluster`, `log_event_time`)
)
ENGINE=InnoDB
DEFAULT CHARSET=latin1;
-- Staging table for Gobblin tracking event: distcp_ng (HDFS copy).
-- One row per (dataset, cluster, partition, event time); records the
-- source/target paths of each copied partition.
CREATE TABLE IF NOT EXISTS `stg_kafka_gobblin_distcp` (
  `cluster`           VARCHAR(20)  NOT NULL,
  `dataset`           VARCHAR(100) NOT NULL,
  `partition_type`    VARCHAR(20)  DEFAULT NULL,
  `partition_name`    VARCHAR(50)  NOT NULL,       -- part of the primary key, hence NOT NULL
  -- NOTE(review): "upsteam" is a typo of "upstream", but the name is kept
  -- because the Kafka consumer/loader code maps to this exact column;
  -- renaming requires a coordinated change in wherehows-backend.
  `upsteam_timestamp` BIGINT(20)   DEFAULT NULL,
  `origin_timestamp`  BIGINT(20)   DEFAULT NULL,
  `source_path`       VARCHAR(200) DEFAULT NULL,
  `target_path`       VARCHAR(200) DEFAULT NULL,
  `job_context`       VARCHAR(50)  DEFAULT NULL,
  `project_name`      VARCHAR(100) DEFAULT NULL,
  `flow_name`         VARCHAR(100) DEFAULT NULL,
  `job_name`          VARCHAR(100) DEFAULT NULL,
  -- NOTE(review): flow execution ids are often epoch-based and can exceed
  -- INT range; confirm INT(11) is wide enough for the producing system.
  `flow_exec_id`      INT(11)      DEFAULT NULL,
  `log_event_time`    BIGINT(20)   NOT NULL,
  PRIMARY KEY (`dataset`, `cluster`, `partition_name`, `log_event_time`)
)
ENGINE=InnoDB
DEFAULT CHARSET=latin1;
-- Staging table for Hive Metastore audit events; receives both
-- MetastoreTableAudit and MetastorePartitionAudit (distinguished by event_type).
-- old_info / new_info hold the serialized before/after object state, so they
-- use utf8 while the rest of the table stays latin1.
CREATE TABLE IF NOT EXISTS `stg_kafka_metastore_audit` (
  `server`               VARCHAR(20)  NOT NULL,
  `instance`             VARCHAR(20)  NOT NULL,
  `app_name`             VARCHAR(50)  NOT NULL,
  `event_name`           VARCHAR(50)  NOT NULL,
  `event_type`           VARCHAR(30)  NOT NULL,      -- e.g. table vs. partition audit; part of the PK
  `log_event_time`       BIGINT(20)   NOT NULL,
  `metastore_thrift_uri` VARCHAR(200) DEFAULT NULL,
  `metastore_version`    VARCHAR(20)  DEFAULT NULL,
  `is_successful`        VARCHAR(5)   DEFAULT NULL,  -- string-encoded boolean from the Avro event
  `is_data_deleted`      VARCHAR(5)   DEFAULT NULL,  -- string-encoded boolean from the Avro event
  `db_name`              VARCHAR(100) NOT NULL,
  `table_name`           VARCHAR(100) NOT NULL,
  `time_partition`       VARCHAR(100) NOT NULL,
  `location`             VARCHAR(200) DEFAULT NULL,
  `owner`                VARCHAR(100) DEFAULT NULL,
  `create_time`          BIGINT(20)   DEFAULT NULL,
  `last_access_time`     BIGINT(20)   DEFAULT NULL,
  -- NOTE(review): consider utf8mb4 for full Unicode coverage; utf8 (3-byte)
  -- is kept here to match the original deployment.
  `old_info`             MEDIUMTEXT CHARACTER SET utf8 DEFAULT NULL,
  `new_info`             MEDIUMTEXT CHARACTER SET utf8 DEFAULT NULL,
  PRIMARY KEY (`db_name`, `table_name`, `time_partition`, `instance`, `log_event_time`, `event_type`)
)
ENGINE=InnoDB
DEFAULT CHARSET=latin1;
-- Combine multiple data log status from Kafka events into a status table
-- The session time zone MUST be set before the CREATE TABLE below: its
-- partition boundaries are computed with unix_timestamp(), whose result
-- depends on the session time zone.
SET TIME_ZONE='US/Pacific'; -- this needs to be customized based on your time zone
SELECT @@session.time_zone, current_timestamp; -- sanity-check echo of the effective zone
-- Tracks load/publish operations per dataset instance, range-partitioned by
-- month on data_time_epoch. Boundary values use unix_timestamp() and are
-- therefore evaluated in the session time zone set above.
CREATE TABLE IF NOT EXISTS log_dataset_instance_load_status (
  dataset_id          int(11) UNSIGNED NOT NULL DEFAULT '0',
  db_id               smallint(6) NOT NULL DEFAULT '0',
  dataset_type        varchar(30) COMMENT 'hive,teradata,oracle,hdfs...' NOT NULL,
  dataset_native_name varchar(200) NOT NULL,
  operation_type      varchar(50) COMMENT 'load, merge, compact, update, delete' NULL,
  partition_grain     varchar(30) COMMENT 'snapshot, delta, daily, monthly...' NOT NULL,
  partition_expr      varchar(500) COMMENT 'partition name or expression' NOT NULL,
  data_time_expr      varchar(20) COMMENT 'datetime literal of the data datetime' NOT NULL,
  -- NOTE(review): int(11) epoch seconds overflow in 2038; consider widening
  -- both epoch columns to bigint in a future migration.
  data_time_epoch     int(11) COMMENT 'epoch second of the data datetime' NOT NULL,
  record_count        bigint(20) NULL,
  size_in_byte        bigint(20) NULL,
  log_time_epoch      int(11) COMMENT 'When data is loaded or published' NOT NULL,
  ref_dataset_type    varchar(30) COMMENT 'Refer to the underlying dataset' NULL,
  ref_db_id           int(11) COMMENT 'Refer to db of the underlying dataset' NULL,
  ref_uri             varchar(300) COMMENT 'Table name or HDFS location' NULL,
  last_modified       timestamp NULL,
  PRIMARY KEY(dataset_id,db_id,data_time_epoch,partition_grain,partition_expr),
  KEY(dataset_native_name),
  KEY(ref_uri)
)
ENGINE = InnoDB
CHARACTER SET latin1
COMMENT = 'Capture the load/publish ops for dataset instance'
PARTITION BY RANGE COLUMNS (data_time_epoch)
( PARTITION P201601 VALUES LESS THAN (unix_timestamp(date'2016-02-01')),
  PARTITION P201602 VALUES LESS THAN (unix_timestamp(date'2016-03-01')),
  PARTITION P201603 VALUES LESS THAN (unix_timestamp(date'2016-04-01')),
  PARTITION P201604 VALUES LESS THAN (unix_timestamp(date'2016-05-01')),
  PARTITION P201605 VALUES LESS THAN (unix_timestamp(date'2016-06-01')),
  PARTITION P201606 VALUES LESS THAN (unix_timestamp(date'2016-07-01')),
  PARTITION P201607 VALUES LESS THAN (unix_timestamp(date'2016-08-01')),
  PARTITION P201608 VALUES LESS THAN (unix_timestamp(date'2016-09-01')),
  PARTITION P201609 VALUES LESS THAN (unix_timestamp(date'2016-10-01')),
  PARTITION P201610 VALUES LESS THAN (unix_timestamp(date'2016-11-01')),
  PARTITION P201611 VALUES LESS THAN (unix_timestamp(date'2016-12-01')),
  PARTITION P201612 VALUES LESS THAN (unix_timestamp(date'2017-01-01')),
  -- Catch-all for 2017-01-01 .. 2035-08-01 (monthly partitions for this span
  -- were never generated; rows land here).
  PARTITION P203507 VALUES LESS THAN (unix_timestamp(date'2035-08-01')),
  -- Final catch-all so inserts beyond 2035-08-01 do not fail with
  -- "Table has no partition for value".
  PARTITION P_MAX VALUES LESS THAN (MAXVALUE)
) ;