# # Copyright 2015 LinkedIn Corp. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # import datetime import FileUtil import json import os import re import sys from com.ziclix.python.sql import zxJDBC from org.slf4j import LoggerFactory from wherehows.common.schemas import SampleDataRecord from wherehows.common.writers import FileWriter from wherehows.common import Constant class TeradataExtract: def __init__(self): self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__) def get_view_info(self, database_name, view_name): """ :param database_name: :param view_name: :return: """ view_cols = [] curs_vw = self.conn_td.cursor() view_sql = ''' SELECT Trim(DatabaseName) DatabaseName, Trim(TableName) TableName, RequestText, CreateTimestamp(CHAR(19)), LastAlterTimestamp(CHAR(19)), AccessCount FROM DBC.Tables WHERE DatabaseName = '%s' AND TableName NOT LIKE ALL ('!_%%', '#%%', 'TMP%%', 'TEMP%%') ESCAPE '!' AND TableKind = 'V' ORDER BY 2''' % database_name if not view_name is None: view_sql = view_sql + ''' AND TableName = '%s' ''' % view_name curs_vw.execute(view_sql) views = curs_vw.fetchall() for vw in views: try: # table_sql = 'CREATE VOLATILE TABLE vt_get_column_info AS (SELECT * FROM %s.%s) WITH NO DATA NO PRIMARY INDEX' % (vw[0], vw[1]) help_column_sql = 'help column %s.%s.*' % (vw[0], vw[1]) curs_vw.execute(help_column_sql) rows = curs_vw.fetchall() except Exception, e: # print str(e), vw[1], len(views) continue for r in rows: column_name = r[0].strip() data_type = r[1].strip() nullable = r[2].strip() format = r[3].strip() max_length = r[4] & 0xffff decimal_total_digits = r[5] & 0xffff if r[5] else None decimal_fractional_digits = r[6] & 0xffff if r[6] else (0 if r[5] else None) char_type = r[16] & 0xffff if r[16] else None if data_type == 'I1': data_type = 'BYTEINT' elif data_type == 'I2': data_type = 'SMALLINT' elif data_type == 'I': data_type = 'INT' elif data_type == 'F': data_type = 'FLOAT' elif data_type == 'I8': data_type = 'BIGINT' elif data_type == 'DA': data_type = 'DATE' elif data_type == 'AT': data_type = 'TIME' elif data_type == 'TS': data_type = 'TIMESTAMP' elif data_type == 'SZ': data_type = 'TIMESTAMP WITH TIMEZONE' elif data_type == 'TZ': data_type = 'TIME WITH TIMEZONE' elif data_type == 'CO': data_type = 'CLOB' elif data_type == 'CF': data_type = 'CHAR(' + str(max_length / char_type) + ')' elif data_type == 'CV': data_type = 'VARCHAR(' + str(max_length / char_type) + ')' elif data_type == 'BV': data_type = 'VARBYTE(' + str(max_length) + ')' elif data_type == 'D': data_type = 'DECIMAL(' + str(decimal_total_digits) + ',' + str(decimal_fractional_digits) + ')' if data_type not in ['DATE', 'TIME', 'TIMESTAMP', 'TIME WITH TIMEZONE', 'TIMESTAMP WITH TIMEZONE']: format = '' view_cols.append(( vw[0], vw[1], vw[2], str(vw[3]), str(vw[4]), vw[5], column_name, format, nullable, data_type, max_length, decimal_total_digits, decimal_fractional_digits)) curs_vw.close() return view_cols def get_table_info(self, database_name, table_name): """ get table, column info from teradata DBC.Tables :param database_name: :param table_name: not used in common case :return: """ td_table_name_filter = '' if not database_name is None: td_database_name = database_name elif len(table_name) > 0: if table_name.find('.') > 0: (td_database_name, td_table_name) = table_name.split('.') else: td_database_name = self.default_database td_table_name = table_name td_table_name_filter = "AND a.TableName = '%s' " % td_table_name curs_td = self.conn_td.cursor() col_stats_sql = """SELECT c.DatabaseName, c.TableName2 TableName, c.CreateTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) createTimestamp, c.LastAlterTimestamp (CHAR(19)) TableLastAlterTimestamp, c.LastAccessTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) LastAccessTimestamp, c.TableAccessCount, RTRIM(a.columnname) ColumnName, CASE WHEN a.columntype IN ('DA','AT','TS','SZ','TZ') THEN RTrim(a.ColumnFormat) ELSE NULL END ColumnFormat, a.DefaultValue, a.nullable, a.LastAccessTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) LastAccessTimestamp, a.AccessCount, b.UniqueValueCount (bigint) UniqueValueCount, b.LastCollectTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) LastCollectTimestamp, CASE WHEN a.columntype = 'I1' THEN 'BYTEINT' WHEN a.columntype = 'I2' THEN 'SMALLINT' WHEN a.columntype = 'I' THEN 'INT' WHEN a.columntype = 'F' THEN 'FLOAT' WHEN a.columntype = 'I8' THEN 'BIGINT' WHEN a.columntype = 'DA' THEN 'DATE' WHEN a.columntype = 'AT' THEN 'TIME' WHEN a.columntype = 'TS' THEN 'TIMESTAMP' WHEN a.columntype = 'SZ' THEN 'TIMESTAMP WITH TIMEZONE' WHEN a.columntype = 'TZ' THEN 'TIME WITH TIMEZONE' WHEN a.columntype = 'CO' THEN 'CLOB' WHEN a.columntype = 'BV' THEN 'VARBYTE(' || ColumnLength || ')' WHEN a.columntype = 'CF' THEN 'CHAR('|| TRIM(a.ColumnLength/a.CharType) || ')' WHEN a.columntype = 'CV' THEN 'VARCHAR('|| TRIM(a.ColumnLength/a.CharType) || ')' WHEN a.columntype = 'D' THEN 'DECIMAL(' || TRIM(a.DecimalTotalDigits) || ',' || TRIM(a.DecimalFractionalDigits) || ')' END Data_Type, a.ColumnLength, a.DecimalTotalDigits, a.DecimalFractionalDigits, a.ColumnId Column_Id, RTrim(c.CreatorName) CreatorName, RTrim(c.TableName) OriginalTableName FROM ( select RTrim(a.DatabaseName) DatabaseName, case when regexp_similar(a.tableName, '[[:alnum:]_]+[[:digit:]]{8}([^[:digit:]]|$).*', 'c') = 1 then case when regexp_substr(a.TableName, '([[:digit:]]{8})([^[:digit:]]|$)') between '20000101' and '20991231' then rtrim(regexp_replace(a.tableName, '([[:digit:]]{8})', '${YYYYMMDD}', 1, 1, 'c')) when regexp_substr(a.TableName, '([[:digit:]]{8})([^[:digit:]]|$)') between '01012000' and '12312099' then rtrim(regexp_replace(a.tableName, '([[:digit:]]{8})', '${MMDDYYYY}', 1, 1, 'c')) else RTRIM(a.tablename) end when regexp_similar(a.tableName, '[[:alnum:]_]+[[:digit:]]{4}_[[:digit:]]{2}([^[:digit:]]|$).*', 'c') = 1 and regexp_substr(a.TableName, '([[:digit:]]{4})_[[:digit:]]{2}([^[:digit:]]|$)') between '2000_01' and '9999_12' then rtrim(regexp_replace(a.tableName, '([[:digit:]]{4}_[[:digit:]]{2})', '${YYYY_MM}', 1, 1, 'c')) when regexp_similar(a.tableName, '[[:alnum:]_]+[[:digit:]]{6}([^[:digit:]]|$).*', 'c') = 1 and regexp_substr(a.TableName, '([[:digit:]]{6})([^[:digit:]]|$)') between '200001' and '999912' then rtrim(regexp_replace(a.tableName, '([[:digit:]]{6})', '${YYYYMM}', 1, 1, 'c')) else RTRIM(a.tablename) end TableName2, a.TableName, a.CreateTimestamp, a.LastAlterTimestamp, a.LastAccessTimestamp, a.AccessCount TableAccessCount, a.CreatorName from DBC.Tables a where a.TableKind IN ('T', 'O') AND a.DatabaseName = '%s' %s AND a.TableName NOT LIKE ALL ('INFA%%', 'tmp!_%%', 'temp!_%%', '!_%%', '#%%' 'ET!_%%', 'LS!_%%', 'VT!_%%', 'LOGTABLE%%', 'backup%%', 'bkp%%', 'W!_%%') ESCAPE '!' AND RTRIM(a.TableName) NOT LIKE ALL ('%%!_tmp', '%%!_temp', '%%!_ERR!_.', '%%!_bkp', '%%!_backup') ESCAPE '!' AND REGEXP_SIMILAR(RTRIM(a.TableName), '.*_tmp_[0-9]+','i') = 0 AND REGEXP_SIMILAR(RTRIM(a.TableName), '.*_tmp[0-9]+','i') = 0 QUALIFY RANK() OVER (PARTITION BY DatabaseName, TableName2 ORDER BY a.TableName desc) = 1 ) c JOIN DBC.Columns a ON (c.databasename = a.databasename AND c.tablename = a.tablename) LEFT OUTER JOIN DBC.ColumnStatsV b ON (a.databasename = b.databasename AND a.tablename = b.tablename AND a.columnname = b.columnname) ORDER BY 1, 2, a.ColumnId """ % (td_database_name, td_table_name_filter) curs_td.execute(col_stats_sql) rows = curs_td.fetchall() curs_td.close() return rows def get_extra_table_info(self, database_name): """ Index, Partition, Size info :param database_name: :return: size, partition, indice """ table_size_sql = """select RTrim(TableName), cast(sum(CurrentPerm)/1024/1024 as BIGINT) size_in_mb from DBC.TableSize where DatabaseName = '%s' AND TableName NOT LIKE ALL ('INFA%%', 'tmp!_%%', 'temp!_%%', '!_%%', '#%%' 'ET!_%%', 'LS!_%%', 'VT!_%%', 'LOGTABLE%%', 'backup%%', 'bkp%%', 'W!_%%') ESCAPE '!' AND RTRIM(TableName) NOT LIKE ALL ('%%!_tmp', '%%!_temp', '%%!_ERR!_.', '%%!_bkp', '%%!_backup') ESCAPE '!' AND REGEXP_SIMILAR(RTRIM(TableName), '.*_tmp_[0-9]+','i') = 0 AND REGEXP_SIMILAR(RTRIM(TableName), '.*_tmp[0-9]+','i') = 0 group by 1 order by 1 """ % (database_name) table_index_sql = """select RTrim(TableName), IndexNumber, IndexType, UniqueFlag, IndexName, RTrim(ColumnName), ColumnPosition, AccessCount from DBC.Indices where DatabaseName = '%s' order by TableName, IndexNumber, ColumnPosition""" % (database_name) table_partition_sql = """select RTrim(TableName), ConstraintText from DBC.IndexConstraints where DatabaseName = '%s' and ConstraintType = 'Q' order by TableName""" % (database_name) extra_table_info = {} curs_td = self.conn_td.cursor() # curs_td.execute("SET QUERY_BAND = 'script=%s; pid=%d; hostname=%s; task=extra_table_info;' FOR SESSION" % (os.path.basename(__file__), os.getpid(), os.getenv('HOSTNAME'))) # get size index and partition info one by one curs_td.execute(table_size_sql) rows = curs_td.fetchall() for row in rows: full_table_name = database_name + '.' + row[0] extra_table_info[full_table_name] = {'size_in_mb': row[1], 'partitions': [], 'indices': []} curs_td.execute(table_partition_sql) rows = curs_td.fetchall() for row in rows: full_table_name = database_name + '.' + row[0] if full_table_name not in extra_table_info: continue search_result = re.search('CHECK \(/\*([0-9]+)\*/ (.*)\)$', row[1], re.IGNORECASE) partition_level = 1 if search_result: partition_level = int(search_result.group(1)) partition_info = search_result.group(2).replace("\r", "").replace(") IS NOT NULL AND ", ")\n").replace( ") IS NOT NULL", ")") extra_table_info[full_table_name]['partitions'] = partition_info.split("\n") search_result = re.search('CHECK \(\((.*)\) BETWEEN [0-9]+ AND [0-9]+\)$', row[1], re.IGNORECASE) if search_result: partition_info = search_result.group(1).replace("\r", "") extra_table_info[full_table_name]['partitions'] = [partition_info] curs_td.execute(table_index_sql) rows = curs_td.fetchall() table_count = 0 current_table_name = '' full_table_name = '' for row in rows: if current_table_name <> row[0]: if table_count > 0: # finish previous table's last index indices[-1]['column_list'] = column_list if full_table_name in extra_table_info: extra_table_info[full_table_name]['indices'] = indices full_table_name = database_name + '.' + row[0] if full_table_name not in extra_table_info: continue table_count += 1 current_table_name = row[0] current_index_number = 0 indices = [] if current_index_number <> row[1]: if current_index_number > 0: indices[-1]['column_list'] = column_list # new index current_index_number = row[1] indices.append( {'index_number': row[1], 'index_type': index_type[row[2]], 'is_unique': row[3], 'index_name': row[4], 'access_count': row[7], 'column_list': ''}) column_list = row[5] else: column_list += ", %s" % row[5] if len(indices) > 0: indices[-1]['column_list'] = column_list if full_table_name in extra_table_info: extra_table_info[full_table_name]['indices'] = indices return extra_table_info def format_view_metadata(self, rows, schema): """ add view info from rows into schema note : view's original name is the same as full name :param rows: :param schema: :return: """ db_dict = {} table_dict = {} for row in rows: if row[0] not in db_dict: schema.append({'database': row[0], 'type': 'Teradata', 'views': []}) db_dict[row[0]] = len(schema) - 1 db_idx = db_dict[row[0]] full_name = row[0] + '.' + row[1] ref_table_list = [] ref_table_list = set(re.findall(r"\s+FROM\s+(\w+\.\w+)[\s,;]", row[2], re.DOTALL | re.IGNORECASE)) search_result = set(re.findall(r"\s+JOIN\s+(\w+\.\w+)[\s,;]", row[2], re.DOTALL | re.IGNORECASE)) ref_table_list = list(set(ref_table_list) | set(search_result)) if full_name not in table_dict: schema[db_idx]['views'].append( {'name': row[1], 'type': 'View', 'createTime': row[3], 'lastAlterTime': row[4], 'accessCount': row[5], 'referenceTables': ref_table_list, 'viewSqlText': row[2].replace("\r", "\n"), 'columns': [], 'original_name': full_name}) table_dict[full_name] = len(schema[db_idx]['views']) - 1 table_idx = table_dict[full_name] schema[db_idx]['views'][table_idx]['columns'].append( {'name': row[6], 'nullable': row[8], 'dataType': row[9], 'maxByteLength': row[10], 'precision': row[11], 'scale': row[12]}) column_idx = len(schema[db_idx]['views'][table_idx]['columns']) - 1 if row[7]: schema[db_idx]['views'][table_idx]['columns'][column_idx]['columnFormat'] = row[7].strip() self.logger.info("%s %6d views with %6d columns processed for %12s" % ( datetime.datetime.now(), table_idx + 1, len(rows), row[0])) def format_table_metadata(self, rows, schema): """ add table info from rows into schema :param rows: input. each row is a database with all it's tables :param schema: {database : _, type : _, tables : ['name' : _, ... 'original_name' : _] } :return: """ db_dict = {} table_dict = {} db_idx = len(schema) - 1 table_idx = -1 for row in rows: if row[0] not in db_dict: schema.append({'database': row[0], 'type': 'Teradata', 'tables': []}) db_idx += 1 db_dict[row[0]] = db_idx extra_table_info = self.get_extra_table_info(row[0]) full_name = '' if row[0]: full_name = row[0] if row[1]: full_name += '.' + row[1] elif row[1]: full_name = row[1] # full_name = row[0] + '.' + row[1] original_name = row[0] + '.' + row[20] if original_name not in extra_table_info: self.logger.error('ERROR : {0} not in extra_table_info!'.format(original_name)) continue if full_name not in table_dict: schema[db_idx]['tables'].append( {'name': row[1], 'type': 'Table', 'createTime': row[2], 'lastAlterTime': row[3], 'lastAccessTime': row[4], 'accessCount': row[5], 'owner': row[19], 'sizeInMbytes': extra_table_info[original_name]['size_in_mb'], 'partitions': extra_table_info[original_name]['partitions'], 'indices': extra_table_info[original_name]['indices'], 'columns': [], 'original_name': original_name}) table_idx += 1 table_dict[full_name] = table_idx # print "%6d: %s: %s" % (table_idx, full_name, str(schema[db_idx]['tables'][table_idx])) schema[db_idx]['tables'][table_idx]['columns'].append( {'name': row[6], 'nullable': row[9], 'lastAccessTime': row[8], 'accessCount': row[11] & 0xffff if row[11] else None, 'dataType': row[14] if row[14] else 'N/A', 'maxByteLength': row[15] & 0xffff, 'precision': row[16] & 0xffff if row[16] else None, 'scale': row[17] & 0xffff if row[17] else None}) column_idx = len(schema[db_idx]['tables'][table_idx]['columns']) - 1 if not row[8] is None: schema[db_idx]['tables'][table_idx]['columns'][column_idx]['defaultValue'] = row[8] if not row[7] is None: schema[db_idx]['tables'][table_idx]['columns'][column_idx]['columnFormat'] = row[7].strip() if not row[12] is None: schema[db_idx]['tables'][table_idx]['columns'][column_idx]['statistics'] = { 'uniqueValueCount': row[12] & 0xffff, 'lastStatsCollectTime': str(row[13])} self.logger.info("%s %6d tables with %6d columns processed for %12s" % ( datetime.datetime.now(), table_idx + 1, len(rows), row[0])) def get_sample_data(self, database_name, table_name): """ find the reference dataset (if it has), select top 10 from teradata :return: (reference_urn, json of sample data) """ fullname = '' columns = [] rows_data = [] # doesn't have permission for these databases, fetch sample from DWH_STG's correspond tables if database_name in ['DWH_DIM', 'DWH_FACT', 'DWH_TRK', 'DWH_AGG', 'DWH_CPY', 'DWH_MSTR', 'DWH_SEC']: fullname = 'DWH_STG."' + table_name + '"' else: fullname = database_name + '."' + table_name + '"' sql = 'LOCK ROW FOR ACCESS SELECT top 10 * FROM ' + fullname curs_td = self.conn_td.cursor() rows = [] try: curs_td.execute(sql) rows = curs_td.fetchall() for i, value in enumerate(rows[0]): columns.append(curs_td.description[i][0]) for r in rows: row_data = [] # encode each field to a new value for i, value in enumerate(r): new_value = unicode(value, errors='ignore') if isinstance(value, bytearray): new_value = ''.join(format(x, '02x') for x in value) elif value is None: new_value = '' row_data.append(new_value) rows_data.append(row_data) except Exception, e: self.logger.error('sql : ' + sql) if len(rows) == 0: self.logger.error("dataset {0} is empty".format(fullname)) else: self.logger.error("dataset {0} is not accessible.".format(fullname)) self.logger.error('result : ' + str(rows)) self.logger.error(e) pass ref_urn = 'teradata:///' + fullname.replace('.', '/').replace('"', '') data_with_column = map(lambda x:dict(zip(columns, x)), rows_data) return ref_urn, json.dumps({'sample': data_with_column}) def run(self, database_name, table_name, schema_output_file, sample_output_file, sample=True): """ The entrance of the class, extract schema and sample data Notice the database need to have a order that the databases have more info (DWH_STG) should be scaned first. :param database_name: :param table_name: :param schema_output_file: :return: """ cur = self.conn_td.cursor() schema = [] f_log = open(self.log_file, "a") schema_json = open(schema_output_file, 'wb') os.chmod(schema_output_file, 0666) if database_name is None and table_name is None: # default route: process everything for database_name in self.databases: self.logger.info("Collecting tables in database : " + database_name) # table info rows = [] begin = datetime.datetime.now().strftime("%H:%M:%S") rows.extend(self.get_table_info(database_name, table_name)) if len(rows) > 0: self.format_table_metadata(rows, schema) end = datetime.datetime.now().strftime("%H:%M:%S") f_log.write("Get table info %12s [%s -> %s]\n" % (database_name, str(begin), str(end))) # view info rows = [] begin = datetime.datetime.now().strftime("%H:%M:%S") rows.extend(self.get_view_info(database_name, table_name)) if len(rows) > 0: self.format_view_metadata(rows, schema) end = datetime.datetime.now().strftime("%H:%M:%S") f_log.write("Get view info %12s [%s -> %s]\n" % (database_name, str(begin), str(end))) scaned_dict = {} # a cache of {name : {urn : _, data : _}} to avoid repeat computing if sample: self.logger.info("Start collecting sample data.") open(sample_output_file, 'wb') os.chmod(sample_output_file, 0666) sample_file_writer = FileWriter(sample_output_file) # collect sample data for onedatabase in schema: database_name = onedatabase['database'] if 'tables' in onedatabase: alltables = onedatabase['tables'] else: alltables = onedatabase['views'] for onetable in alltables: table_name = onetable['original_name'].split('.')[1] if table_name in scaned_dict: sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name, scaned_dict[table_name]['ref_urn'], scaned_dict[table_name]['data']) else: (ref_urn, sample_data) = self.get_sample_data(database_name, table_name) sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name, '', sample_data) scaned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data} sample_file_writer.append(sample_record) sample_file_writer.close() # print 'byte size of schema : ' + str(sys.getsizeof(schema)) schema_json.write(json.dumps(schema, indent=None) + '\n') cur.close() schema_json.close() f_log.close() if __name__ == "__main__": args = sys.argv[1] # connection username = args[Constant.TD_DB_USERNAME_KEY] password = args[Constant.TD_DB_PASSWORD_KEY] JDBC_DRIVER = args[Constant.TD_DB_DRIVER_KEY] JDBC_URL = args[Constant.TD_DB_URL_KEY] e = TeradataExtract() e.conn_td = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER) do_sample = False if Constant.TD_LOAD_SAMPLE in args: do_sample = FileUtil.parse_bool(args[Constant.TD_LOAD_SAMPLE], False) if datetime.datetime.now().strftime('%a') not in args[Constant.TD_COLLECT_SAMPLE_DATA_DAYS]: do_sample = False temp_dir = FileUtil.etl_temp_dir(args, "TERADATA") try: e.conn_td.cursor().execute( "SET QUERY_BAND = 'script=%s; pid=%d; ' FOR SESSION;" % ('TeradataExtract.py', os.getpid())) e.conn_td.commit() e.log_file = os.path.join(temp_dir, args[Constant.TD_LOG_KEY]) e.databases = args[Constant.TD_TARGET_DATABASES_KEY].split(',') e.default_database = args[Constant.TD_DEFAULT_DATABASE_KEY] index_type = {'P': 'Primary Index', 'K': 'Primary Key', 'S': 'Secondary Index', 'Q': 'Partitioned Primary Index', 'J': 'Join Index', 'U': 'Unique Index'} schema_output_file = os.path.join(temp_dir, args[Constant.TD_SCHEMA_OUTPUT_KEY]) sample_output_file = os.path.join(temp_dir, args[Constant.TD_SAMPLE_OUTPUT_KEY]) e.run(None, None, schema_output_file, sample_output_file, sample=do_sample) finally: e.conn_td.close()