#
# Copyright 2015 LinkedIn Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#

import datetime
import json
import os
import re
import sys

import FileUtil

from com.ziclix.python.sql import zxJDBC
from org.slf4j import LoggerFactory
from wherehows.common.schemas import SampleDataRecord
from wherehows.common.writers import FileWriter
from wherehows.common import Constant


class TeradataExtract:
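  """
  Extract Teradata schema metadata (tables, views, columns, indexes,
  partitions, sizes) from the DBC dictionary views, and optionally sample
  the top 10 rows of each dataset. Output is written as JSON records for
  the WhereHows ETL to load.
  """
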
  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)

  def get_view_info(self, database_name, view_name):
    """
    Fetch view definitions and column metadata (via HELP COLUMN) from DBC.Tables.
    :param database_name: database to scan
    :param view_name: restrict to a single view; scan all views when None
    :return: list of column-level tuples, one per view column
    """
    view_cols = []
    curs_vw = self.conn_td.cursor()
    view_sql = '''
      SELECT Trim(DatabaseName) DatabaseName, Trim(TableName) TableName, RequestText,
             CreateTimestamp (CHAR(19)), LastAlterTimestamp (CHAR(19)), AccessCount
      FROM DBC.Tables
      WHERE DatabaseName = '%s'
        AND TableName NOT LIKE ALL ('!_%%', '#%%', 'TMP%%', 'TEMP%%') ESCAPE '!'
        AND TableKind = 'V' ''' % database_name

    if view_name is not None:
      # append the extra filter before ORDER BY so the final SQL stays valid
      view_sql += ''' AND TableName = '%s' ''' % view_name
    view_sql += ' ORDER BY 2'

    curs_vw.execute(view_sql)
    views = curs_vw.fetchall()

    for vw in views:
      try:
        help_column_sql = 'help column %s.%s.*' % (vw[0], vw[1])
        curs_vw.execute(help_column_sql)
        rows = curs_vw.fetchall()
      except Exception, e:
        # some views reference dropped or inaccessible objects and fail HELP COLUMN; skip them
        continue

      for r in rows:
        column_name = r[0].strip()
        data_type = r[1].strip()
        nullable = r[2].strip()
        format = r[3].strip()
        max_length = r[4] & 0xffff
        decimal_total_digits = r[5] & 0xffff if r[5] else None
        decimal_fractional_digits = r[6] & 0xffff if r[6] else (0 if r[5] else None)
        char_type = r[16] & 0xffff if r[16] else None
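
        # HELP COLUMN returns Teradata's internal type codes; translate them to
        # readable SQL type names. CHAR/VARCHAR lengths are reported in bytes,
        # so the code divides by CharType (1 for LATIN, 2 for UNICODE) to
        # recover the declared character length.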
        if data_type == 'I1':
          data_type = 'BYTEINT'
        elif data_type == 'I2':
          data_type = 'SMALLINT'
        elif data_type == 'I':
          data_type = 'INT'
        elif data_type == 'F':
          data_type = 'FLOAT'
        elif data_type == 'I8':
          data_type = 'BIGINT'
        elif data_type == 'DA':
          data_type = 'DATE'
        elif data_type == 'AT':
          data_type = 'TIME'
        elif data_type == 'TS':
          data_type = 'TIMESTAMP'
        elif data_type == 'SZ':
          data_type = 'TIMESTAMP WITH TIMEZONE'
        elif data_type == 'TZ':
          data_type = 'TIME WITH TIMEZONE'
        elif data_type == 'CO':
          data_type = 'CLOB'
        elif data_type == 'CF':
          data_type = 'CHAR(' + str(max_length / char_type) + ')'
        elif data_type == 'CV':
          data_type = 'VARCHAR(' + str(max_length / char_type) + ')'
        elif data_type == 'BV':
          data_type = 'VARBYTE(' + str(max_length) + ')'
        elif data_type == 'D':
          data_type = 'DECIMAL(' + str(decimal_total_digits) + ',' + str(decimal_fractional_digits) + ')'

        # a column format is only meaningful for date/time types
        if data_type not in ['DATE', 'TIME', 'TIMESTAMP', 'TIME WITH TIMEZONE', 'TIMESTAMP WITH TIMEZONE']:
          format = ''

        view_cols.append(
          (vw[0], vw[1], vw[2], str(vw[3]), str(vw[4]), vw[5], column_name, format, nullable, data_type,
           max_length, decimal_total_digits, decimal_fractional_digits))
    curs_vw.close()
    return view_cols

  def get_table_info(self, database_name, table_name):
    """
    Get table and column info from Teradata DBC.Tables / DBC.Columns / DBC.ColumnStatsV.
    :param database_name: database to scan; when None, derive it from table_name
    :param table_name: optional 'db.table' or bare table name; not used in the common case
    :return: result rows, one per column
    """
    td_table_name_filter = ''
    if database_name is not None:
      td_database_name = database_name
    elif len(table_name) > 0:
      if table_name.find('.') > 0:
        (td_database_name, td_table_name) = table_name.split('.')
      else:
        td_database_name = self.default_database
        td_table_name = table_name
      td_table_name_filter = "AND a.TableName = '%s' " % td_table_name

    curs_td = self.conn_td.cursor()

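    # The inline view 'c' below folds date-stamped physical tables (e.g.
    # FOO_20150131, FOO_2015_01, FOO_201501) into one logical TableName2 by
    # replacing the date suffix with a ${YYYYMMDD}, ${MMDDYYYY}, ${YYYY_MM} or
    # ${YYYYMM} placeholder, then QUALIFY RANK() keeps only the most recent
    # physical instance of each logical table.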
    col_stats_sql = """SELECT c.DatabaseName,
           c.TableName2 TableName,
           c.CreateTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) createTimestamp,
           c.LastAlterTimestamp (CHAR(19)) TableLastAlterTimestamp,
           c.LastAccessTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) LastAccessTimestamp,
           c.TableAccessCount,
           RTRIM(a.ColumnName) ColumnName,
           CASE
             WHEN a.ColumnType IN ('DA','AT','TS','SZ','TZ') THEN RTrim(a.ColumnFormat)
             ELSE NULL
           END ColumnFormat,
           a.DefaultValue,
           a.Nullable,
           a.LastAccessTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) LastAccessTimestamp,
           a.AccessCount,
           b.UniqueValueCount (BIGINT) UniqueValueCount,
           b.LastCollectTimestamp (FORMAT 'YYYY-MM-DDBHH:MI:SS') (CHAR(19)) LastCollectTimestamp,
           CASE
             WHEN a.ColumnType = 'I1' THEN 'BYTEINT'
             WHEN a.ColumnType = 'I2' THEN 'SMALLINT'
             WHEN a.ColumnType = 'I'  THEN 'INT'
             WHEN a.ColumnType = 'F'  THEN 'FLOAT'
             WHEN a.ColumnType = 'I8' THEN 'BIGINT'
             WHEN a.ColumnType = 'DA' THEN 'DATE'
             WHEN a.ColumnType = 'AT' THEN 'TIME'
             WHEN a.ColumnType = 'TS' THEN 'TIMESTAMP'
             WHEN a.ColumnType = 'SZ' THEN 'TIMESTAMP WITH TIMEZONE'
             WHEN a.ColumnType = 'TZ' THEN 'TIME WITH TIMEZONE'
             WHEN a.ColumnType = 'CO' THEN 'CLOB'
             WHEN a.ColumnType = 'BV' THEN 'VARBYTE(' || ColumnLength || ')'
             WHEN a.ColumnType = 'CF' THEN 'CHAR(' || TRIM(a.ColumnLength/a.CharType) || ')'
             WHEN a.ColumnType = 'CV' THEN 'VARCHAR(' || TRIM(a.ColumnLength/a.CharType) || ')'
             WHEN a.ColumnType = 'D'  THEN 'DECIMAL(' || TRIM(a.DecimalTotalDigits) || ',' || TRIM(a.DecimalFractionalDigits) || ')'
           END Data_Type,
           a.ColumnLength,
           a.DecimalTotalDigits,
           a.DecimalFractionalDigits,
           a.ColumnId Column_Id,
           RTrim(c.CreatorName) CreatorName,
           RTrim(c.TableName) OriginalTableName
      FROM (
           select RTrim(a.DatabaseName) DatabaseName,
                  case when regexp_similar(a.TableName, '[[:alnum:]_]+[[:digit:]]{8}([^[:digit:]]|$).*', 'c') = 1 then
                         case when regexp_substr(a.TableName, '([[:digit:]]{8})([^[:digit:]]|$)') between '20000101' and '20991231'
                                then rtrim(regexp_replace(a.TableName, '([[:digit:]]{8})', '${YYYYMMDD}', 1, 1, 'c'))
                              when regexp_substr(a.TableName, '([[:digit:]]{8})([^[:digit:]]|$)') between '01012000' and '12312099'
                                then rtrim(regexp_replace(a.TableName, '([[:digit:]]{8})', '${MMDDYYYY}', 1, 1, 'c'))
                              else RTRIM(a.TableName)
                         end
                       when regexp_similar(a.TableName, '[[:alnum:]_]+[[:digit:]]{4}_[[:digit:]]{2}([^[:digit:]]|$).*', 'c') = 1
                            and regexp_substr(a.TableName, '([[:digit:]]{4})_[[:digit:]]{2}([^[:digit:]]|$)') between '2000_01' and '9999_12'
                         then rtrim(regexp_replace(a.TableName, '([[:digit:]]{4}_[[:digit:]]{2})', '${YYYY_MM}', 1, 1, 'c'))
                       when regexp_similar(a.TableName, '[[:alnum:]_]+[[:digit:]]{6}([^[:digit:]]|$).*', 'c') = 1
                            and regexp_substr(a.TableName, '([[:digit:]]{6})([^[:digit:]]|$)') between '200001' and '999912'
                         then rtrim(regexp_replace(a.TableName, '([[:digit:]]{6})', '${YYYYMM}', 1, 1, 'c'))
                       else RTRIM(a.TableName)
                  end TableName2,
                  a.TableName,
                  a.CreateTimestamp,
                  a.LastAlterTimestamp,
                  a.LastAccessTimestamp,
                  a.AccessCount TableAccessCount,
                  a.CreatorName
             from DBC.Tables a
            where a.TableKind IN ('T', 'O')
              AND a.DatabaseName = '%s'
              %s
              AND a.TableName NOT LIKE ALL ('INFA%%', 'tmp!_%%', 'temp!_%%', '!_%%', '#%%', 'ET!_%%', 'LS!_%%', 'VT!_%%', 'LOGTABLE%%', 'backup%%', 'bkp%%', 'W!_%%') ESCAPE '!'
              AND RTRIM(a.TableName) NOT LIKE ALL ('%%!_tmp', '%%!_temp', '%%!_ERR!_.', '%%!_bkp', '%%!_backup') ESCAPE '!'
              AND REGEXP_SIMILAR(RTRIM(a.TableName), '.*_tmp_[0-9]+', 'i') = 0
              AND REGEXP_SIMILAR(RTRIM(a.TableName), '.*_tmp[0-9]+', 'i') = 0
           QUALIFY RANK() OVER (PARTITION BY DatabaseName, TableName2 ORDER BY a.TableName desc) = 1
           ) c
      JOIN DBC.Columns a
        ON (c.DatabaseName = a.DatabaseName AND c.TableName = a.TableName)
      LEFT OUTER JOIN DBC.ColumnStatsV b
        ON (a.DatabaseName = b.DatabaseName AND a.TableName = b.TableName AND a.ColumnName = b.ColumnName)
      ORDER BY 1, 2, a.ColumnId """ % (td_database_name, td_table_name_filter)

    curs_td.execute(col_stats_sql)
    rows = curs_td.fetchall()
    curs_td.close()

    return rows

  def get_extra_table_info(self, database_name):
    """
    Collect index, partition and size info for all tables in a database.
    :param database_name:
    :return: {full_table_name: {'size_in_mb': _, 'partitions': [], 'indices': []}}
    """
    table_size_sql = """select RTrim(TableName), cast(sum(CurrentPerm)/1024/1024 as BIGINT) size_in_mb
      from DBC.TableSize where DatabaseName = '%s'
        AND TableName NOT LIKE ALL ('INFA%%', 'tmp!_%%', 'temp!_%%', '!_%%', '#%%', 'ET!_%%', 'LS!_%%', 'VT!_%%', 'LOGTABLE%%', 'backup%%', 'bkp%%', 'W!_%%') ESCAPE '!'
        AND RTRIM(TableName) NOT LIKE ALL ('%%!_tmp', '%%!_temp', '%%!_ERR!_.', '%%!_bkp', '%%!_backup') ESCAPE '!'
        AND REGEXP_SIMILAR(RTRIM(TableName), '.*_tmp_[0-9]+', 'i') = 0
        AND REGEXP_SIMILAR(RTRIM(TableName), '.*_tmp[0-9]+', 'i') = 0
      group by 1 order by 1 """ % database_name

    table_index_sql = """select RTrim(TableName), IndexNumber, IndexType, UniqueFlag, IndexName, RTrim(ColumnName), ColumnPosition, AccessCount
      from DBC.Indices where DatabaseName = '%s' order by TableName, IndexNumber, ColumnPosition""" % database_name

    table_partition_sql = """select RTrim(TableName), ConstraintText
      from DBC.IndexConstraints where DatabaseName = '%s' and ConstraintType = 'Q'
      order by TableName""" % database_name

    extra_table_info = {}

    curs_td = self.conn_td.cursor()
    # curs_td.execute("SET QUERY_BAND = 'script=%s; pid=%d; hostname=%s; task=extra_table_info;' FOR SESSION" % (os.path.basename(__file__), os.getpid(), os.getenv('HOSTNAME')))

    # get size, partition and index info one by one
    curs_td.execute(table_size_sql)
    rows = curs_td.fetchall()
    for row in rows:
      full_table_name = database_name + '.' + row[0]
      extra_table_info[full_table_name] = {'size_in_mb': row[1], 'partitions': [], 'indices': []}

    curs_td.execute(table_partition_sql)
    rows = curs_td.fetchall()
    for row in rows:
      full_table_name = database_name + '.' + row[0]
      if full_table_name not in extra_table_info:
        continue

      # multi-level partitioning: CHECK (/*<level>*/ <expr> IS NOT NULL AND ...)
      search_result = re.search(r'CHECK \(/\*([0-9]+)\*/ (.*)\)$', row[1], re.IGNORECASE)
      partition_level = 1
      if search_result:
        partition_level = int(search_result.group(1))
        partition_info = search_result.group(2).replace("\r", "").replace(") IS NOT NULL AND ", ")\n").replace(
          ") IS NOT NULL", ")")
        extra_table_info[full_table_name]['partitions'] = partition_info.split("\n")

      # single-level partitioning: CHECK ((<expr>) BETWEEN n AND m)
      search_result = re.search(r'CHECK \(\((.*)\) BETWEEN [0-9]+ AND [0-9]+\)$', row[1], re.IGNORECASE)
      if search_result:
        partition_info = search_result.group(1).replace("\r", "")
        extra_table_info[full_table_name]['partitions'] = [partition_info]

    curs_td.execute(table_index_sql)
    rows = curs_td.fetchall()
    table_count = 0
    current_table_name = ''
    full_table_name = ''
    indices = []  # initialized here so the final flush is safe when no index rows qualify
    column_list = ''

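    # rows arrive ordered by (TableName, IndexNumber, ColumnPosition); walk them
    # detecting table and index breaks, accumulating each index's column list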
    for row in rows:
      if current_table_name != row[0]:
        if table_count > 0:
          # finish previous table's last index
          indices[-1]['column_list'] = column_list
          if full_table_name in extra_table_info:
            extra_table_info[full_table_name]['indices'] = indices

        full_table_name = database_name + '.' + row[0]
        if full_table_name not in extra_table_info:
          continue

        table_count += 1
        current_table_name = row[0]
        current_index_number = 0
        indices = []

      if current_index_number != row[1]:
        if current_index_number > 0:
          indices[-1]['column_list'] = column_list
        # new index
        current_index_number = row[1]
        indices.append(
          {'index_number': row[1], 'index_type': index_type[row[2]], 'is_unique': row[3], 'index_name': row[4],
           'access_count': row[7], 'column_list': ''})
        column_list = row[5]
      else:
        column_list += ", %s" % row[5]

    if len(indices) > 0:
      indices[-1]['column_list'] = column_list
      if full_table_name in extra_table_info:
        extra_table_info[full_table_name]['indices'] = indices

    return extra_table_info

  def format_view_metadata(self, rows, schema):
    """
    Add view info from rows into schema.
    Note: a view's original_name is the same as its full name.
    :param rows: one row per view column, as returned by get_view_info()
    :param schema: list of per-database dicts that this method appends to
    :return:
    """
    db_dict = {}
    table_dict = {}
    for row in rows:
      if row[0] not in db_dict:
        schema.append({'database': row[0], 'type': 'Teradata', 'views': []})
        db_dict[row[0]] = len(schema) - 1
      db_idx = db_dict[row[0]]
      full_name = row[0] + '.' + row[1]
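      # collect the tables this view references by scanning its SQL text for
      # 'FROM db.table' and 'JOIN db.table' tokens (a regex heuristic, not a
      # full SQL parse, so unusual references can be missed)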
      ref_table_list = set(re.findall(r"\s+FROM\s+(\w+\.\w+)[\s,;]", row[2], re.DOTALL | re.IGNORECASE))
      search_result = set(re.findall(r"\s+JOIN\s+(\w+\.\w+)[\s,;]", row[2], re.DOTALL | re.IGNORECASE))
      ref_table_list = list(ref_table_list | search_result)

      if full_name not in table_dict:
        schema[db_idx]['views'].append(
          {'name': row[1], 'type': 'View', 'createTime': row[3], 'lastAlterTime': row[4], 'accessCount': row[5],
           'referenceTables': ref_table_list, 'viewSqlText': row[2].replace("\r", "\n"), 'columns': [],
           'original_name': full_name})
        table_dict[full_name] = len(schema[db_idx]['views']) - 1
      table_idx = table_dict[full_name]
      schema[db_idx]['views'][table_idx]['columns'].append(
        {'name': row[6], 'nullable': row[8], 'dataType': row[9], 'maxByteLength': row[10], 'precision': row[11],
         'scale': row[12]})
      column_idx = len(schema[db_idx]['views'][table_idx]['columns']) - 1
      if row[7]:
        schema[db_idx]['views'][table_idx]['columns'][column_idx]['columnFormat'] = row[7].strip()

    self.logger.info("%s %6d views with %6d columns processed for %12s" % (
      datetime.datetime.now(), table_idx + 1, len(rows), row[0]))

  def format_table_metadata(self, rows, schema):
    """
    Add table info from rows into schema.
    :param rows: input. each row is one column of a table; rows are grouped by database and table
    :param schema: {database : _, type : _, tables : ['name' : _, ... 'original_name' : _] }
    :return:
    """
    db_dict = {}
    table_dict = {}
    db_idx = len(schema) - 1
    table_idx = -1
    for row in rows:
      if row[0] not in db_dict:
        schema.append({'database': row[0], 'type': 'Teradata', 'tables': []})
        db_idx += 1
        db_dict[row[0]] = db_idx
        extra_table_info = self.get_extra_table_info(row[0])
      full_name = ''
      if row[0]:
        full_name = row[0]
        if row[1]:
          full_name += '.' + row[1]
      elif row[1]:
        full_name = row[1]

      # row[20] is the original (date-stamped) physical table name; row[1] is the logical name
      original_name = row[0] + '.' + row[20]
      if original_name not in extra_table_info:
        self.logger.error('ERROR : {0} not in extra_table_info!'.format(original_name))
        continue
      if full_name not in table_dict:
        schema[db_idx]['tables'].append(
          {'name': row[1], 'type': 'Table', 'createTime': row[2], 'lastAlterTime': row[3], 'lastAccessTime': row[4],
           'accessCount': row[5], 'owner': row[19], 'sizeInMbytes': extra_table_info[original_name]['size_in_mb'],
           'partitions': extra_table_info[original_name]['partitions'],
           'indices': extra_table_info[original_name]['indices'], 'columns': [], 'original_name': original_name})
        table_idx += 1
        table_dict[full_name] = table_idx

      # row[10] is the column-level LastAccessTimestamp (row[8] is DefaultValue)
      schema[db_idx]['tables'][table_idx]['columns'].append(
        {'name': row[6], 'nullable': row[9], 'lastAccessTime': row[10],
         'accessCount': row[11] & 0xffff if row[11] else None, 'dataType': row[14] if row[14] else 'N/A',
         'maxByteLength': row[15] & 0xffff, 'precision': row[16] & 0xffff if row[16] else None,
         'scale': row[17] & 0xffff if row[17] else None})
      column_idx = len(schema[db_idx]['tables'][table_idx]['columns']) - 1
      if row[8] is not None:
        schema[db_idx]['tables'][table_idx]['columns'][column_idx]['defaultValue'] = row[8]
      if row[7] is not None:
        schema[db_idx]['tables'][table_idx]['columns'][column_idx]['columnFormat'] = row[7].strip()
      if row[12] is not None:
        schema[db_idx]['tables'][table_idx]['columns'][column_idx]['statistics'] = {
          'uniqueValueCount': row[12] & 0xffff, 'lastStatsCollectTime': str(row[13])}

    self.logger.info("%s %6d tables with %6d columns processed for %12s" % (
      datetime.datetime.now(), table_idx + 1, len(rows), row[0]))

  def get_sample_data(self, database_name, table_name):
    """
    Find the reference dataset (if any) and select the top 10 rows from Teradata.
    :return: (reference_urn, json string of sample data)
    """
    fullname = ''
    columns = []
    rows_data = []
    # we lack SELECT permission on these presentation databases; fetch the
    # sample from the corresponding table in DWH_STG instead
    if database_name in ['DWH_DIM', 'DWH_FACT', 'DWH_TRK', 'DWH_AGG', 'DWH_CPY', 'DWH_MSTR', 'DWH_SEC']:
      fullname = 'DWH_STG."' + table_name + '"'
    else:
      fullname = database_name + '."' + table_name + '"'
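
    # LOCK ROW FOR ACCESS permits a dirty read, so sampling neither blocks nor
    # is blocked by concurrent writes to the table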
    sql = 'LOCK ROW FOR ACCESS SELECT top 10 * FROM ' + fullname
    curs_td = self.conn_td.cursor()
    rows = []
    try:
      curs_td.execute(sql)
      rows = curs_td.fetchall()
      for i, value in enumerate(rows[0]):
        columns.append(curs_td.description[i][0])
      for r in rows:
        row_data = []
        # encode each field to a JSON-safe value
        for i, value in enumerate(r):
          if isinstance(value, bytearray):
            # render binary content as a hex string
            new_value = ''.join(format(x, '02x') for x in value)
          elif value is None:
            new_value = ''
          elif isinstance(value, str):
            # byte strings may contain characters invalid in the target encoding
            new_value = unicode(value, errors='ignore')
          else:
            new_value = unicode(value)
          row_data.append(new_value)
        rows_data.append(row_data)
    except Exception, e:
      self.logger.error('sql : ' + sql)
      if len(rows) == 0:
        self.logger.error("dataset {0} is empty".format(fullname))
      else:
        self.logger.error("dataset {0} is not accessible.".format(fullname))
        self.logger.error('result : ' + str(rows))
      self.logger.error(e)

    ref_urn = 'teradata:///' + fullname.replace('.', '/').replace('"', '')
    data_with_column = map(lambda x: dict(zip(columns, x)), rows_data)
    return ref_urn, json.dumps({'sample': data_with_column})

  def run(self, database_name, table_name, schema_output_file, sample_output_file, sample=True):
    """
    The entrance of the class: extract schema and sample data.
    Note: self.databases must be ordered so that databases with more info
    (e.g. DWH_STG) are scanned first.
    :param database_name:
    :param table_name:
    :param schema_output_file:
    :param sample_output_file:
    :param sample: whether to collect sample data
    :return:
    """
    cur = self.conn_td.cursor()
    schema = []

    f_log = open(self.log_file, "a")

    schema_json = open(schema_output_file, 'wb')
    os.chmod(schema_output_file, 0666)

    if database_name is None and table_name is None:  # default route: process everything
      for database_name in self.databases:
        self.logger.info("Collecting tables in database : " + database_name)
        # table info
        rows = []
        begin = datetime.datetime.now().strftime("%H:%M:%S")
        rows.extend(self.get_table_info(database_name, table_name))
        if len(rows) > 0:
          self.format_table_metadata(rows, schema)
        end = datetime.datetime.now().strftime("%H:%M:%S")
        f_log.write("Get table info %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

        # view info
        rows = []
        begin = datetime.datetime.now().strftime("%H:%M:%S")
        rows.extend(self.get_view_info(database_name, table_name))
        if len(rows) > 0:
          self.format_view_metadata(rows, schema)
        end = datetime.datetime.now().strftime("%H:%M:%S")
        f_log.write("Get view info %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))
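
    # several restricted databases are sampled through the same DWH_STG table
    # (see get_sample_data), so cache samples by table name to avoid re-querying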
    scanned_dict = {}  # {table_name : {'ref_urn': _, 'data': _}}

    if sample:
      self.logger.info("Start collecting sample data.")
      open(sample_output_file, 'wb')  # create/truncate the file so chmod applies to a fresh file
      os.chmod(sample_output_file, 0666)
      sample_file_writer = FileWriter(sample_output_file)

      # collect sample data
      for onedatabase in schema:
        database_name = onedatabase['database']
        if 'tables' in onedatabase:
          alltables = onedatabase['tables']
        else:
          alltables = onedatabase['views']

        for onetable in alltables:
          table_name = onetable['original_name'].split('.')[1]
          if table_name in scanned_dict:
            sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name,
                                             scanned_dict[table_name]['ref_urn'], scanned_dict[table_name]['data'])
          else:
            (ref_urn, sample_data) = self.get_sample_data(database_name, table_name)
            sample_record = SampleDataRecord('teradata', '/' + database_name + '/' + table_name, '', sample_data)
            scanned_dict[table_name] = {'ref_urn': ref_urn, 'data': sample_data}
          sample_file_writer.append(sample_record)
      sample_file_writer.close()

    schema_json.write(json.dumps(schema, indent=None) + '\n')
    cur.close()
    schema_json.close()
    f_log.close()


if __name__ == "__main__":
  args = sys.argv[1]  # expected to be a property dict passed in by the WhereHows Jython runner

  # connection
  username = args[Constant.TD_DB_USERNAME_KEY]
  password = args[Constant.TD_DB_PASSWORD_KEY]
  JDBC_DRIVER = args[Constant.TD_DB_DRIVER_KEY]
  JDBC_URL = args[Constant.TD_DB_URL_KEY]

  e = TeradataExtract()
  e.conn_td = zxJDBC.connect(JDBC_URL, username, password, JDBC_DRIVER)
  do_sample = False
  if Constant.TD_LOAD_SAMPLE in args:
    do_sample = FileUtil.parse_bool(args[Constant.TD_LOAD_SAMPLE], False)

  # only collect sample data on the configured days of the week
  if datetime.datetime.now().strftime('%a') not in args[Constant.TD_COLLECT_SAMPLE_DATA_DAYS]:
    do_sample = False

  temp_dir = FileUtil.etl_temp_dir(args, "TERADATA")
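
  # tag the session with a QUERY_BAND so DBAs can attribute this workload in
  # Teradata's query logs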
  try:
    e.conn_td.cursor().execute(
      "SET QUERY_BAND = 'script=%s; pid=%d; ' FOR SESSION;" % ('TeradataExtract.py', os.getpid()))
    e.conn_td.commit()
    e.log_file = os.path.join(temp_dir, args[Constant.TD_LOG_KEY])
    e.databases = args[Constant.TD_TARGET_DATABASES_KEY].split(',')
    e.default_database = args[Constant.TD_DEFAULT_DATABASE_KEY]
    # index_type is read by get_extra_table_info() to translate DBC.Indices.IndexType codes
    index_type = {'P': 'Primary Index', 'K': 'Primary Key', 'S': 'Secondary Index', 'Q': 'Partitioned Primary Index',
                  'J': 'Join Index', 'U': 'Unique Index'}

    schema_output_file = os.path.join(temp_dir, args[Constant.TD_SCHEMA_OUTPUT_KEY])
    sample_output_file = os.path.join(temp_dir, args[Constant.TD_SAMPLE_OUTPUT_KEY])

    e.run(None, None, schema_output_file, sample_output_file, sample=do_sample)
  finally:
    e.conn_td.close()