#
# Copyright 2015 LinkedIn Corp. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#

import csv
import datetime
import json
import os
import stat
import sys
import SchemaUrlHelper
from org.slf4j import LoggerFactory
from com.ziclix.python.sql import zxJDBC
from wherehows.common import Constant
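
# NOTE: this script is meant to run under Jython inside the WhereHows ETL framework;
# org.slf4j, com.ziclix.python.sql (zxJDBC) and wherehows.common are Java/Jython
# dependencies provided by that runtime and are not importable from plain CPython.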


class TableInfo:
  """ Defines the dictionary key names used for table metadata records. """
  table_name = 'name'
  dataset_name = 'dataset_name'
  native_name = 'native_name'
  logical_name = 'logical_name'
  version = 'version'
  type = 'type'
  serialization_format = 'serialization_format'
  create_time = 'create_time'
  schema_url = 'schema_url'
  field_delimiter = 'field_delimiter'
  db_id = 'DB_ID'
  table_id = 'TBL_ID'
  serde_id = 'SD_ID'
  table_type = 'tbl_type'
  location = 'location'
  view_expended_text = 'view_expanded_text'
  input_format = 'input_format'
  output_format = 'output_format'
  is_compressed = 'is_compressed'
  is_storedassubdirectories = 'is_storedassubdirectories'
  etl_source = 'etl_source'
  source_modified_time = 'source_modified_time'

  field_list = 'fields'
  schema_literal = 'schema_literal'

  optional_prop = [create_time, serialization_format, field_delimiter, schema_url, db_id, table_id, serde_id,
                   table_type, location, view_expended_text, input_format, output_format, is_compressed,
                   is_storedassubdirectories, etl_source]
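

# The TableInfo attribute values above are used as dictionary keys when the
# extractor assembles per-table records, e.g. (illustrative values only):
#   {TableInfo.table_name: 'page_views', TableInfo.type: 'Hive',
#    TableInfo.field_list: [{'ColumnName': 'id', 'TypeName': 'bigint', ...}]}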


class HiveExtract:
  """
  Extract Hive metadata from the Hive metastore (MySQL backend) and store it in a JSON file.
  """
  conn_hms = None
  db_dict = {}  # name : index
  table_dict = {}  # fullname : index
  dataset_dict = {}  # name : index
  instance_dict = {}  # name : index
  serde_param_columns = []

  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)

  def get_table_info_from_v2(self, database_name, is_dali=False):
    """
    Get table and column info from the COLUMNS_V2 table.
    :param database_name:
    :return: (0 DB_NAME, 1 TBL_NAME, 2 SERDE_FORMAT, 3 TBL_CREATE_TIME,
      4 DB_ID, 5 TBL_ID, 6 SD_ID, 7 LOCATION, 8 TBL_TYPE, 9 VIEW_EXPANDED_TEXT, 10 INPUT_FORMAT, 11 OUTPUT_FORMAT,
      12 IS_COMPRESSED, 13 IS_STOREDASSUBDIRECTORIES, 14 INTEGER_IDX, 15 COLUMN_NAME, 16 TYPE_NAME, 17 COMMENT,
      18 dataset_name, 19 version, 20 TYPE, 21 storage_type, 22 native_name, 23 logical_name, 24 created_time,
      25 dataset_urn, 26 source_modified_time)
    """
    curs = self.conn_hms.cursor()
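    # The Dali branch matches versioned view names such as 'foo_1_2_3'. Passing
    # version='{version}' to .format() below keeps the literal '{version}'
    # placeholder in dataset_name, presumably to be resolved further downstream.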
    if is_dali:
      tbl_info_sql = """select d.NAME DB_NAME, t.TBL_NAME TBL_NAME,
        case when s.INPUT_FORMAT like '%.TextInput%' then 'Text'
             when s.INPUT_FORMAT like '%.Avro%' then 'Avro'
             when s.INPUT_FORMAT like '%.RCFile%' then 'RC'
             when s.INPUT_FORMAT like '%.Orc%' then 'ORC'
             when s.INPUT_FORMAT like '%.Sequence%' then 'Sequence'
             when s.INPUT_FORMAT like '%.Parquet%' then 'Parquet'
             else s.INPUT_FORMAT
        end SerializationFormat,
        t.CREATE_TIME TableCreateTime,
        t.DB_ID, t.TBL_ID, s.SD_ID,
        substr(s.LOCATION, length(substring_index(s.LOCATION, '/', 3))+1) Location,
        t.TBL_TYPE, t.VIEW_EXPANDED_TEXT, s.INPUT_FORMAT, s.OUTPUT_FORMAT, s.IS_COMPRESSED, s.IS_STOREDASSUBDIRECTORIES,
        c.INTEGER_IDX, c.COLUMN_NAME, c.TYPE_NAME, c.COMMENT,
        case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
             then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
             else t.TBL_NAME
        end dataset_name,
        case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
             then replace(substring_index(t.TBL_NAME, '_', -3), '_', '.')
             else 0
        end version, 'Dalids' TYPE, 'View' storage_type, concat(d.NAME, '.', t.TBL_NAME) native_name,
        case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
             then substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1)
             else t.TBL_NAME
        end logical_name, unix_timestamp(now()) created_time, concat('dalids:///', d.NAME, '/', t.TBL_NAME) dataset_urn,
        p.PARAM_VALUE source_modified_time
      from TBLS t join DBS d on t.DB_ID=d.DB_ID
      join SDS s on t.SD_ID = s.SD_ID
      join COLUMNS_V2 c on s.CD_ID = c.CD_ID
      join TABLE_PARAMS p on p.TBL_ID = t.TBL_ID
      where
        p.PARAM_KEY = 'transient_lastDdlTime' and
        d.NAME in ('{db_name}') and (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
      order by DB_NAME, dataset_name, version DESC
      """.format(version='{version}', db_name=database_name)
    else:
      tbl_info_sql = """select d.NAME DB_NAME, t.TBL_NAME TBL_NAME,
        case when s.INPUT_FORMAT like '%.TextInput%' then 'Text'
             when s.INPUT_FORMAT like '%.Avro%' then 'Avro'
             when s.INPUT_FORMAT like '%.RCFile%' then 'RC'
             when s.INPUT_FORMAT like '%.Orc%' then 'ORC'
             when s.INPUT_FORMAT like '%.Sequence%' then 'Sequence'
             when s.INPUT_FORMAT like '%.Parquet%' then 'Parquet'
             else s.INPUT_FORMAT
        end SerializationFormat,
        t.CREATE_TIME TableCreateTime,
        t.DB_ID, t.TBL_ID, s.SD_ID,
        substr(s.LOCATION, length(substring_index(s.LOCATION, '/', 3))+1) Location,
        t.TBL_TYPE, t.VIEW_EXPANDED_TEXT, s.INPUT_FORMAT, s.OUTPUT_FORMAT, s.IS_COMPRESSED, s.IS_STOREDASSUBDIRECTORIES,
        c.INTEGER_IDX, c.COLUMN_NAME, c.TYPE_NAME, c.COMMENT, t.TBL_NAME dataset_name, 0 version, 'Hive' TYPE,
        case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
             when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
             else 'Table'
        end storage_type, concat(d.NAME, '.', t.TBL_NAME) native_name, t.TBL_NAME logical_name,
        unix_timestamp(now()) created_time, concat('hive:///', d.NAME, '/', t.TBL_NAME) dataset_urn,
        p.PARAM_VALUE source_modified_time
      from TBLS t join DBS d on t.DB_ID=d.DB_ID
      join SDS s on t.SD_ID = s.SD_ID
      join COLUMNS_V2 c on s.CD_ID = c.CD_ID
      join TABLE_PARAMS p on p.TBL_ID = t.TBL_ID
      where
        p.PARAM_KEY = 'transient_lastDdlTime' and
        d.NAME in ('{db_name}') and not ((d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW')
      order by 1,2
      """.format(db_name=database_name)
    curs.execute(tbl_info_sql)
    rows = curs.fetchall()
    curs.close()
    return rows

  def get_table_info_from_serde_params(self, database_name):
    """
    Get table and column info (MANAGED and EXTERNAL tables) from the Avro schema SerDe parameters.
    :param database_name:
    :return: (0 DatabaseName, 1 TableName, 2 SerializationFormat, 3 TableCreateTime, 4 DB_ID, 5 TBL_ID, 6 SD_ID,
      7 Location, 8 TBL_TYPE, 9 VIEW_EXPANDED_TEXT, 10 INPUT_FORMAT, 11 OUTPUT_FORMAT, 12 IS_COMPRESSED,
      13 IS_STOREDASSUBDIRECTORIES, 14 SchemaLiteral, 15 SchemaUrl, 16 FieldDelimiter)
    """
    curs_et = self.conn_hms.cursor()
    tbl_info_sql = """select d.NAME DatabaseName, et.TBL_NAME TableName,
      case when s.INPUT_FORMAT like '%.TextInput%' then 'Text'
           when s.INPUT_FORMAT like '%.Avro%' then 'Avro'
           when s.INPUT_FORMAT like '%.RCFile%' then 'RC'
           when s.INPUT_FORMAT like '%.Orc%' then 'ORC'
           when s.INPUT_FORMAT like '%.Sequence%' then 'Sequence'
           when s.INPUT_FORMAT like '%.Parquet%' then 'Parquet'
           else s.INPUT_FORMAT
      end SerializationFormat,
      et.CREATE_TIME TableCreateTime, et.DB_ID, et.TBL_ID, s.SD_ID,
      substr(s.LOCATION, length(substring_index(s.LOCATION, '/', 3))+1) Location,
      et.TBL_TYPE, et.VIEW_EXPANDED_TEXT, s.INPUT_FORMAT, s.OUTPUT_FORMAT, s.IS_COMPRESSED, s.IS_STOREDASSUBDIRECTORIES,
      et.schema_literal SchemaLiteral, et.schema_url SchemaUrl, et.field_delim FieldDelimiter
      from (
        select t.DB_ID, t.TBL_ID, sp.SERDE_ID,
          t.TBL_NAME, t.CREATE_TIME, t.TBL_TYPE, t.VIEW_EXPANDED_TEXT,
          replace(max( case when param_key in ('avro.schema.literal', 'schema.literal')
                            then param_value
                       end), '\\n', ' ') schema_literal,
          max( case when param_key in ('avro.schema.url', 'schema.url')
                    then param_value
               end) schema_url,
          max( case when param_key in ('field.delim')
                    then param_value
               end) field_delim
        from SERDE_PARAMS sp join TBLS t on sp.SERDE_ID = t.SD_ID
        where sp.PARAM_KEY regexp 'schema.literal|schema.url|field.delim'
        group by 1,2,3,4,5 ) et
      JOIN DBS d on et.DB_ID = d.DB_ID
      JOIN SDS s on et.SERDE_ID = s.SD_ID
      where d.NAME = '{db_name}'
      order by 1,2 """.format(db_name=database_name)
    curs_et.execute(tbl_info_sql)
    rows = curs_et.fetchall()
    curs_et.close()
    return rows

  def format_table_metadata_v2(self, rows, schema):
    """
    Process rows fetched from COLUMNS_V2 into per-table records; several consecutive rows form one table.
    :param rows: rows fetched from COLUMNS_V2, ordered by database name and table name
    :param schema: {database : _, type : _, tables : [{}, {} ...] }
    :return:
    """
    db_idx = len(schema) - 1
    table_idx = -1

    field_list = []
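    # rows are ordered by (database, table); column rows are accumulated into
    # field_list until the next row belongs to a different table, at which point
    # one table_record is emitted for the table just finished.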
    for row_index, row_value in enumerate(rows):

      field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16],
                         'Comment': row_value[17]})
      if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]):  # this is the last record of the current table
        # sort the field_list by IntegerIndex
        field_list = sorted(field_list, key=lambda k: k['IntegerIndex'])
        # process the record of the table

        table_record = {TableInfo.table_name: row_value[1],
                        TableInfo.type: row_value[21],
                        TableInfo.native_name: row_value[22],
                        TableInfo.logical_name: row_value[23],
                        TableInfo.dataset_name: row_value[18],
                        TableInfo.version: str(row_value[19]),
                        TableInfo.serialization_format: row_value[2],
                        TableInfo.create_time: row_value[3],
                        TableInfo.source_modified_time: row_value[26],
                        TableInfo.db_id: row_value[4],
                        TableInfo.table_id: row_value[5],
                        TableInfo.serde_id: row_value[6],
                        TableInfo.location: row_value[7],
                        TableInfo.table_type: row_value[8],
                        TableInfo.view_expended_text: row_value[9],
                        TableInfo.input_format: row_value[10],
                        TableInfo.output_format: row_value[11],
                        TableInfo.is_compressed: row_value[12],
                        TableInfo.is_storedassubdirectories: row_value[13],
                        TableInfo.etl_source: 'COLUMN_V2',
                        TableInfo.field_list: field_list[:]}

        field_list = []  # empty it

        if row_value[0] not in self.db_dict:
          schema.append({'database': row_value[0], 'type': row_value[20], 'tables': []})
          db_idx += 1
          self.db_dict[row_value[0]] = db_idx

        full_name = row_value[0] + '.' + row_value[1]

        # put in schema result
        if full_name not in self.table_dict:
          schema[db_idx]['tables'].append(table_record)
          table_idx += 1
          self.table_dict[full_name] = table_idx

    self.logger.info("%s %6d tables processed for database %12s from COLUMN_V2" % (
      datetime.datetime.now(), table_idx + 1, row_value[0]))

  def format_table_metadata_serde(self, rows, schema):
    """
    Add table info from rows into schema, including SerDe-derived extras (schema literal, schema url, field delimiter).
    :param rows: rows returned by get_table_info_from_serde_params() (see its :return: for the column order)
    :param schema: {database : _, type : _, tables : ['name' : _, ... '' : _] }
    :return:
    """

    db_idx = len(schema) - 1
    table_idx = -1
    for row_value in rows:
      if row_value[0] not in self.db_dict:
        schema.append({'database': row_value[0], 'type': 'Hive', 'tables': []})
        db_idx += 1
        self.db_dict[row_value[0]] = db_idx
      else:
        db_idx = self.db_dict[row_value[0]]
      full_name = ''
      if row_value[0]:
        full_name = row_value[0]
        if row_value[1]:
          full_name += '.' + row_value[1]
      elif row_value[1]:
        full_name = row_value[1]
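
      # If only schema_url is present, try to fetch the Avro schema literal from
      # HDFS or HTTP; failures are logged and the literal is left as None.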
      literal = None
      if row_value[15] and not row_value[14]:  # schema_url is available but schema_literal is missing
        try:
          literal = self.get_schema_literal_from_url(row_value[15])
        except Exception as e:
          self.logger.error(str(e))
      elif row_value[14]:
        literal = row_value[14]

      # put in schema result
      if full_name not in self.table_dict:
        schema[db_idx]['tables'].append(
          {TableInfo.table_name: row_value[1], TableInfo.type: 'Table', TableInfo.serialization_format: row_value[2],
           TableInfo.dataset_name: row_value[1],
           TableInfo.native_name: row_value[1], TableInfo.logical_name: row_value[1], TableInfo.version: '0',
           TableInfo.create_time: row_value[3], TableInfo.db_id: row_value[4], TableInfo.table_id: row_value[5],
           TableInfo.serde_id: row_value[6], TableInfo.location: row_value[7], TableInfo.table_type: row_value[8],
           TableInfo.view_expended_text: row_value[9], TableInfo.input_format: row_value[10],
           TableInfo.output_format: row_value[11], TableInfo.is_compressed: row_value[12],
           TableInfo.is_storedassubdirectories: row_value[13], TableInfo.etl_source: 'SERDE_PARAMS',
           TableInfo.schema_literal: literal,
           TableInfo.schema_url: row_value[15],
           TableInfo.field_delimiter: row_value[16]})
        table_idx += 1
        self.table_dict[full_name] = table_idx

    self.logger.info("%s %6d tables processed for database %12s from SERDE_PARAM" % (
      datetime.datetime.now(), table_idx + 1, row_value[0]))

  def run(self, schema_output_file, sample_output_file=None, hdfs_map_output_file=None,
          hdfs_namenode_ipc_uri=None, kerberos_auth=False, kerberos_principal=None, keytab_file=None):
    """
    The entry point of the class: extract the schema, one database at a time.
    :param schema_output_file: output file
    :return:
    """
    schema = []

    # open(sample_output_file, 'wb')
    # os.chmod(sample_output_file, stat.S_IWOTH)
    # sample_file_writer = FileWriter(sample_output_file)
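
    # kerberos_auth may arrive as a string from the job configuration; normalize
    # it to a boolean (only the literal 'false' disables Kerberos authentication).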
    if type(kerberos_auth) == str:
      if kerberos_auth.lower() == 'false':
        kerberos_auth = False
      else:
        kerberos_auth = True

    self.schema_url_helper = SchemaUrlHelper.SchemaUrlHelper(hdfs_namenode_ipc_uri, kerberos_auth, kerberos_principal, keytab_file)

    for database_name in self.databases:
      self.logger.info("Collecting hive tables in database : " + database_name)

      # tables from Column V2
      rows = []
      begin = datetime.datetime.now().strftime("%H:%M:%S")
      rows.extend(self.get_table_info_from_v2(database_name, False))
      if len(rows) > 0:
        self.format_table_metadata_v2(rows, schema)
      end = datetime.datetime.now().strftime("%H:%M:%S")
      self.logger.info("Get Hive table info from COLUMN_V2 %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

      rows = []
      begin = datetime.datetime.now().strftime("%H:%M:%S")
      rows.extend(self.get_table_info_from_v2(database_name, True))
      if len(rows) > 0:
        self.format_table_metadata_v2(rows, schema)
      end = datetime.datetime.now().strftime("%H:%M:%S")
      self.logger.info("Get Dalids table info from COLUMN_V2 %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

      # tables from schemaLiteral
      rows = []
      begin = datetime.datetime.now().strftime("%H:%M:%S")
      rows.extend(self.get_table_info_from_serde_params(database_name))
      if len(rows) > 0:
        self.format_table_metadata_serde(rows, schema)
      end = datetime.datetime.now().strftime("%H:%M:%S")
      self.logger.info("Get table info from Serde %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

    with open(schema_output_file, 'wb') as schema_json_file:
      os.chmod(schema_output_file,
               stat.S_IWUSR | stat.S_IRUSR | stat.S_IWGRP | stat.S_IRGRP | stat.S_IWOTH | stat.S_IROTH)
      for item in schema:
        schema_json_file.write("{}\n".format(json.dumps(item, separators=(',', ':'))))

    # fetch hive (managed/external) table to hdfs path mapping
    begin = datetime.datetime.now().strftime("%H:%M:%S")
    rows = self.get_hdfs_map()
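    # Write the table -> HDFS path mapping as a CSV file, using the 0x1A (SUB)
    # control character as the field delimiter with quoting disabled.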
    with open(hdfs_map_output_file, 'wb') as hdfs_map_csv_file:
      os.chmod(hdfs_map_output_file,
               stat.S_IWUSR | stat.S_IRUSR | stat.S_IWGRP | stat.S_IRGRP | stat.S_IWOTH | stat.S_IROTH)
      hdfs_map_columns = ['db_name', 'table_name', 'cluster_uri', 'abstract_hdfs_path']
      csv_writer = csv.writer(hdfs_map_csv_file, delimiter='\x1a', lineterminator='\n', quoting=csv.QUOTE_NONE)
      csv_writer.writerow(hdfs_map_columns)
      for row in rows:
        try:
          csv_writer.writerow(row)
        except:
          self.logger.error("Error writing CSV row: " + str(row))
    end = datetime.datetime.now().strftime("%H:%M:%S")
    self.logger.info("Get hdfs map from SDS %12s [%s -> %s]\n" % (database_name, str(begin), str(end)))

  def get_all_databases(self, database_white_list, database_black_list):
    """
    Fetch all database names from the DBS table.
    :return:
    """
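    # A database is selected if it is explicitly whitelisted, or if it is neither
    # blacklisted nor matches the 'u\_%' name pattern.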
    database_white_list = ",".join(["'" + x + "'" for x in database_white_list.split(',')])
    database_black_list = ",".join(["'" + x + "'" for x in database_black_list.split(',')])
    fetch_all_database_names = "SELECT `NAME` FROM DBS WHERE `NAME` IN ({white_list}) OR NOT (`NAME` IN ({black_list}) OR `NAME` LIKE 'u\\_%')" \
      .format(white_list=database_white_list, black_list=database_black_list)
    self.logger.info(fetch_all_database_names)
    curs = self.conn_hms.cursor()
    curs.execute(fetch_all_database_names)
    rows = [item[0] for item in curs.fetchall()]
    curs.close()
    return rows

  def get_schema_literal_from_url(self, schema_url):
    """
    Fetch the avro schema literal from
    - an avsc file on HDFS, via hdfs/webhdfs
    - a json string in the schema registry, via http
    :param schema_url: e.g. hdfs://server:port/data/tracking/abc/_schema.avsc or
      http://schema-registry-vip-1:port/schemaRegistry/schemas/latest_with_type=xyz
    :return: json string of the avro schema
    """
    if schema_url.startswith('hdfs://') or schema_url.startswith('/') or schema_url.startswith('webhdfs://'):
      return self.schema_url_helper.get_from_hdfs(schema_url)
    elif schema_url.startswith('https://') or schema_url.startswith('http://'):
      return self.schema_url_helper.get_from_http(schema_url)
    else:
      self.logger.error("get_schema_literal_from_url() gets a bad input: %s" % schema_url)
      return ''

  def get_hdfs_map(self):
    """
    Fetch the mapping from HDFS location to Hive (managed and external) table.
    :return:
    """
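
    # The query below collapses date/version partition suffixes (yyyy/MM/dd[/HH],
    # datepartition=..., date_sk=..., ds=..., dt=..., epoch-style numeric directories,
    # versioned deployment directories) into an "abstract" HDFS path, so that many
    # partition locations map to one stable path per table.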
    hdfs_map_sql = """select db_name, tbl_name table_name, cluster_uri,
      cast(
        case when substring_index(hdfs_path, '/', -4) regexp '[0-9]{4}/[0-9]{2}/[0-9]{2}/[0-9]{2}'
             then substring(hdfs_path, 1, hdfs_path_len - length(substring_index(hdfs_path, '/', -4))-1)
             when substring_index(hdfs_path, '/', -3) regexp '[0-9]{4}/[0-9]{2}/[0-9]{2}'
             then substring(hdfs_path, 1, hdfs_path_len - length(substring_index(hdfs_path, '/', -3))-1)
             when substring_index(hdfs_path, '/', -1) regexp '20[0-9]{2}([\\._-]?[0-9][0-9]){2,5}'
               or substring_index(hdfs_path, '/', -1) regexp '1[3-6][0-9]{11}(-(PT|UTC|GMT|SCN)-[0-9]+)?'
               or substring_index(hdfs_path, '/', -1) regexp '[0-9]+([\\._-][0-9]+)?'
               or substring_index(hdfs_path, '/', -1) regexp '[vV][0-9]+([\\._-][0-9]+)?'
               or substring_index(hdfs_path, '/', -1) regexp '(prod|ei|qa|dev)_[0-9]+\\.[0-9]+\\.[0-9]+_20[01][0-9]([_-]?[0-9][0-9]){2,5}'
             then substring(hdfs_path, 1, hdfs_path_len - length(substring_index(hdfs_path, '/', -1))-1)
             when hdfs_path regexp '/datepartition=20[01][0-9]([\\._-]?[0-9][0-9]){2,3}'
               or hdfs_path regexp '/datepartition=[[:alnum:]]+'
             then substring(hdfs_path, 1, locate('/datepartition=', hdfs_path)-1)
             when hdfs_path regexp '/date_sk=20[01][0-9]([\\._-]?[0-9][0-9]){2,3}'
             then substring(hdfs_path, 1, locate('/date_sk=', hdfs_path)-1)
             when hdfs_path regexp '/ds=20[01][0-9]([\\._-]?[0-9][0-9]){2,3}'
             then substring(hdfs_path, 1, locate('/ds=', hdfs_path)-1)
             when hdfs_path regexp '/dt=20[01][0-9]([\\._-]?[0-9][0-9]){2,3}'
             then substring(hdfs_path, 1, locate('/dt=', hdfs_path)-1)
             when hdfs_path regexp '^/[[:alnum:]]+/[[:alnum:]]+/[[:alnum:]]+/20[01][0-9]([\\._-]?[0-9][0-9]){2,5}/'
             then concat(substring_index(hdfs_path, '/', 4), '/*',
                         substring(hdfs_path, length(substring_index(hdfs_path, '/', 5))+1))
             when hdfs_path regexp '^/[[:alnum:]]+/[[:alnum:]]+/20[01][0-9]([\\._-]?[0-9][0-9]){2,5}/'
             then concat(substring_index(hdfs_path, '/', 3), '/*',
                         substring(hdfs_path, length(substring_index(hdfs_path, '/', 4))+1))
             when substring_index(hdfs_path, '/', -3) regexp '^(prod|ei|qa|dev)_[0-9]+\\.[0-9]+\\.[0-9]+_20[01][0-9]([\\._-]?[0-9][0-9]){2,5}/'
             then concat(substring(hdfs_path, 1, hdfs_path_len - length(substring_index(hdfs_path, '/', -3))-1), '/*/',
                         substring_index(hdfs_path, '/', -2))
             when substring_index(hdfs_path, '/', -2) regexp '^(prod|ei|qa|dev)_[0-9]+\\.[0-9]+\\.[0-9]+_20[01][0-9]([\\._-]?[0-9][0-9]){2,5}/'
             then concat(substring(hdfs_path, 1, hdfs_path_len - length(substring_index(hdfs_path, '/', -2))-1), '/*/',
                         substring_index(hdfs_path, '/', -1))
             else hdfs_path
        end as char(300)) abstract_hdfs_path
      from (
        select d.NAME DB_NAME, t.TBL_NAME,
          substring_index(s.LOCATION, '/', 3) cluster_uri,
          substring(s.LOCATION, length(substring_index(s.LOCATION, '/', 3))+1) hdfs_path,
          length(s.LOCATION) - length(substring_index(s.LOCATION, '/', 3)) as hdfs_path_len
        from SDS s
        join TBLS t on s.SD_ID = t.SD_ID
        join DBS d on t.DB_ID = d.DB_ID
        left join PARTITIONS p on t.TBL_ID = p.TBL_ID
        where p.PART_ID is null
          and LOCATION is not null
        union all
        select d.NAME DB_NAME, t.TBL_NAME,
          substring_index(s.LOCATION, '/', 3) cluster_uri,
          substring(s.LOCATION, length(substring_index(s.LOCATION, '/', 3))+1) hdfs_path,
          length(s.LOCATION) - length(substring_index(s.LOCATION, '/', 3)) as hdfs_path_len
        from SDS s
        join (select TBL_ID, MAX(SD_ID) SD_ID from PARTITIONS group by 1) p on s.SD_ID = p.SD_ID
        join TBLS t on p.TBL_ID = t.TBL_ID
        join DBS d on t.DB_ID = d.DB_ID
        where not LOCATION like 'hdfs:%__HIVE_DEFAULT_PARTITION__%'
      ) x where hdfs_path not like '/tmp/%'
      order by 1,2"""

    curs = self.conn_hms.cursor()
    curs.execute(hdfs_map_sql)
    rows = curs.fetchall()
    curs.close()
    self.logger.info("%6d Hive table => HDFS path mapping relations found." % len(rows))
    return rows


if __name__ == "__main__":
  args = sys.argv[1]
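
  # args is expected to be a dict-like job-configuration object supplied by the
  # WhereHows ETL launcher (it is indexed by Constant keys below), not a plain string.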
  # connection
  username = args[Constant.HIVE_METASTORE_USERNAME]
  password = args[Constant.HIVE_METASTORE_PASSWORD]
  jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
  jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]

  if Constant.HIVE_DATABASE_WHITELIST_KEY in args:
    database_white_list = args[Constant.HIVE_DATABASE_WHITELIST_KEY]
  else:
    database_white_list = "''"

  if Constant.HIVE_DATABASE_BLACKLIST_KEY in args:
    database_black_list = args[Constant.HIVE_DATABASE_BLACKLIST_KEY]
  else:
    database_black_list = "''"

  e = HiveExtract()
  e.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)

  keytab_file = args[Constant.KERBEROS_KEYTAB_FILE_KEY]
  krb5_dir = os.getenv("WHZ_KRB5_DIR")
  if keytab_file and krb5_dir:
    keytab_file = os.path.join(krb5_dir, keytab_file)

  try:
    e.databases = e.get_all_databases(database_white_list, database_black_list)
    e.run(args[Constant.HIVE_SCHEMA_JSON_FILE_KEY],
          None,
          args[Constant.HIVE_HDFS_MAP_CSV_FILE_KEY],
          args[Constant.HDFS_NAMENODE_IPC_URI_KEY],
          args[Constant.KERBEROS_AUTH_KEY],
          args[Constant.KERBEROS_PRINCIPAL_KEY],
          keytab_file)
  finally:
    e.conn_hms.close()