# datahub/metadata-ingestion/hive-etl/hive_etl.py

#! /usr/bin/python
import sys
import time
from pyhive import hive
from TCLIService.ttypes import TOperationState

# Hive host handed to hive.connect(); also reused as the "url" of the
# documentation element attached to every dataset.
HIVESTORE='localhost'

# Relative path of the MetadataChangeEvent Avro schema loaded with avro.load().
AVROLOADPATH = '../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc'
# Kafka topic the MetadataChangeEvent records are produced to.
KAFKATOPIC = 'MetadataChangeEvent'
# Value of the producer's 'bootstrap.servers' setting.
BOOTSTRAP = 'localhost:9092'
# Value of the producer's 'schema.registry.url' setting.
SCHEMAREGISTRY = 'http://localhost:8081'

def hive_query(query):
    """
    Execute the query against the HiveStore and return all of its rows.

    The statement is submitted asynchronously. While Hive reports the
    operation as initialized or running, the log lines it has produced so
    far are echoed to stdout. Once it leaves those states the complete
    result set is fetched.

    The connection and cursor opened for the query are always closed before
    returning, including when the query raises.

    :param query: the HiveQL statement to run.
    :return: the rows returned by ``cursor.fetchall()``.
    """
    # Local import, in keeping with the other function-scope imports in this file.
    from contextlib import closing

    pending_states = (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE)

    # closing() guarantees close() is called on both objects, so repeated
    # calls do not leak one Hive session per query.
    with closing(hive.connect(HIVESTORE)) as connection, closing(connection.cursor()) as cursor:
        # PyHive names the keyword ``async_`` because ``async`` is a reserved
        # word from Python 3.7 on; ``async=True`` is a syntax error there.
        cursor.execute(query, async_=True)
        status = cursor.poll().operationState
        while status in pending_states:
            for message in cursor.fetch_logs():
                sys.stdout.write(message)
            status = cursor.poll().operationState
        return cursor.fetchall()

def build_hive_dataset_mce(dataset_name, schema, metadata):
    """
    Create the MetadataChangeEvent for one Hive table and hand it to the producer.

    :param dataset_name: name used in the dataset URN and as the schema name.
    :param schema: string stored as the ``OtherSchema`` platform schema.
    :param metadata: sequence of strings; the caller passes the comma-split
        last row of ``describe extended``. Entries are read by fixed position
        and fixed prefix length, so an unexpected layout yields wrong values
        or an ``IndexError``.
    :return: None; the event is passed to ``produce_hive_dataset_mce``.
    """
    # NOTE(review): the slice offsets below look like they strip " owner:",
    # " tableType:" (plus a trailing ")"), " createTime:" and " location:"
    # prefixes -- confirm against real `describe extended` output.
    actor = "urn:li:corpuser:" + metadata[2][7:]
    upstream_type = str(metadata[-1][11:-1])
    # int() replaces the Python 2-only long(); on Python 2 int() promotes to
    # long by itself when the value does not fit a machine integer.
    created_time = int(metadata[3][12:])
    upstreams_dataset = metadata[-28][10:]
    sys_time = int(time.time())

    # Audit stamp shared by every aspect that records "now" as its time.
    now_stamp = {"time": sys_time, "actor": actor}

    owners = {"owners": [{"owner": actor, "type": "DATAOWNER"}],
              "lastModified": dict(now_stamp)}
    upstreams = {"upstreams": [{
        "auditStamp": dict(now_stamp),
        "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive," + upstreams_dataset + ",PROD)",
        "type": upstream_type}]}
    elements = {"elements": [{
        "url": HIVESTORE,
        "description": "sample doc to describe upstreams",
        "createStamp": dict(now_stamp)}]}
    schema_name = {
        "schemaName": dataset_name,
        "platform": "urn:li:dataPlatform:hive",
        "version": 0,
        "created": {"time": created_time, "actor": actor},
        "lastModified": dict(now_stamp),
        "hash": "",
        "platformSchema": {"OtherSchema": schema},
        # A single placeholder string field; individual columns are not modelled.
        "fields": [{"fieldPath": "", "description": "", "nativeDataType": "string",
                    "type": {"type": {"com.linkedin.pegasus2avro.schema.StringType": {}}}}]}

    # The snapshot is a (type name, value) tuple, kept exactly as the original built it.
    mce = {"auditHeader": None,
           "proposedSnapshot": ("com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot",
                                {"urn": "urn:li:dataset:(urn:li:dataPlatform:hive," + dataset_name + ",PROD)",
                                 "aspects": [owners, upstreams, elements, schema_name]}),
           "proposedDelta": None}

    produce_hive_dataset_mce(mce)

def produce_hive_dataset_mce(mce):
    """
    Send one MetadataChangeEvent record to the Kafka topic.

    A fresh AvroProducer is built on every call from the module-level
    broker, schema-registry and schema-path settings. A ValueError raised
    while producing is reported on stdout instead of being propagated; the
    producer is flushed afterwards in either case.

    :param mce: the MetadataChangeEvent value to serialize and send.
    """
    from confluent_kafka import avro
    from confluent_kafka.avro import AvroProducer

    producer_settings = {}
    producer_settings['bootstrap.servers'] = BOOTSTRAP
    producer_settings['schema.registry.url'] = SCHEMAREGISTRY

    value_schema = avro.load(AVROLOADPATH)
    mce_producer = AvroProducer(producer_settings, default_value_schema=value_schema)

    try:
        mce_producer.produce(topic=KAFKATOPIC, value=mce)
        # Give the client a chance to run any pending callbacks without blocking.
        mce_producer.poll(0)
        success_note = '\n%s has been successfully produced!\n' % mce
        sys.stdout.write(success_note)
    except ValueError as error:
        failure_note = 'Message serialization failed %s' % error
        sys.stdout.write(failure_note)
    mce_producer.flush()

# Walk every database and every table Hive lists, and emit one
# MetadataChangeEvent per table.
for db_row in hive_query('show databases'):
    db_name = db_row[0]
    for table_row in hive_query('show tables in ' + db_name):
        dataset_name = db_name + '.' + table_row[0]
        described = hive_query('describe extended ' + dataset_name)
        # Everything except the last two rows is passed on as the schema text.
        schema_rows = described[:-2]
        # The second column of the last row is split on commas and used as
        # the table metadata (presumably the detailed table information).
        detail_fields = described[-1][1].split(',')
        build_hive_dataset_mce(dataset_name, str(schema_rows), detail_fields)

sys.exit(0)