Mirror of https://github.com/datahub-project/datahub.git (synced 2025-07-28 20:09:59 +00:00)

Start adding Java ETL examples, starting with Kafka ETL. We've had a few requests to provide Java examples rather than Python due to type safety. I've also started adding these to metadata-ingestion-examples to make it clearer that these are *examples*. They can be used directly or as a basis for other things. As we port to Java, we'll move examples to contrib.
67 lines · 3.5 KiB · Python · Executable File
#! /usr/bin/env nix-shell
#! nix-shell dataset-hive-generator.py.nix -i python

import sys
import time
from pyhive import hive
from TCLIService.ttypes import TOperationState

import simplejson as json

HIVESTORE='localhost'

AVROLOADPATH = '../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc'
KAFKATOPIC = 'MetadataChangeEvent_v4'
BOOTSTRAP = 'localhost:9092'
SCHEMAREGISTRY = 'http://localhost:8081'

def hive_query(query):
    """
    Execute the query to the HiveStore.
    """
    cursor = hive.connect(HIVESTORE).cursor()
    cursor.execute(query, async_=True)
    status = cursor.poll().operationState
    while status in (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE):
        logs = cursor.fetch_logs()
        for message in logs:
            sys.stdout.write(message)
        status = cursor.poll().operationState
    results = cursor.fetchall()
    return results

def build_hive_dataset_mce(dataset_name, schema, metadata):
    """
    Build the MetadataChangeEvent for dataset_name from its schema and metadata.
    """
    # Positionally parse the owner, creation time and upstream dataset out of the 'describe extended' metadata.
    actor, type, created_time, upstreams_dataset, sys_time = "urn:li:corpuser:" + metadata[2][7:], str(metadata[-1][11:-1]), int(metadata[3][12:]), metadata[-28][10:], int(time.time())
    owners = {"owners":[{"owner":actor,"type":"DATAOWNER"}],"lastModified":{"time":sys_time,"actor":actor}}
    upstreams = {"upstreams":[{"auditStamp":{"time":sys_time,"actor":actor},"dataset":"urn:li:dataset:(urn:li:dataPlatform:hive," + upstreams_dataset + ",PROD)","type":"TRANSFORMED"}]}
    elements = {"elements":[{"url":HIVESTORE,"description":"sample doc to describe upstreams","createStamp":{"time":sys_time,"actor":actor}}]}
    schema_name = {"schemaName":dataset_name,"platform":"urn:li:dataPlatform:hive","version":0,"created":{"time":created_time,"actor":actor},
                   "lastModified":{"time":sys_time,"actor":actor},"hash":"","platformSchema":{"com.linkedin.pegasus2avro.schema.OtherSchema": {"rawSchema": schema}},
                   "fields":[{"fieldPath":"","description":{"string":""},"nativeDataType":"string","type":{"type":{"com.linkedin.pegasus2avro.schema.StringType":{}}}}]}

    mce = {"auditHeader": None,
           "proposedSnapshot":{"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot":
               {"urn": "urn:li:dataset:(urn:li:dataPlatform:hive,"+ dataset_name +",PROD)",
                "aspects": [
                    {"com.linkedin.pegasus2avro.common.Ownership": owners},
                    {"com.linkedin.pegasus2avro.dataset.UpstreamLineage": upstreams},
                    {"com.linkedin.pegasus2avro.common.InstitutionalMemory": elements},
                    {"com.linkedin.pegasus2avro.schema.SchemaMetadata": schema_name}
                ]}},
           "proposedDelta": None}

    print(json.dumps(mce))

# Walk every database and table in the Hive metastore and emit an MCE for each one.
databases = hive_query('show databases')
for database in databases:
    tables = hive_query('show tables in ' + database[0])
    for table in tables:
        dataset_name = database[0] + '.' + table[0]
        description = hive_query('describe extended ' + dataset_name)
        build_hive_dataset_mce(dataset_name, str(description[:-1][:-1]), description[-1][1].split(','))

sys.exit(0)
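
The constants AVROLOADPATH, KAFKATOPIC, BOOTSTRAP and SCHEMAREGISTRY are declared but unused in this listing, which only prints each MCE to stdout. Below is a minimal sketch of how such an MCE could instead be published to the configured Kafka topic, assuming the confluent-kafka Python client with Avro support is installed; the produce_mce helper is hypothetical and not part of the file above.

from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer

def produce_mce(mce):
    # Hypothetical sketch: load the MetadataChangeEvent Avro schema referenced by
    # AVROLOADPATH and publish the MCE to the topic configured at the top of the script.
    value_schema = avro.load(AVROLOADPATH)
    producer = AvroProducer({'bootstrap.servers': BOOTSTRAP,
                             'schema.registry.url': SCHEMAREGISTRY},
                            default_value_schema=value_schema)
    producer.produce(topic=KAFKATOPIC, value=mce)
    producer.flush()

With something like this in place, build_hive_dataset_mce could call produce_mce(mce) instead of print(json.dumps(mce)).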