import lkml
import glob
import time
import typing
import os
import re
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
from dataclasses import dataclass, replace
from sql_metadata import get_query_tables

# Configuration
AVSC_PATH = "../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc"
KAFKA_TOPIC = 'MetadataChangeEvent_v4'

# LOOKER_DIRECTORY = "./test_lookml"
LOOKER_DIRECTORY = os.environ["LOOKER_DIRECTORY"]
LOOKER_DIRECTORY = os.path.abspath(LOOKER_DIRECTORY)

EXTRA_KAFKA_CONF = {
    'bootstrap.servers': 'localhost:9092',
    'schema.registry.url': 'http://localhost:8081'
    # 'security.protocol': 'SSL',
    # 'ssl.ca.location': '',
    # 'ssl.key.location': '',
    # 'ssl.certificate.location': ''
}

# The datahub platform where looker views are stored
LOOKER_VIEW_PLATFORM = "looker_views"
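
# Typical invocation (hypothetical path; assumes the Kafka broker and schema
# registry configured above are reachable):
#   LOOKER_DIRECTORY=./my_lookml python this_script.py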


class LookerViewFileLoader:
    """
    Loads the looker viewfile at a path and caches the LookerViewFile in memory.
    This is to avoid reloading the same file off of disk many times during the recursive include resolution process.
    """

    def __init__(self):
        self.viewfile_cache = {}

    def _load_viewfile(self, path: str) -> typing.Optional["LookerViewFile"]:
        if path in self.viewfile_cache:
            return self.viewfile_cache[path]
        try:
            with open(path, "r") as file:
                parsed = lkml.load(file)
                looker_viewfile = LookerViewFile.from_looker_dict(path, parsed)
                self.viewfile_cache[path] = looker_viewfile
                return looker_viewfile
        except Exception as e:
            print(e)
            print(f"Error processing view file {path}. Skipping it")
            return None

    def load_viewfile(self, path: str, connection: str):
        viewfile = self._load_viewfile(path)
        if viewfile is None:
            return None
        return replace(viewfile, connection=connection)
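
# Usage sketch (hypothetical path): the loader caches the parsed file, and
# load_viewfile() returns a copy with the connection filled in via
# dataclasses.replace(), so the cached entry is never mutated.
#   loader = LookerViewFileLoader()
#   vf = loader.load_viewfile("/abs/path/orders.view.lkml", connection="presto")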


@dataclass
class LookerModel:
    connection: str
    includes: typing.List[str]
    resolved_includes: typing.List[str]

    @staticmethod
    def from_looker_dict(looker_model_dict):
        connection = looker_model_dict["connection"]
        includes = looker_model_dict["includes"]
        resolved_includes = LookerModel.resolve_includes(includes)
        return LookerModel(connection=connection, includes=includes, resolved_includes=resolved_includes)

    @staticmethod
    def resolve_includes(includes) -> typing.List[str]:
        resolved = []
        for inc in includes:
            # Massage the looker include into a valid glob wildcard expression
            glob_expr = f"{LOOKER_DIRECTORY}/{inc}"
            outputs = glob.glob(glob_expr)
            resolved.extend(outputs)
        return resolved
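
# e.g. an include of "views/*.view.lkml" expands to the glob expression
# f"{LOOKER_DIRECTORY}/views/*.view.lkml" and resolves to every matching file.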


@dataclass
class LookerViewFile:
    absolute_file_path: str
    connection: typing.Optional[str]
    includes: typing.List[str]
    resolved_includes: typing.List[str]
    views: typing.List[typing.Dict]

    @staticmethod
    def from_looker_dict(absolute_file_path, looker_view_file_dict):
        includes = looker_view_file_dict.get("includes", [])
        resolved_includes = LookerModel.resolve_includes(includes)
        views = looker_view_file_dict.get("views", [])
        return LookerViewFile(absolute_file_path=absolute_file_path, connection=None, includes=includes, resolved_includes=resolved_includes, views=views)


@dataclass
class LookerView:
    absolute_file_path: str
    connection: str
    view_name: str
    sql_table_names: typing.List[str]

    def get_relative_file_path(self):
        if LOOKER_DIRECTORY in self.absolute_file_path:
            return self.absolute_file_path.replace(LOOKER_DIRECTORY, '').lstrip('/')
        else:
            raise Exception(f"Found a looker view with name: {self.view_name} at path: {self.absolute_file_path} not underneath the base LOOKER_DIRECTORY: {LOOKER_DIRECTORY}. This should not happen")
    @staticmethod
    def from_looker_dict(looker_view, connection: str, looker_viewfile: LookerViewFile, looker_viewfile_loader: LookerViewFileLoader) -> typing.Optional["LookerView"]:
        view_name = looker_view["name"]
        sql_table_name = looker_view.get("sql_table_name", None)
        # Some sql_table_name fields contain quotes like: optimizely."group", just remove the quotes
        sql_table_name = sql_table_name.replace('"', '') if sql_table_name is not None else None
        derived_table = looker_view.get("derived_table", None)

        # Parse SQL from derived tables to extract dependencies
        if derived_table is not None and 'sql' in derived_table:
            # Get the list of tables in the query
            sql_tables: typing.List[str] = get_query_tables(derived_table['sql'])
            # Remove temporary tables defined in WITH statements (CTEs)
            sql_table_names = [
                t for t in sql_tables
                if not re.search(rf'WITH(.*,)?\s+{t}(\s*\([\w\s,]+\))?\s+AS\s+\(', derived_table['sql'], re.IGNORECASE | re.DOTALL)
            ]
            # Remove quotes from tables
            sql_table_names = [t.replace('"', '') for t in sql_table_names]
            return LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=sql_table_names)
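
        # For example, given a derived-table query like (hypothetical SQL)
        #   WITH recent AS (SELECT * FROM warehouse.orders) SELECT * FROM recent
        # the filter above drops the CTE name "recent" and keeps only
        # "warehouse.orders" as a real upstream dependency.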

        # There is a single dependency in the view, on the sql_table_name
        if sql_table_name is not None:
            return LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=[sql_table_name])

        # The sql_table_name might be defined in another view that this view extends, so try to find it
        extends = looker_view.get("extends", [])
        if len(extends) == 0:
            # The view is malformed: it is not a derived table and contains neither a sql_table_name nor an extends
            print(f"Skipping malformed view with view_name: {view_name}. View should have a sql_table_name if it is not a derived table")
            return None

        extends_to_looker_view = []

        # The base view could live in the same file
        for raw_view in looker_viewfile.views:
            raw_view_name = raw_view["name"]
            # Make sure to skip loading the view we are currently trying to resolve
            if raw_view_name != view_name:
                maybe_looker_view = LookerView.from_looker_dict(raw_view, connection, looker_viewfile, looker_viewfile_loader)
                if maybe_looker_view is not None and maybe_looker_view.view_name in extends:
                    extends_to_looker_view.append(maybe_looker_view)

        # Or it could live in one of the included files; we do not know which file the base view lives in, so try them all!
        for include in looker_viewfile.resolved_includes:
            # Use a distinct name here so we do not shadow the viewfile the current view came from
            included_viewfile = looker_viewfile_loader.load_viewfile(include, connection)
            if included_viewfile is None:
                continue
            for view in included_viewfile.views:
                maybe_looker_view = LookerView.from_looker_dict(view, connection, included_viewfile, looker_viewfile_loader)
                if maybe_looker_view is not None and maybe_looker_view.view_name in extends:
                    extends_to_looker_view.append(maybe_looker_view)

        if len(extends_to_looker_view) != 1:
            print(f"Skipping malformed view with view_name: {view_name}. View should have a single view in a view inheritance chain with a sql_table_name")
            return None

        output_looker_view = LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=extends_to_looker_view[0].sql_table_names)
        return output_looker_view
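
# Example of the inheritance case resolved above (hypothetical LookML): a view
# "orders_extended" with extends: [base_orders] and no sql_table_name of its
# own inherits its sql_table_names from the single base view "base_orders".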


def get_platform_and_table(view_name: str, connection: str, sql_table_name: str):
    """
    This will depend on what database connections you use in Looker.
    For SpotHero, we had two database connections in Looker: "redshift_test" (a redshift database) and "presto" (a presto database).
    Presto supports querying across multiple catalogs, so we infer which underlying database presto is using based on the presto catalog name.
    For SpotHero, we have 3 catalogs in presto: "redshift", "hive", and "hive_emr".
    """
    if connection == "redshift_test":
        platform = "redshift"
        table_name = sql_table_name
        return platform, table_name
    elif connection == "presto":
        parts = sql_table_name.split(".")
        catalog = parts[0]
        if catalog == "hive":
            platform = "hive"
        elif catalog == "hive_emr":
            platform = "hive_emr"
        elif catalog == "redshift":
            platform = "redshift"
        else:
            # Looker lets you exclude a catalog and use a configured default; the default we have configured is to use hive_emr
            if sql_table_name.count(".") != 1:
                raise Exception(f"Unknown catalog for sql_table_name: {sql_table_name} for view_name: {view_name}")
            platform = "hive_emr"
            return platform, sql_table_name
        table_name = ".".join(parts[1:])
        return platform, table_name
    else:
        raise Exception(f"Could not find a platform for looker view with connection: {connection}")
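
# Examples (using the connection names from the docstring above):
#   get_platform_and_table("v", "redshift_test", "public.users") -> ("redshift", "public.users")
#   get_platform_and_table("v", "presto", "hive.db.events")      -> ("hive", "db.events")
#   get_platform_and_table("v", "presto", "db.events")           -> ("hive_emr", "db.events")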


def construct_datalineage_urn(view_name: str, connection: str, sql_table_name: str):
    platform, table_name = get_platform_and_table(view_name, connection, sql_table_name)
    return f"urn:li:dataset:(urn:li:dataPlatform:{platform},{table_name},PROD)"


def construct_data_urn(looker_view: LookerView):
    return f"urn:li:dataset:(urn:li:dataPlatform:{LOOKER_VIEW_PLATFORM},{looker_view.view_name},PROD)"
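
# e.g. a looker view named "orders" gets the dataset urn:
#   urn:li:dataset:(urn:li:dataPlatform:looker_views,orders,PROD)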


def build_dataset_mce(looker_view: LookerView):
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    actor, sys_time = "urn:li:corpuser:etl", int(time.time()) * 1000
    upstreams = [{
        "auditStamp": {
            "time": sys_time,
            "actor": actor
        },
        "dataset": construct_datalineage_urn(looker_view.view_name, looker_view.connection, sql_table_name),
        "type": "TRANSFORMED"
    } for sql_table_name in looker_view.sql_table_names]

    doc_elements = [{
        "url": f"https://github.com/spothero/internal-looker-repo/blob/master/{looker_view.get_relative_file_path()}",
        "description": "Github looker view definition",
        "createStamp": {
            "time": sys_time,
            "actor": actor
        }
    }]

    owners = [{
        "owner": "urn:li:corpuser:analysts",
        "type": "DEVELOPER"
    }]

    return {
        "auditHeader": None,
        "proposedSnapshot": ("com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot", {
            "urn": construct_data_urn(looker_view),
            "aspects": [
                ("com.linkedin.pegasus2avro.dataset.UpstreamLineage", {"upstreams": upstreams}),
                ("com.linkedin.pegasus2avro.common.InstitutionalMemory", {"elements": doc_elements}),
                ("com.linkedin.pegasus2avro.common.Ownership", {
                    "owners": owners,
                    "lastModified": {
                        "time": sys_time,
                        "actor": actor
                    }
                })
            ]
        }),
        "proposedDelta": None
    }
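
# Note: the ("com.linkedin.pegasus2avro...", {...}) tuples above pair each
# aspect with its fully-qualified Avro record name; this appears to be how the
# script disambiguates which branch of the MCE aspect union is being sent.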


def delivery_report(err, msg):
    """ Called once for each message produced to indicate delivery result.
        Triggered by poll() or flush(). """
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))


def make_kafka_producer(extra_kafka_conf):
    conf = {
        "on_delivery": delivery_report,
        **extra_kafka_conf
    }
    key_schema = avro.loads('{"type": "string"}')
    record_schema = avro.load(AVSC_PATH)
    producer = AvroProducer(conf, default_key_schema=key_schema, default_value_schema=record_schema)
    return producer
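
# Sketch of manual use (assumes a reachable broker and schema registry):
#   producer = make_kafka_producer(EXTRA_KAFKA_CONF)
#   producer.produce(topic=KAFKA_TOPIC, key="urn:li:dataset:(...)", value=mce)
#   producer.flush()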


def main():
    kafka_producer = make_kafka_producer(EXTRA_KAFKA_CONF)
    viewfile_loader = LookerViewFileLoader()

    looker_models = []
    all_views = []

    model_files = sorted(f for f in glob.glob(f"{LOOKER_DIRECTORY}/**/*.model.lkml", recursive=True))
    for f in model_files:
        try:
            with open(f, 'r') as file:
                parsed = lkml.load(file)
                looker_model = LookerModel.from_looker_dict(parsed)
                looker_models.append(looker_model)
        except Exception as e:
            print(e)
            print(f"Error processing model file {f}. Skipping it")

    for model in looker_models:
        for include in model.resolved_includes:
            looker_viewfile = viewfile_loader.load_viewfile(include, model.connection)
            if looker_viewfile is not None:
                for raw_view in looker_viewfile.views:
                    maybe_looker_view = LookerView.from_looker_dict(raw_view, model.connection, looker_viewfile, viewfile_loader)
                    if maybe_looker_view:
                        all_views.append(maybe_looker_view)

    for view in all_views:
        MCE = build_dataset_mce(view)
        print(view)
        print(MCE)
        kafka_producer.produce(topic=KAFKA_TOPIC, key=MCE['proposedSnapshot'][1]['urn'], value=MCE)

    kafka_producer.flush()


if __name__ == "__main__":
    main()