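"""
Scrape a local LookML repository and publish dataset metadata to DataHub via Kafka.

For every *.model.lkml file under LOOKER_DIRECTORY, this script parses the model
with lkml, resolves its includes to view files, works out the upstream tables each
view depends on (from sql_table_name, derived-table SQL, or an extended base view),
and produces one MetadataChangeEvent per view onto the configured Kafka topic.
"""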

import glob
import os
import re
import time
import typing
from dataclasses import dataclass, replace

import lkml
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
from sql_metadata import get_query_tables

# Configuration
AVSC_PATH = "../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc"
KAFKA_TOPIC = 'MetadataChangeEvent_v4'
# LOOKER_DIRECTORY = "./test_lookml"
LOOKER_DIRECTORY = os.path.abspath(os.environ["LOOKER_DIRECTORY"])
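# The endpoints below assume Kafka and the schema registry from a local DataHub
# deployment with default ports; point them at your own brokers (and, if needed,
# uncomment and fill in the SSL settings) before running against a real cluster.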
EXTRA_KAFKA_CONF = {
    'bootstrap.servers': 'localhost:9092',
    'schema.registry.url': 'http://localhost:8081'
    # 'security.protocol': 'SSL',
    # 'ssl.ca.location': '',
    # 'ssl.key.location': '',
    # 'ssl.certificate.location': ''
}
# The datahub platform where looker views are stored
LOOKER_VIEW_PLATFORM = "looker_views"


class LookerViewFileLoader:
    """
    Loads the Looker view file at a given path and caches the parsed LookerViewFile
    in memory. This avoids reloading the same file from disk many times during the
    recursive include-resolution process.
    """

    def __init__(self):
        self.viewfile_cache: typing.Dict[str, "LookerViewFile"] = {}

    def _load_viewfile(self, path: str) -> typing.Optional["LookerViewFile"]:
        if path in self.viewfile_cache:
            return self.viewfile_cache[path]
        try:
            with open(path, "r") as file:
                parsed = lkml.load(file)
                looker_viewfile = LookerViewFile.from_looker_dict(path, parsed)
                self.viewfile_cache[path] = looker_viewfile
                return looker_viewfile
        except Exception as e:
            print(e)
            print(f"Error processing view file {path}. Skipping it")
            return None

    def load_viewfile(self, path: str, connection: str) -> typing.Optional["LookerViewFile"]:
        viewfile = self._load_viewfile(path)
        if viewfile is None:
            return None
        return replace(viewfile, connection=connection)


@dataclass
class LookerModel:
    connection: str
    includes: typing.List[str]
    resolved_includes: typing.List[str]

    @staticmethod
    def from_looker_dict(looker_model_dict) -> "LookerModel":
        connection = looker_model_dict["connection"]
        includes = looker_model_dict["includes"]
        resolved_includes = LookerModel.resolve_includes(includes)
        return LookerModel(connection=connection, includes=includes, resolved_includes=resolved_includes)

    @staticmethod
    def resolve_includes(includes: typing.List[str]) -> typing.List[str]:
        resolved = []
        for inc in includes:
            # Massage the Looker include into a valid glob wildcard expression
            glob_expr = f"{LOOKER_DIRECTORY}/{inc}"
            resolved.extend(glob.glob(glob_expr))
        return resolved


@dataclass
class LookerViewFile:
    absolute_file_path: str
    connection: typing.Optional[str]
    includes: typing.List[str]
    resolved_includes: typing.List[str]
    views: typing.List[typing.Dict]

    @staticmethod
    def from_looker_dict(absolute_file_path: str, looker_view_file_dict) -> "LookerViewFile":
        includes = looker_view_file_dict.get("includes", [])
        resolved_includes = LookerModel.resolve_includes(includes)
        views = looker_view_file_dict.get("views", [])
        return LookerViewFile(
            absolute_file_path=absolute_file_path,
            connection=None,
            includes=includes,
            resolved_includes=resolved_includes,
            views=views,
        )


@dataclass
class LookerView:
    absolute_file_path: str
    connection: str
    view_name: str
    sql_table_names: typing.List[str]

    def get_relative_file_path(self) -> str:
        if LOOKER_DIRECTORY in self.absolute_file_path:
            return self.absolute_file_path.replace(LOOKER_DIRECTORY, '').lstrip('/')
        raise Exception(
            f"Found a looker view with name: {self.view_name} at path: {self.absolute_file_path} "
            f"not underneath the base LOOKER_DIRECTORY: {LOOKER_DIRECTORY}. This should not happen"
        )

    @staticmethod
    def from_looker_dict(looker_view, connection: str, looker_viewfile: LookerViewFile, looker_viewfile_loader: LookerViewFileLoader) -> typing.Optional["LookerView"]:
        view_name = looker_view["name"]
        sql_table_name = looker_view.get("sql_table_name", None)
        # Some sql_table_name fields contain quotes, like: optimizely."group"; just remove the quotes
        sql_table_name = sql_table_name.replace('"', '') if sql_table_name is not None else None
        derived_table = looker_view.get("derived_table", None)
        # Parse SQL from derived tables to extract dependencies
        if derived_table is not None and 'sql' in derived_table:
            # Get the list of tables referenced by the query
            sql_tables: typing.List[str] = get_query_tables(derived_table['sql'])
            # Drop temporary tables defined by the query's own WITH (CTE) clauses
            sql_table_names = [
                t for t in sql_tables
                if not re.search(
                    rf'WITH(.*,)?\s+{re.escape(t)}(\s*\([\w\s,]+\))?\s+AS\s+\(',
                    derived_table['sql'],
                    re.IGNORECASE | re.DOTALL,
                )
            ]
            # Remove quotes from table names
            sql_table_names = [t.replace('"', '') for t in sql_table_names]
            return LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=sql_table_names)
        # There is a single dependency in the view, on the sql_table_name
        if sql_table_name is not None:
            return LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=[sql_table_name])
        # Otherwise the sql_table_name might be defined in a base view that this view extends; try to find it
        extends = looker_view.get("extends", [])
        if len(extends) == 0:
            # The view is malformed: it is not a derived table and has neither a sql_table_name nor an extends
            print(f"Skipping malformed view with view_name: {view_name}. View should have a sql_table_name if it is not a derived table")
            return None
        extends_to_looker_view = []
        # The base view could live in the same file
        for raw_view in looker_viewfile.views:
            raw_view_name = raw_view["name"]
            # Make sure to skip the view we are currently trying to resolve
            if raw_view_name != view_name:
                maybe_looker_view = LookerView.from_looker_dict(raw_view, connection, looker_viewfile, looker_viewfile_loader)
                if maybe_looker_view is not None and maybe_looker_view.view_name in extends:
                    extends_to_looker_view.append(maybe_looker_view)
        # Or it could live in one of the included files; we do not know which file the
        # base view lives in, so try them all. Use a separate variable for the included
        # file so we do not clobber looker_viewfile, which is still needed below.
        for include in looker_viewfile.resolved_includes:
            included_viewfile = looker_viewfile_loader.load_viewfile(include, connection)
            if included_viewfile is None:
                continue
            for view in included_viewfile.views:
                maybe_looker_view = LookerView.from_looker_dict(view, connection, included_viewfile, looker_viewfile_loader)
                if maybe_looker_view is not None and maybe_looker_view.view_name in extends:
                    extends_to_looker_view.append(maybe_looker_view)
        if len(extends_to_looker_view) != 1:
            print(f"Skipping malformed view with view_name: {view_name}. View should have exactly one view with a sql_table_name in its inheritance chain")
            return None
        return LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=extends_to_looker_view[0].sql_table_names)


def get_platform_and_table(view_name: str, connection: str, sql_table_name: str) -> typing.Tuple[str, str]:
    """
    Map a Looker connection and sql_table_name to a (platform, table_name) pair.
    This will depend on which database connections you use in Looker.
    At SpotHero, we had two database connections in Looker: "redshift_test" (a Redshift database) and "presto" (a Presto database).
    Presto supports querying across multiple catalogs, so we infer which underlying database Presto is using from the Presto catalog name.
    We have three catalogs in Presto: "redshift", "hive", and "hive_emr".
    """
    if connection == "redshift_test":
        return "redshift", sql_table_name
    elif connection == "presto":
        parts = sql_table_name.split(".")
        catalog = parts[0]
        if catalog == "hive":
            platform = "hive"
        elif catalog == "hive_emr":
            platform = "hive_emr"
        elif catalog == "redshift":
            platform = "redshift"
        else:
            # Looker lets you omit the catalog and use a configured default; ours is hive_emr
            if sql_table_name.count(".") != 1:
                raise Exception(f"Unknown catalog for sql_table_name: {sql_table_name} for view_name: {view_name}")
            return "hive_emr", sql_table_name
        table_name = ".".join(parts[1:])
        return platform, table_name
    else:
        raise Exception(f"Could not find a platform for looker view with connection: {connection}")
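# Illustrative get_platform_and_table mappings (hypothetical table names):
#   ("redshift_test", "analytics.orders") -> ("redshift", "analytics.orders")
#   ("presto", "hive.analytics.orders")   -> ("hive", "analytics.orders")
#   ("presto", "analytics.orders")        -> ("hive_emr", "analytics.orders")  # no catalog given, default applies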


def construct_datalineage_urn(view_name: str, connection: str, sql_table_name: str) -> str:
    platform, table_name = get_platform_and_table(view_name, connection, sql_table_name)
    return f"urn:li:dataset:(urn:li:dataPlatform:{platform},{table_name},PROD)"
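# For example (hypothetical view and table), construct_datalineage_urn("orders", "presto", "hive.analytics.orders")
# returns "urn:li:dataset:(urn:li:dataPlatform:hive,analytics.orders,PROD)".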


def construct_data_urn(looker_view: LookerView) -> str:
    return f"urn:li:dataset:(urn:li:dataPlatform:{LOOKER_VIEW_PLATFORM},{looker_view.view_name},PROD)"


def build_dataset_mce(looker_view: LookerView):
    """
    Creates a MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    actor, sys_time = "urn:li:corpuser:etl", int(time.time()) * 1000
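    # The audit timestamps below are epoch milliseconds, which is what DataHub expects.
    # Each aspect is emitted as a (fully-qualified Avro type name, payload) pair, which
    # lets the Avro encoder pick the correct branch of the aspect union.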
    upstreams = [{
        "auditStamp": {
            "time": sys_time,
            "actor": actor
        },
        "dataset": construct_datalineage_urn(looker_view.view_name, looker_view.connection, sql_table_name),
        "type": "TRANSFORMED"
    } for sql_table_name in looker_view.sql_table_names]
    doc_elements = [{
        "url": f"https://github.com/spothero/internal-looker-repo/blob/master/{looker_view.get_relative_file_path()}",
        "description": "Github looker view definition",
        "createStamp": {
            "time": sys_time,
            "actor": actor
        }
    }]
    owners = [{
        "owner": "urn:li:corpuser:analysts",
        "type": "DEVELOPER"
    }]
    return {
        "auditHeader": None,
        "proposedSnapshot": ("com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot", {
            "urn": construct_data_urn(looker_view),
            "aspects": [
                ("com.linkedin.pegasus2avro.dataset.UpstreamLineage", {"upstreams": upstreams}),
                ("com.linkedin.pegasus2avro.common.InstitutionalMemory", {"elements": doc_elements}),
                ("com.linkedin.pegasus2avro.common.Ownership", {
                    "owners": owners,
                    "lastModified": {
                        "time": sys_time,
                        "actor": actor
                    }
                })
            ]
        }),
        "proposedDelta": None
    }


def delivery_report(err, msg):
    """Called once for each produced message to indicate the delivery result.
    Triggered by poll() or flush()."""
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))


def make_kafka_producer(extra_kafka_conf):
    conf = {
        "on_delivery": delivery_report,
        **extra_kafka_conf
    }
    key_schema = avro.loads('{"type": "string"}')
    record_schema = avro.load(AVSC_PATH)
    return AvroProducer(conf, default_key_schema=key_schema, default_value_schema=record_schema)


def main():
    kafka_producer = make_kafka_producer(EXTRA_KAFKA_CONF)
    viewfile_loader = LookerViewFileLoader()
    looker_models = []
    all_views = []
    model_files = sorted(glob.glob(f"{LOOKER_DIRECTORY}/**/*.model.lkml", recursive=True))
    for f in model_files:
        try:
            with open(f, 'r') as file:
                parsed = lkml.load(file)
                looker_model = LookerModel.from_looker_dict(parsed)
                looker_models.append(looker_model)
        except Exception as e:
            print(e)
            print(f"Error processing model file {f}. Skipping it")
    for model in looker_models:
        for include in model.resolved_includes:
            looker_viewfile = viewfile_loader.load_viewfile(include, model.connection)
            if looker_viewfile is not None:
                for raw_view in looker_viewfile.views:
                    maybe_looker_view = LookerView.from_looker_dict(raw_view, model.connection, looker_viewfile, viewfile_loader)
                    if maybe_looker_view:
                        all_views.append(maybe_looker_view)
    for view in all_views:
        mce = build_dataset_mce(view)
        print(view)
        print(mce)
        kafka_producer.produce(topic=KAFKA_TOPIC, key=mce['proposedSnapshot'][1]['urn'], value=mce)
    kafka_producer.flush()


if __name__ == "__main__":
    main()
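# Example invocation (the script file name here is hypothetical; Kafka and the
# schema registry must be reachable at the addresses in EXTRA_KAFKA_CONF):
#
#   LOOKER_DIRECTORY=/path/to/looker/repo python lookml_ingestion.py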