import glob
import os
import re
import time
import typing
from dataclasses import dataclass, replace

import lkml
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer
from sql_metadata import get_query_tables

# Configuration
AVSC_PATH = "../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeEvent.avsc"
KAFKA_TOPIC = 'MetadataChangeEvent_v4'

# LOOKER_DIRECTORY = "./test_lookml"
LOOKER_DIRECTORY = os.environ["LOOKER_DIRECTORY"]
LOOKER_DIRECTORY = os.path.abspath(LOOKER_DIRECTORY)

EXTRA_KAFKA_CONF = {
    'bootstrap.servers': 'localhost:9092',
    'schema.registry.url': 'http://localhost:8081'
    # 'security.protocol': 'SSL',
    # 'ssl.ca.location': '',
    # 'ssl.key.location': '',
    # 'ssl.certificate.location': ''
}

# The datahub platform where looker views are stored
LOOKER_VIEW_PLATFORM = "looker_views"


class LookerViewFileLoader:
    """
    Loads the Looker view file at the given path and caches the parsed LookerViewFile in memory.
    This avoids reloading the same file from disk many times during the recursive include resolution process.
    """

    def __init__(self):
        self.viewfile_cache = {}

    def _load_viewfile(self, path: str) -> typing.Optional["LookerViewFile"]:
        if path in self.viewfile_cache:
            return self.viewfile_cache[path]

        try:
            with open(path, "r") as file:
                parsed = lkml.load(file)
                looker_viewfile = LookerViewFile.from_looker_dict(path, parsed)
                self.viewfile_cache[path] = looker_viewfile
                return looker_viewfile
        except Exception as e:
            print(e)
            print(f"Error processing view file {path}. Skipping it")

    def load_viewfile(self, path: str, connection: str):
        viewfile = self._load_viewfile(path)
        if viewfile is None:
            return None

        # Return a copy with the connection filled in, so the cached entry stays connection-agnostic
        return replace(viewfile, connection=connection)


@dataclass
class LookerModel:
    connection: str
    includes: typing.List[str]
    resolved_includes: typing.List[str]

    @staticmethod
    def from_looker_dict(looker_model_dict):
        connection = looker_model_dict["connection"]
        includes = looker_model_dict["includes"]
        resolved_includes = LookerModel.resolve_includes(includes)

        return LookerModel(connection=connection, includes=includes, resolved_includes=resolved_includes)

    @staticmethod
    def resolve_includes(includes) -> typing.List[str]:
        resolved = []
        for inc in includes:
            # Massage the looker include into a valid glob wildcard expression
            glob_expr = f"{LOOKER_DIRECTORY}/{inc}"
            outputs = glob.glob(glob_expr)
            resolved.extend(outputs)
        return resolved
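        # Example (hypothetical include): a model include of "views/*.view.lkml"
        # becomes the glob "<LOOKER_DIRECTORY>/views/*.view.lkml", so each include
        # resolves to the list of absolute file paths matching it on disk.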


@dataclass
class LookerViewFile:
    absolute_file_path: str
    connection: typing.Optional[str]
    includes: typing.List[str]
    resolved_includes: typing.List[str]
    views: typing.List[typing.Dict]

    @staticmethod
    def from_looker_dict(absolute_file_path, looker_view_file_dict):
        includes = looker_view_file_dict.get("includes", [])
        resolved_includes = LookerModel.resolve_includes(includes)
        views = looker_view_file_dict.get("views", [])

        return LookerViewFile(
            absolute_file_path=absolute_file_path,
            connection=None,
            includes=includes,
            resolved_includes=resolved_includes,
            views=views,
        )


@dataclass
class LookerView:
    absolute_file_path: str
    connection: str
    view_name: str
    sql_table_names: typing.List[str]

    def get_relative_file_path(self):
        if LOOKER_DIRECTORY in self.absolute_file_path:
            return self.absolute_file_path.replace(LOOKER_DIRECTORY, '').lstrip('/')
        else:
            raise Exception(
                f"Found a looker view with name: {self.view_name} at path: {self.absolute_file_path} "
                f"not underneath the base LOOKER_DIRECTORY: {LOOKER_DIRECTORY}. This should not happen"
            )

    @staticmethod
    def from_looker_dict(looker_view, connection: str, looker_viewfile: LookerViewFile, looker_viewfile_loader: LookerViewFileLoader) -> typing.Optional["LookerView"]:
        view_name = looker_view["name"]
        sql_table_name = looker_view.get("sql_table_name", None)
        # Some sql_table_name fields contain quotes, like: optimizely."group"; just remove the quotes
        sql_table_name = sql_table_name.replace('"', '') if sql_table_name is not None else None
        derived_table = looker_view.get("derived_table", None)

        # Parse SQL from derived tables to extract dependencies
        if derived_table is not None and 'sql' in derived_table:
            # Get the list of tables in the query
            sql_tables: typing.List[str] = get_query_tables(derived_table['sql'])

            # Remove temporary tables defined in WITH statements (CTEs); use a raw
            # string so Python does not mangle the regex escapes
            sql_table_names = [
                t for t in sql_tables
                if not re.search(rf'WITH(.*,)?\s+{t}(\s*\([\w\s,]+\))?\s+AS\s+\(', derived_table['sql'], re.IGNORECASE | re.DOTALL)
            ]

            # Remove quotes from tables
            sql_table_names = [t.replace('"', '') for t in sql_table_names]
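
            # Example (hypothetical SQL): for
            #   WITH recent AS (SELECT * FROM warehouse.orders) SELECT * FROM recent
            # get_query_tables may return both "recent" and "warehouse.orders"; the
            # regex above drops "recent" because the WITH clause defines it, leaving
            # only the real upstream table "warehouse.orders".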

            return LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=sql_table_names)

        # There is a single dependency in the view, on the sql_table_name
        if sql_table_name is not None:
            return LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=[sql_table_name])

        # The sql_table_name might be defined in another view that this view extends; try to find it
        else:
            extends = looker_view.get("extends", [])
            if len(extends) == 0:
                # The view is malformed: it is not a derived table and has neither a sql_table_name nor an extends
                print(f"Skipping malformed view with view_name: {view_name}. View should have a sql_table_name if it is not a derived table")
                return None

            extends_to_looker_view = []

            # The base view could live in the same file
            for raw_view in looker_viewfile.views:
                raw_view_name = raw_view["name"]
                # Make sure to skip the view we are currently trying to resolve
                if raw_view_name != view_name:
                    maybe_looker_view = LookerView.from_looker_dict(raw_view, connection, looker_viewfile, looker_viewfile_loader)
                    if maybe_looker_view is not None and maybe_looker_view.view_name in extends:
                        extends_to_looker_view.append(maybe_looker_view)

            # Or it could live in one of the included files; we do not know which file the base view lives in, so try them all!
            # Use a separate variable here so we do not clobber the viewfile we are currently resolving
            for include in looker_viewfile.resolved_includes:
                included_viewfile = looker_viewfile_loader.load_viewfile(include, connection)
                if included_viewfile is not None:
                    for view in included_viewfile.views:
                        maybe_looker_view = LookerView.from_looker_dict(view, connection, included_viewfile, looker_viewfile_loader)
                        if maybe_looker_view is None:
                            continue

                        if maybe_looker_view.view_name in extends:
                            extends_to_looker_view.append(maybe_looker_view)

            if len(extends_to_looker_view) != 1:
                print(f"Skipping malformed view with view_name: {view_name}. View should have a single view in its inheritance chain with a sql_table_name")
                return None

            output_looker_view = LookerView(absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, view_name=view_name, sql_table_names=extends_to_looker_view[0].sql_table_names)
            return output_looker_view


def get_platform_and_table(view_name: str, connection: str, sql_table_name: str):
    """
    This will depend on what database connections you use in Looker.
    For SpotHero, we had two database connections in Looker: "redshift_test" (a Redshift database) and "presto" (a Presto database).
    Presto supports querying across multiple catalogs, so we infer which underlying database Presto is using from the Presto catalog name.
    For SpotHero, we have 3 catalogs in Presto: "redshift", "hive", and "hive_emr".
    """
    if connection == "redshift_test":
        platform = "redshift"
        table_name = sql_table_name
        return platform, table_name

    elif connection == "presto":
        parts = sql_table_name.split(".")
        catalog = parts[0]

        if catalog == "hive":
            platform = "hive"
        elif catalog == "hive_emr":
            platform = "hive_emr"
        elif catalog == "redshift":
            platform = "redshift"
        else:
            # Looker lets you omit the catalog and use a configured default; the default we have configured is hive_emr
            if sql_table_name.count(".") != 1:
                raise Exception(f"Unknown catalog for sql_table_name: {sql_table_name} for view_name: {view_name}")

            platform = "hive_emr"
            return platform, sql_table_name

        table_name = ".".join(parts[1:])
        return platform, table_name
    else:
        raise Exception(f"Could not find a platform for looker view with connection: {connection}")


def construct_datalineage_urn(view_name: str, connection: str, sql_table_name: str):
    platform, table_name = get_platform_and_table(view_name, connection, sql_table_name)
    return f"urn:li:dataset:(urn:li:dataPlatform:{platform},{table_name},PROD)"


def construct_data_urn(looker_view: LookerView):
    return f"urn:li:dataset:(urn:li:dataPlatform:{LOOKER_VIEW_PLATFORM},{looker_view.view_name},PROD)"


def build_dataset_mce(looker_view: LookerView):
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    actor, sys_time = "urn:li:corpuser:etl", int(time.time()) * 1000

    upstreams = [{
        "auditStamp": {
            "time": sys_time,
            "actor": actor
        },
        "dataset": construct_datalineage_urn(looker_view.view_name, looker_view.connection, sql_table_name),
        "type": "TRANSFORMED"
    } for sql_table_name in looker_view.sql_table_names]

    doc_elements = [{
        "url": f"https://github.com/spothero/internal-looker-repo/blob/master/{looker_view.get_relative_file_path()}",
        "description": "Github looker view definition",
        "createStamp": {
            "time": sys_time,
            "actor": actor
        }
    }]

    owners = [{
        "owner": "urn:li:corpuser:analysts",
        "type": "DEVELOPER"
    }]

    return {
        "auditHeader": None,
        "proposedSnapshot": ("com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot", {
            "urn": construct_data_urn(looker_view),
            "aspects": [
                ("com.linkedin.pegasus2avro.dataset.UpstreamLineage", {"upstreams": upstreams}),
                ("com.linkedin.pegasus2avro.common.InstitutionalMemory", {"elements": doc_elements}),
                ("com.linkedin.pegasus2avro.common.Ownership", {
                    "owners": owners,
                    "lastModified": {
                        "time": sys_time,
                        "actor": actor
                    }
                })
            ]
        }),
        "proposedDelta": None
    }
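

# Note: each aspect above is paired with its fully qualified Avro record name in a
# (name, payload) tuple; this appears to name the branch of the aspect union for the
# Avro serializer (an assumption based on this script's usage, worth verifying
# against the serializer in use).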


def delivery_report(err, msg):
    """ Called once for each message produced to indicate delivery result.
        Triggered by poll() or flush(). """
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))


def make_kafka_producer(extra_kafka_conf):
    conf = {
        "on_delivery": delivery_report,
        **extra_kafka_conf
    }

    key_schema = avro.loads('{"type": "string"}')
    record_schema = avro.load(AVSC_PATH)
    producer = AvroProducer(conf, default_key_schema=key_schema, default_value_schema=record_schema)
    return producer
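

# Keys are serialized as plain Avro strings; main() below uses the dataset URN from
# each MCE as the message key.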


def main():
    kafka_producer = make_kafka_producer(EXTRA_KAFKA_CONF)
    viewfile_loader = LookerViewFileLoader()

    looker_models = []
    all_views = []

    model_files = sorted(glob.glob(f"{LOOKER_DIRECTORY}/**/*.model.lkml", recursive=True))
    for f in model_files:
        try:
            with open(f, 'r') as file:
                parsed = lkml.load(file)
                looker_model = LookerModel.from_looker_dict(parsed)
                looker_models.append(looker_model)
        except Exception as e:
            print(e)
            print(f"Error processing model file {f}. Skipping it")

    for model in looker_models:
        for include in model.resolved_includes:
            looker_viewfile = viewfile_loader.load_viewfile(include, model.connection)
            if looker_viewfile is not None:
                for raw_view in looker_viewfile.views:
                    maybe_looker_view = LookerView.from_looker_dict(raw_view, model.connection, looker_viewfile, viewfile_loader)
                    if maybe_looker_view:
                        all_views.append(maybe_looker_view)

    for view in all_views:
        MCE = build_dataset_mce(view)
        print(view)
        print(MCE)
        kafka_producer.produce(topic=KAFKA_TOPIC, key=MCE['proposedSnapshot'][1]['urn'], value=MCE)
        kafka_producer.flush()


if __name__ == "__main__":
    main()