feat(ingest): Improve lookml sql derived tables detection, add cascading derived tables to lineage (#2770)

This commit is contained in:
Remi 2021-06-29 20:41:34 -06:00 committed by GitHub
parent 6fee59ebac
commit 2aa95ec750
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 10 additions and 16 deletions

View File

@ -82,7 +82,7 @@ plugins: Dict[str, Set[str]] = {
},
"ldap": {"python-ldap>=2.4"},
"looker": {"looker-sdk==21.6.0"},
"lookml": {"lkml>=1.1.0", "sql-metadata==1.12.0"},
"lookml": {"lkml>=1.1.0", "sql-metadata==2.2.1"},
"mongodb": {"pymongo>=3.11"},
"mssql": sql_common | {"sqlalchemy-pytds>=0.3"},
"mssql-odbc": sql_common | {"pyodbc"},

View File

@ -13,7 +13,7 @@ if sys.version_info >= (3, 7):
import lkml
else:
raise ModuleNotFoundError("The lookml plugin requires Python 3.7 or newer.")
from sql_metadata import get_query_tables
from sql_metadata import Parser as SQLParser
from datahub.configuration import ConfigModel
from datahub.configuration.common import AllowDenyPattern
@ -197,20 +197,9 @@ class LookerView: # pragma: no cover
@classmethod
def _get_sql_table_names(cls, sql: str) -> List[str]:
sql_tables: List[str] = get_query_tables(sql)
sql_table_names: List[str] = SQLParser(sql).tables
# Remove temporary tables from WITH statements
sql_table_names = [
t
for t in sql_tables
if not re.search(
fr"WITH(.*,)?\s+{t}(\s*\([\w\s,]+\))?\s+AS\s+\(",
sql,
re.IGNORECASE | re.DOTALL,
)
]
# Remove quotes from tables
# Remove quotes from table names
sql_table_names = [t.replace('"', "") for t in sql_table_names]
return sql_table_names
@ -383,7 +372,12 @@ class LookMLSource(Source): # pragma: no cover
def _construct_datalineage_urn(self, sql_table_name: str, connection: str) -> str:
platform = self._get_platform_based_on_connection(connection)
if "." in platform:
# Check if table name matches cascading derived tables pattern (same platform)
if re.fullmatch(r"\w+\.SQL_TABLE_NAME", sql_table_name):
platform_name = self.source_config.platform_name
sql_table_name = sql_table_name.lower().split(".")[0]
# Check if table database is in platform name (upstream platform)
elif "." in platform:
platform_name, database_name = platform.lower().split(".", maxsplit=1)
sql_table_name = f"{database_name}.{sql_table_name}".lower()
else: