feat(ingestion): bring your own SQL parser (#3110)

This commit is contained in:
Fredrik Sannholm 2021-08-24 06:21:48 +03:00 committed by GitHub
parent 263cec0c66
commit f73725fdf6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 74 additions and 9 deletions

View File

@ -53,11 +53,13 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
| `view_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
| `parse_table_names_from_sql` | | `False` | See note below. |
| `sql_parser` | | `datahub.utilities.sql_parser.DefaultSQLParser` | See note below. |
Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the
views depends on. As these SQL's can be complicated, and the package doesn't official support all the SQL dialects that
Looker supports, the result might not be correct. This parsing is disabled by default, but can be enabled by setting
`parse_table_names_from_sql: True`.
Note! The integration can use a SQL parser to try to determine the tables that the views depend on. This parsing is disabled by default,
but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sql-metadata`](https://pypi.org/project/sql-metadata/) package.
As this package doesn't officially support all the SQL dialects that Looker supports, the result might not be correct. You can, however, implement a
custom parser and use it by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser`
and must be made available to DataHub by, for example, installing it. The configuration then needs to be set to the `module_name.ClassName` of the parser.
## Compatibility

View File

@ -1,4 +1,5 @@
import glob
import importlib
import itertools
import logging
import pathlib
@ -8,15 +9,16 @@ from dataclasses import dataclass
from dataclasses import field as dataclass_field
from dataclasses import replace
from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type
import pydantic
from datahub.utilities.sql_parser import SQLParser
if sys.version_info >= (3, 7):
import lkml
else:
raise ModuleNotFoundError("The lookml plugin requires Python 3.7 or newer.")
from sql_metadata import Parser as SQLParser
import datahub.emitter.mce_builder as builder
from datahub.configuration import ConfigModel
@ -66,6 +68,7 @@ class LookMLSourceConfig(ConfigModel):
view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
env: str = builder.DEFAULT_ENV
parse_table_names_from_sql: bool = False
sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"
@dataclass
@ -252,8 +255,23 @@ class LookerView:
fields: List[ViewField]
@classmethod
def _get_sql_table_names(cls, sql: str) -> List[str]:
sql_table_names: List[str] = SQLParser(sql).tables
def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
    """Resolve a dotted ``module_name.ClassName`` path to a SQL parser class.

    The path is split at its last ``.``: everything before it is imported
    as a module and the part after it is looked up as an attribute of that
    module.

    Args:
        sql_parser_path: Dotted path of the parser class, e.g.
            ``datahub.utilities.sql_parser.DefaultSQLParser``.

    Returns:
        The parser class named by ``sql_parser_path``.

    Raises:
        ValueError: If the path contains no ``.``, or if the resolved
            attribute is not a class derived from ``SQLParser``.
        ImportError: If the module part of the path cannot be imported.
        AttributeError: If the module has no attribute with the given name.
    """
    # Validate with an explicit exception rather than ``assert``: asserts
    # are stripped when Python runs with -O, and this value is user config.
    if "." not in sql_parser_path:
        raise ValueError("sql_parser-path must contain a .")

    module_name, cls_name = sql_parser_path.rsplit(".", 1)
    parser_cls = getattr(importlib.import_module(module_name), cls_name)

    # ``issubclass`` raises TypeError for non-class objects, so check that
    # first and report both failure modes with the same clear message.
    if not (isinstance(parser_cls, type) and issubclass(parser_cls, SQLParser)):
        raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")

    return parser_cls
@classmethod
def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
parser_cls = cls._import_sql_parser_cls(sql_parser_path)
sql_table_names: List[str] = parser_cls(sql).get_tables()
# Remove quotes from table names
sql_table_names = [t.replace('"', "") for t in sql_table_names]
@ -290,6 +308,7 @@ class LookerView:
looker_viewfile_loader: LookerViewFileLoader,
reporter: LookMLSourceReport,
parse_table_names_from_sql: bool = False,
sql_parser_path: str = "datahub.utilities.sql_parser.DefaultSQLParser",
) -> Optional["LookerView"]:
view_name = looker_view["name"]
logger.debug(f"Handling view {view_name}")
@ -330,7 +349,9 @@ class LookerView:
sql_table_names = []
if parse_table_names_from_sql and "sql" in derived_table:
# Get the list of tables in the query
sql_table_names = cls._get_sql_table_names(derived_table["sql"])
sql_table_names = cls._get_sql_table_names(
derived_table["sql"], sql_parser_path
)
return LookerView(
absolute_file_path=looker_viewfile.absolute_file_path,
@ -686,6 +707,7 @@ class LookMLSource(Source):
viewfile_loader,
self.reporter,
self.source_config.parse_table_names_from_sql,
self.source_config.sql_parser,
)
except Exception as e:
self.reporter.report_warning(

View File

@ -0,0 +1,24 @@
from abc import ABCMeta, abstractmethod
from typing import List
try:
from sql_metadata import Parser as MetadataSQLParser
except ImportError:
pass
class SQLParser(metaclass=ABCMeta):
    """Abstract base class for parsers that extract table names from SQL.

    Subclasses must implement ``get_tables``; the class cannot be
    instantiated until they do.
    """

    def __init__(self, sql_query: str) -> None:
        """Keep the raw SQL text on the instance.

        Args:
            sql_query: The SQL statement to be parsed.
        """
        self._sql_query = sql_query

    @abstractmethod
    def get_tables(self) -> List[str]:
        """Return the names of the tables referenced by the SQL query."""
class DefaultSQLParser(SQLParser):
    """SQL parser that delegates to the ``sql_metadata`` package's ``Parser``.

    ``MetadataSQLParser`` is only bound when the module-level import of
    ``sql_metadata`` succeeds, so that package must be installed before
    this class can be instantiated.
    """

    def __init__(self, sql_query: str) -> None:
        """Create the underlying ``sql_metadata`` parser for the query.

        Args:
            sql_query: The SQL statement to be parsed.
        """
        # Run the base initializer so the ``_sql_query`` attribute it sets
        # is present on this subclass too.
        super().__init__(sql_query)
        self._parser = MetadataSQLParser(sql_query)

    def get_tables(self) -> List[str]:
        """Return the underlying parser's ``tables`` attribute."""
        return self._parser.tables

View File

@ -1,5 +1,10 @@
import sys
import pytest
from datahub.utilities.delayed_iter import delayed_iter
from datahub.utilities.groupby import groupby_unsorted
from datahub.utilities.sql_parser import DefaultSQLParser
def test_delayed_iter():
@ -44,3 +49,15 @@ def test_groupby_unsorted():
("B", ["B"]),
("C", ["C", "C"]),
]
@pytest.mark.integration
@pytest.mark.skipif(
    sys.version_info < (3, 7), reason="The LookML source requires Python 3.7+"
)
def test_default_sql_parser():
    """DefaultSQLParser reports both tables of a two-table join, in order."""
    query = "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);"
    expected_tables = ["foo", "bar"]

    assert DefaultSQLParser(query).get_tables() == expected_tables