mirror of
https://github.com/datahub-project/datahub.git
synced 2026-01-06 14:57:12 +00:00
feat(ingestion): bring your own SQL parser (#3110)
This commit is contained in:
parent
263cec0c66
commit
f73725fdf6
@ -53,11 +53,13 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
|
||||
| `view_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
|
||||
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
|
||||
| `parse_table_names_from_sql` | | `False` | See note below. |
|
||||
| `sql_parser` | | `datahub.utilities.sql_parser.DefaultSQLParser` | See note below. |
|
||||
|
||||
Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the
|
||||
views depends on. As these SQL's can be complicated, and the package doesn't official support all the SQL dialects that
|
||||
Looker supports, the result might not be correct. This parsing is disabled by default, but can be enabled by setting
|
||||
`parse_table_names_from_sql: True`.
|
||||
Note! The integration can use an SQL parser to try to parse the tables the views depend on. This parsing is disabled by default,
|
||||
but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sql-metadata`](https://pypi.org/project/sql-metadata/) package.
|
||||
As this package doesn't officially support all the SQL dialects that Looker supports, the result might not be correct. You can, however, implement a
|
||||
custom parser and take it into use by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser`
|
||||
and must be made available to DataHub by, for example, installing it. The configuration then needs to be set to `module_name.ClassName` of the parser.
|
||||
|
||||
## Compatibility
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import glob
|
||||
import importlib
|
||||
import itertools
|
||||
import logging
|
||||
import pathlib
|
||||
@ -8,15 +9,16 @@ from dataclasses import dataclass
|
||||
from dataclasses import field as dataclass_field
|
||||
from dataclasses import replace
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
|
||||
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type
|
||||
|
||||
import pydantic
|
||||
|
||||
from datahub.utilities.sql_parser import SQLParser
|
||||
|
||||
if sys.version_info >= (3, 7):
|
||||
import lkml
|
||||
else:
|
||||
raise ModuleNotFoundError("The lookml plugin requires Python 3.7 or newer.")
|
||||
from sql_metadata import Parser as SQLParser
|
||||
|
||||
import datahub.emitter.mce_builder as builder
|
||||
from datahub.configuration import ConfigModel
|
||||
@ -66,6 +68,7 @@ class LookMLSourceConfig(ConfigModel):
|
||||
view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
|
||||
env: str = builder.DEFAULT_ENV
|
||||
parse_table_names_from_sql: bool = False
|
||||
sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -252,8 +255,23 @@ class LookerView:
|
||||
fields: List[ViewField]
|
||||
|
||||
@classmethod
|
||||
def _get_sql_table_names(cls, sql: str) -> List[str]:
|
||||
sql_table_names: List[str] = SQLParser(sql).tables
|
||||
def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
    """Import and return the SQL parser class named by *sql_parser_path*.

    Args:
        sql_parser_path: Dotted path of the form ``module_name.ClassName``;
            everything before the last ``.`` is imported as the module.

    Returns:
        The class object found under ``ClassName`` in the imported module.

    Raises:
        ValueError: If the path contains no ``.``, or the resolved attribute
            is not a class derived from ``SQLParser``.
        ImportError: Propagated from ``importlib.import_module`` when the
            module cannot be imported.
        AttributeError: If the module has no attribute named ``ClassName``.
    """
    # Validate explicitly: an ``assert`` is stripped when Python runs with -O.
    if "." not in sql_parser_path:
        raise ValueError("sql_parser-path must contain a .")
    module_name, cls_name = sql_parser_path.rsplit(".", 1)

    # The search path only matters when diagnosing a failed import, so it is
    # logged at debug level. ``sys`` is already used at module level by this
    # file, so no function-local import is needed.
    logger.debug("sys.path: %s", sys.path)

    parser_cls = getattr(importlib.import_module(module_name), cls_name)
    # issubclass() raises TypeError for non-class objects, so rule those out
    # first and report them with the same ValueError.
    if not (isinstance(parser_cls, type) and issubclass(parser_cls, SQLParser)):
        raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")

    return parser_cls
|
||||
|
||||
@classmethod
|
||||
def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
|
||||
parser_cls = cls._import_sql_parser_cls(sql_parser_path)
|
||||
|
||||
sql_table_names: List[str] = parser_cls(sql).get_tables()
|
||||
|
||||
# Remove quotes from table names
|
||||
sql_table_names = [t.replace('"', "") for t in sql_table_names]
|
||||
@ -290,6 +308,7 @@ class LookerView:
|
||||
looker_viewfile_loader: LookerViewFileLoader,
|
||||
reporter: LookMLSourceReport,
|
||||
parse_table_names_from_sql: bool = False,
|
||||
sql_parser_path: str = "datahub.utilities.sql_parser.DefaultSQLParser",
|
||||
) -> Optional["LookerView"]:
|
||||
view_name = looker_view["name"]
|
||||
logger.debug(f"Handling view {view_name}")
|
||||
@ -330,7 +349,9 @@ class LookerView:
|
||||
sql_table_names = []
|
||||
if parse_table_names_from_sql and "sql" in derived_table:
|
||||
# Get the list of tables in the query
|
||||
sql_table_names = cls._get_sql_table_names(derived_table["sql"])
|
||||
sql_table_names = cls._get_sql_table_names(
|
||||
derived_table["sql"], sql_parser_path
|
||||
)
|
||||
|
||||
return LookerView(
|
||||
absolute_file_path=looker_viewfile.absolute_file_path,
|
||||
@ -686,6 +707,7 @@ class LookMLSource(Source):
|
||||
viewfile_loader,
|
||||
self.reporter,
|
||||
self.source_config.parse_table_names_from_sql,
|
||||
self.source_config.sql_parser,
|
||||
)
|
||||
except Exception as e:
|
||||
self.reporter.report_warning(
|
||||
|
||||
24
metadata-ingestion/src/datahub/utilities/sql_parser.py
Normal file
24
metadata-ingestion/src/datahub/utilities/sql_parser.py
Normal file
@ -0,0 +1,24 @@
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from typing import List
|
||||
|
||||
try:
|
||||
from sql_metadata import Parser as MetadataSQLParser
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class SQLParser(metaclass=ABCMeta):
    """Abstract base for parsers that extract table names from one SQL query."""

    def __init__(self, sql_query: str) -> None:
        """Store the raw query text on the instance as ``_sql_query``."""
        self._sql_query = sql_query

    @abstractmethod
    def get_tables(self) -> List[str]:
        """Return the table names found in the query; subclasses must override."""
|
||||
|
||||
|
||||
class DefaultSQLParser(SQLParser):
    """SQL parser that delegates to the ``sql_metadata`` package's ``Parser``."""

    def __init__(self, sql_query: str) -> None:
        """Build a ``sql_metadata`` parser for *sql_query*.

        Raises:
            ImportError: If the optional module-level ``sql_metadata`` import
                did not succeed.
        """
        # Run the base initialiser so ``_sql_query`` is set, as on every
        # other SQLParser implementation.
        super().__init__(sql_query)
        try:
            parser_cls = MetadataSQLParser
        except NameError:
            # The module-level import is wrapped in ``except ImportError:
            # pass``, so the name is simply unbound when the package is
            # missing. Surface that as a clear ImportError instead.
            raise ImportError(
                "DefaultSQLParser requires the 'sql-metadata' package to be installed"
            ) from None
        self._parser = parser_cls(sql_query)

    def get_tables(self) -> List[str]:
        """Return the ``tables`` attribute of the underlying parser."""
        return self._parser.tables
|
||||
@ -1,5 +1,10 @@
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from datahub.utilities.delayed_iter import delayed_iter
|
||||
from datahub.utilities.groupby import groupby_unsorted
|
||||
from datahub.utilities.sql_parser import DefaultSQLParser
|
||||
|
||||
|
||||
def test_delayed_iter():
|
||||
@ -44,3 +49,15 @@ def test_groupby_unsorted():
|
||||
("B", ["B"]),
|
||||
("C", ["C", "C"]),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.skipif(
    sys.version_info < (3, 7), reason="The LookML source requires Python 3.7+"
)
def test_default_sql_parser():
    """DefaultSQLParser lists both tables referenced by a simple join."""
    query = "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);"
    expected_tables = ["foo", "bar"]

    parser = DefaultSQLParser(query)

    assert parser.get_tables() == expected_tables
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user