import logging
from abc import abstractmethod
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type
from urllib.parse import quote_plus

import pydantic
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.sql import sqltypes as types

from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.emitter.mce_builder import DEFAULT_ENV
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    ArrayTypeClass,
    BooleanTypeClass,
    BytesTypeClass,
    DateTypeClass,
    EnumTypeClass,
    MySqlDDL,
    NullTypeClass,
    NumberTypeClass,
    RecordTypeClass,
    SchemaField,
    SchemaFieldDataType,
    SchemaMetadata,
    StringTypeClass,
    TimeTypeClass,
)
from datahub.metadata.schema_classes import DatasetPropertiesClass

logger: logging.Logger = logging.getLogger(__name__)


def make_sqlalchemy_uri(
    scheme: str,
    username: Optional[str],
    password: Optional[str],
    at: Optional[str],
    db: Optional[str],
    uri_opts: Optional[Dict[str, Any]] = None,
) -> str:
    url = f"{scheme}://"
    if username is not None:
        url += f"{quote_plus(username)}"
        if password is not None:
            url += f":{quote_plus(password)}"
        url += "@"
    if at is not None:
        url += f"{at}"
    if db is not None:
        url += f"/{db}"
    if uri_opts is not None:
        if db is None:
            url += "/"
        params = "&".join(
            f"{key}={quote_plus(value)}" for (key, value) in uri_opts.items() if value
        )
        url = f"{url}?{params}"
    return url
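
# Illustrative usage (assumed values, not taken from this module):
#   make_sqlalchemy_uri("mysql+pymysql", "user", "p@ss", "localhost:3306", "db")
#   returns "mysql+pymysql://user:p%40ss@localhost:3306/db", since quote_plus
#   percent-encodes reserved characters such as "@" in credentials.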


@dataclass
class SQLSourceReport(SourceReport):
    tables_scanned: int = 0
    views_scanned: int = 0
    filtered: List[str] = field(default_factory=list)

    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
        """
        Entity could be a view or a table
        """
        if ent_type == "table":
            self.tables_scanned += 1
        elif ent_type == "view":
            self.views_scanned += 1
        else:
            raise KeyError(f"Unknown entity {ent_type}.")

    def report_dropped(self, ent_name: str) -> None:
        self.filtered.append(ent_name)


class SQLAlchemyConfig(ConfigModel):
    env: str = DEFAULT_ENV
    options: dict = {}
    # Although the 'table_pattern' can be used to skip everything from certain schemas,
    # a separate allow/deny option at the schema level is an optimization for the case
    # where there are many schemas to skip: it avoids needlessly fetching their tables
    # only to filter them out afterwards via the table_pattern.
    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    include_views: Optional[bool] = True
    include_tables: Optional[bool] = True

    @abstractmethod
    def get_sql_alchemy_url(self):
        pass
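
# A minimal sketch of how these patterns are exercised downstream (see
# get_workunits/loop_tables below); AllowDenyPattern.allowed() returns True
# when a name passes the configured allow/deny rules:
#   config.schema_pattern.allowed("analytics")
#   config.table_pattern.allowed("analytics.orders")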


class BasicSQLAlchemyConfig(SQLAlchemyConfig):
    username: Optional[str] = None
    password: Optional[pydantic.SecretStr] = None
    host_port: str
    database: Optional[str] = None
    database_alias: Optional[str] = None
    scheme: str

    def get_sql_alchemy_url(self, uri_opts=None):
        return make_sqlalchemy_uri(
            self.scheme,
            self.username,
            self.password.get_secret_value() if self.password else None,
            self.host_port,
            self.database,
            uri_opts=uri_opts,
        )
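
# A minimal sketch of a concrete subclass (hypothetical scheme/connection values):
#   class MySQLConfig(BasicSQLAlchemyConfig):
#       scheme = "mysql+pymysql"
#
#   MySQLConfig(host_port="localhost:3306", username="u", password="p") \
#       .get_sql_alchemy_url()  # -> "mysql+pymysql://u:p@localhost:3306"
# Note that pydantic.SecretStr keeps the password redacted in repr() and logs.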


@dataclass
class SqlWorkUnit(MetadataWorkUnit):
    pass


_field_type_mapping: Dict[Type[types.TypeEngine], Type] = {
    types.Integer: NumberTypeClass,
    types.Numeric: NumberTypeClass,
    types.Boolean: BooleanTypeClass,
    types.Enum: EnumTypeClass,
    types._Binary: BytesTypeClass,
    types.LargeBinary: BytesTypeClass,
    types.PickleType: BytesTypeClass,
    types.ARRAY: ArrayTypeClass,
    types.String: StringTypeClass,
    types.Date: DateTypeClass,
    types.DATE: DateTypeClass,
    types.Time: TimeTypeClass,
    types.DateTime: TimeTypeClass,
    types.DATETIME: TimeTypeClass,
    types.TIMESTAMP: TimeTypeClass,
    types.JSON: RecordTypeClass,
    # When SQLAlchemy is unable to map a type into its internal hierarchy, it
    # assigns the NullType by default. We want to carry this warning through.
    types.NullType: NullTypeClass,
}
_known_unknown_field_types: Set[Type[types.TypeEngine]] = {
    types.Interval,
    types.CLOB,
}


def register_custom_type(
    tp: Type[types.TypeEngine], output: Optional[Type] = None
) -> None:
    if output:
        _field_type_mapping[tp] = output
    else:
        _known_unknown_field_types.add(tp)


class _CustomSQLAlchemyDummyType(types.TypeDecorator):
    impl = types.LargeBinary


def make_sqlalchemy_type(name: str) -> Type[types.TypeEngine]:
    # This usage of type() dynamically constructs a class.
    # See https://stackoverflow.com/a/15247202/5004662 and
    # https://docs.python.org/3/library/functions.html#type.
    sqlalchemy_type: Type[types.TypeEngine] = type(
        name,
        (_CustomSQLAlchemyDummyType,),
        {
            "__repr__": lambda self: f"{name}()",
        },
    )
    return sqlalchemy_type
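
# Illustrative usage from a dialect-specific source (the GEOGRAPHY name is
# hypothetical):
#   GEOGRAPHY = make_sqlalchemy_type("GEOGRAPHY")
#   register_custom_type(GEOGRAPHY, RecordTypeClass)  # map to a schema type class
#   register_custom_type(GEOGRAPHY)  # or mark it as known-but-unmappable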


def get_column_type(
    sql_report: SQLSourceReport, dataset_name: str, column_type: Any
) -> SchemaFieldDataType:
    """
    Maps SQLAlchemy types (https://docs.sqlalchemy.org/en/13/core/type_basics.html) to corresponding schema types
    """

    TypeClass: Optional[Type] = None
    for sql_type in _field_type_mapping.keys():
        if isinstance(column_type, sql_type):
            TypeClass = _field_type_mapping[sql_type]
            break
    if TypeClass is None:
        for sql_type in _known_unknown_field_types:
            if isinstance(column_type, sql_type):
                TypeClass = NullTypeClass
                break

    if TypeClass is None:
        sql_report.report_warning(
            dataset_name, f"unable to map type {column_type!r} to metadata schema"
        )
        TypeClass = NullTypeClass

    return SchemaFieldDataType(type=TypeClass())
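
# For example (assuming a SQLSourceReport named `report`), a BIGINT column
# resolves through isinstance(types.BIGINT(), types.Integer):
#   get_column_type(report, "db.schema.table", types.BIGINT())
#   -> SchemaFieldDataType(type=NumberTypeClass())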


def get_schema_metadata(
    sql_report: SQLSourceReport, dataset_name: str, platform: str, columns: List[dict]
) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in columns:
        field = SchemaField(
            fieldPath=column["name"],
            type=get_column_type(sql_report, dataset_name, column["type"]),
            nativeDataType=column.get("full_type", repr(column["type"])),
            description=column.get("comment", None),
            nullable=column["nullable"],
            recursive=False,
        )
        canonical_schema.append(field)

    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        fields=canonical_schema,
    )
    return schema_metadata
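
# Each `column` dict is expected to follow SQLAlchemy's Inspector.get_columns()
# contract, i.e. at least the "name", "type", and "nullable" keys, plus the
# optional "comment" key and the non-standard "full_type" key where available.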


class SQLAlchemySource(Source):
    """A base class for all SQL sources that use SQLAlchemy; dialect-specific sources extend it."""

    def __init__(self, config: SQLAlchemyConfig, ctx: PipelineContext, platform: str):
        super().__init__(ctx)
        self.config = config
        self.platform = platform
        self.report = SQLSourceReport()

    def get_inspectors(self) -> Iterable[Inspector]:
        # This method can be overridden in the case that you want to dynamically
        # run on multiple databases.
        url = self.config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **self.config.options)
        inspector = inspect(engine)
        yield inspector

    def get_schema_names(self, inspector):
        return inspector.get_schema_names()

    def get_workunits(self) -> Iterable[SqlWorkUnit]:
        sql_config = self.config
        if logger.isEnabledFor(logging.DEBUG):
            # If debug logging is enabled, we also want to echo each SQL query issued.
            sql_config.options["echo"] = True

        for inspector in self.get_inspectors():
            for schema in self.get_schema_names(inspector):
                if not sql_config.schema_pattern.allowed(schema):
                    self.report.report_dropped(f"{schema}.*")
                    continue

                if sql_config.include_tables:
                    yield from self.loop_tables(inspector, schema, sql_config)

                if sql_config.include_views:
                    yield from self.loop_views(inspector, schema, sql_config)

    def standardize_schema_table_names(
        self, schema: str, entity: str
    ) -> Tuple[str, str]:
        # Some SQLAlchemy dialects need a standardization step to clean the schema
        # and table names. See BigQuery for an example of when this is useful.
        return schema, entity

    def get_identifier(
        self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
    ) -> str:
        # Many SQLAlchemy dialects have three-level hierarchies. This method, which
        # subclasses can override, enables them to modify the identifiers as needed.
        if hasattr(self.config, "get_identifier"):
            # This path is deprecated and will eventually be removed.
            return self.config.get_identifier(schema=schema, table=entity)  # type: ignore
        else:
            return f"{schema}.{entity}"
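
    # A subclass handling a three-level hierarchy might override this as
    # (hypothetical sketch):
    #   def get_identifier(self, *, schema, entity, inspector, **kwargs):
    #       return f"{self.config.database}.{schema}.{entity}"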

    def loop_tables(
        self,
        inspector: Inspector,
        schema: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[SqlWorkUnit]:
        for table in inspector.get_table_names(schema):
            schema, table = self.standardize_schema_table_names(
                schema=schema, entity=table
            )
            dataset_name = self.get_identifier(
                schema=schema, entity=table, inspector=inspector
            )
            self.report.report_entity_scanned(dataset_name, ent_type="table")

            if not sql_config.table_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(table, schema)
            if len(columns) == 0:
                self.report.report_warning(dataset_name, "missing column information")

            try:
                # SQLAlchemy stubs are incomplete and missing this method.
                # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
                table_info: dict = inspector.get_table_comment(table, schema)  # type: ignore
            except NotImplementedError:
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = table_info["text"]

                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = table_info.get("properties", {})

            # TODO: capture inspector.get_pk_constraint
            # TODO: capture inspector.get_sorted_table_and_fkc_names

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                )
                dataset_snapshot.aspects.append(dataset_properties)
            schema_metadata = get_schema_metadata(
                self.report, dataset_name, self.platform, columns
            )
            dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu
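
    # The urn emitted above takes the form (illustrative values):
    #   urn:li:dataset:(urn:li:dataPlatform:mysql,analytics.orders,PROD)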

    def loop_views(
        self,
        inspector: Inspector,
        schema: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[SqlWorkUnit]:
        for view in inspector.get_view_names(schema):
            schema, view = self.standardize_schema_table_names(
                schema=schema, entity=view
            )
            dataset_name = self.get_identifier(
                schema=schema, entity=view, inspector=inspector
            )
            self.report.report_entity_scanned(dataset_name, ent_type="view")

            if not sql_config.view_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            try:
                columns = inspector.get_columns(view, schema)
            except KeyError:
                # For certain types of views, we are unable to fetch the list of columns.
                self.report.report_warning(
                    dataset_name, "unable to get schema for this view"
                )
                schema_metadata = None
            else:
                schema_metadata = get_schema_metadata(
                    self.report, dataset_name, self.platform, columns
                )

            try:
                # SQLAlchemy stubs are incomplete and missing this method.
                # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
                view_info: dict = inspector.get_table_comment(view, schema)  # type: ignore
            except NotImplementedError:
                description: Optional[str] = None
                properties: Dict[str, str] = {}
            else:
                description = view_info["text"]

                # The "properties" field is a non-standard addition to SQLAlchemy's interface.
                properties = view_info.get("properties", {})

            try:
                view_definition = inspector.get_view_definition(view, schema)
                if view_definition is None:
                    view_definition = ""
                else:
                    # Some dialects return a TextClause instead of a raw string,
                    # so we need to convert them to a string.
                    view_definition = str(view_definition)
            except NotImplementedError:
                view_definition = ""
            properties["view_definition"] = view_definition
            properties["is_view"] = "True"

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{dataset_name},{self.config.env})",
                aspects=[],
            )
            if description is not None or properties:
                dataset_properties = DatasetPropertiesClass(
                    description=description,
                    customProperties=properties,
                    # uri=dataset_name,
                )
                dataset_snapshot.aspects.append(dataset_properties)

            if schema_metadata:
                dataset_snapshot.aspects.append(schema_metadata)

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = SqlWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu

    def get_report(self):
        return self.report

    def close(self):
        pass