Fix linting (#1958)

Pere Miquel Brull 2021-12-29 17:33:40 +01:00 committed by GitHub
parent b11e503117
commit 1e334af89c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 175 additions and 115 deletions


@@ -1,6 +1,10 @@
 [BASIC]
 # W1203: logging-fstring-interpolation - f-string brings better readability and unifies style
-disable=W1203
+# W1202: logging-format-interpolation - lazy formatting in logging functions
+disable=W1203,W1202
+docstring-min-length=20
+max-args=7
+max-attributes=12
 
 [MASTER]
 fail-under=6.0
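
For context, a minimal sketch (logger and table name are illustrative, not from the source) of the three logging styles these two checks distinguish; with W1203 and W1202 disabled, the first two interpolation styles pass the linter alongside the lazy %-style that pylint recommends by default:

import logging

logger = logging.getLogger(__name__)
table = "dim_orders"

# W1203 (disabled above): f-string interpolation inside a logging call
logger.info(f"Table Scanned: {table}")

# W1202 (disabled above): str.format() interpolation inside a logging call
logger.info("Table Scanned: {}".format(table))

# Lazy %-style formatting, pylint's default recommendation
logger.info("Table Scanned: %s", table)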


@@ -8,6 +8,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Generic source to build SQL connectors.
+"""
 import json
 import logging
 import re
@@ -16,7 +19,7 @@ import uuid
 from abc import abstractmethod
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type
+from typing import Dict, Iterable, List, Optional, Tuple
 from urllib.parse import quote_plus
 
 from pydantic import SecretStr
@@ -54,20 +57,22 @@ logger: logging.Logger = logging.getLogger(__name__)
 @dataclass
 class SQLSourceStatus(SourceStatus):
+    """
+    Reports the source status after ingestion
+    """
+
     success: List[str] = field(default_factory=list)
     failures: List[str] = field(default_factory=list)
     warnings: List[str] = field(default_factory=list)
     filtered: List[str] = field(default_factory=list)
 
-    def scanned(self, table_name: str) -> None:
-        self.success.append(table_name)
-        logger.info("Table Scanned: {}".format(table_name))
+    def scanned(self, record: str) -> None:
+        self.success.append(record)
+        logger.info(f"Table Scanned: {record}")
 
-    def filter(
-        self, table_name: str, err: str, dataset_name: str = None, col_type: str = None
-    ) -> None:
-        self.filtered.append(table_name)
-        logger.warning("Dropped Table {} due to {}".format(table_name, err))
+    def filter(self, record: str, err: str) -> None:
+        self.filtered.append(record)
+        logger.warning(f"Dropped Table {record} due to {err}")
 
 
 def build_sql_source_connection_url(
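
A minimal usage sketch of the slimmed-down status API after this change (the service and table names are made up for illustration):

status = SQLSourceStatus()
status.scanned("mysql_service.dim_orders")
status.filter("mysql_service.tmp_orders", "Table pattern not allowed")

print(status.success)   # ['mysql_service.dim_orders']
print(status.filtered)  # ['mysql_service.tmp_orders']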
@@ -76,8 +81,11 @@ def build_sql_source_connection_url(
     username: Optional[str] = None,
     password: Optional[SecretStr] = None,
     database: Optional[str] = None,
-    options: dict = {},
+    options: Optional[dict] = None,
 ) -> str:
+    """
+    Helper function to prepare the db URL
+    """
 
     url = f"{scheme}://"
     if username is not None:
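
Replacing options: dict = {} with Optional[dict] = None addresses pylint's dangerous-default-value check (W0102): a mutable default is created once and then shared across calls. A standalone sketch of the pitfall, with illustrative function names that are not part of the source:

def add_option_shared(key, value, options: dict = {}):
    # The same dict instance backs every call that relies on the default
    options[key] = value
    return options


def add_option_fresh(key, value, options: dict = None):
    # A new dict is created on each call when no argument is passed
    options = options if options is not None else {}
    options[key] = value
    return options


print(add_option_shared("a", 1))  # {'a': 1}
print(add_option_shared("b", 2))  # {'a': 1, 'b': 2}  <- state leaked between calls
print(add_option_fresh("a", 1))   # {'a': 1}
print(add_option_fresh("b", 2))   # {'b': 2}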
@@ -100,6 +108,12 @@ def build_sql_source_connection_url(
 
 class SQLConnectionConfig(ConfigModel):
+    """
+    Config class containing all supported
+    configurations for an SQL source, including
+    data profiling and DBT generated information.
+    """
+
     username: Optional[str] = None
     password: Optional[SecretStr] = None
     host_port: str
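
The password field stays typed as pydantic's SecretStr, which masks the value when the model is printed or logged. A small sketch of that behaviour with a toy model (not the actual config class):

from typing import Optional

from pydantic import BaseModel, SecretStr


class DemoConfig(BaseModel):
    username: Optional[str] = None
    password: Optional[SecretStr] = None


cfg = DemoConfig(username="ingestion_bot", password="s3cret")
print(cfg)                              # password=SecretStr('**********')
print(cfg.password.get_secret_value())  # 's3cret', only when explicitly requested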
@@ -144,7 +158,8 @@ def _get_table_description(schema: str, table: str, inspector: Inspector) -> str
     description = None
     try:
         table_info: dict = inspector.get_table_comment(table, schema)
-    except Exception as err:
+    # Catch any exception without breaking the ingestion
+    except Exception as err:  # pylint: disable=broad-except
         logger.error(f"Table Description Error : {err}")
     else:
         description = table_info["text"]
@@ -152,6 +167,12 @@ def _get_table_description(schema: str, table: str, inspector: Inspector) -> str
 
 class SQLSource(Source[OMetaDatabaseAndTable]):
+    """
+    Source Connector implementation to extract
+    Database & Table information and convert it
+    to OpenMetadata Entities
+    """
+
     def __init__(
         self,
         config: SQLConnectionConfig,
@@ -174,26 +195,38 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
         self.data_profiler = None
         self.data_models = {}
         if self.config.dbt_catalog_file is not None:
-            self.dbt_catalog = json.load(open(self.config.dbt_catalog_file, "r"))
+            with open(self.config.dbt_catalog_file, "r", encoding="utf-8") as catalog:
+                self.dbt_catalog = json.load(catalog)
         if self.config.dbt_manifest_file is not None:
-            self.dbt_manifest = json.load(open(self.config.dbt_manifest_file, "r"))
+            with open(self.config.dbt_manifest_file, "r", encoding="utf-8") as manifest:
+                self.dbt_manifest = json.load(manifest)
 
-    def _instantiate_profiler(self):
+    def _instantiate_profiler(self) -> bool:
+        """
+        If the profiler is configured, load it and run.
+        Return True if the profiling ran correctly
+        """
         try:
             if self.config.data_profiler_enabled:
                 if self.data_profiler is None:
+                    # pylint: disable=import-outside-toplevel
                     from metadata.profiler.dataprofiler import DataProfiler
 
+                    # pylint: enable=import-outside-toplevel
                     self.data_profiler = DataProfiler(
                         status=self.status, connection_str=self.connection_string
                     )
                 return True
-            return False
-        except Exception:
+        # Catch any errors during profiling init and continue ingestion
+        except Exception as exc:  # pylint: disable=broad-except
             logger.error(
+                f"Error loading profiler {exc}"
                 "DataProfiler configuration is enabled. Please make sure you ran "
                 "pip install 'openmetadata-ingestion[data-profiler]'"
             )
+        return False
 
     def prepare(self):
         self._parse_data_model()
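
The dbt file handling now uses context managers with an explicit encoding, which addresses pylint's consider-using-with (R1732) and unspecified-encoding (W1514) checks. A minimal standalone sketch with a hypothetical file path:

import json

# Before: the handle returned by open() is never closed explicitly and no encoding is given
# dbt_catalog = json.load(open("catalog.json", "r"))

# After: the with-block closes the file even on errors, and the encoding is explicit
with open("catalog.json", "r", encoding="utf-8") as catalog:
    dbt_catalog = json.load(catalog)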
@@ -204,15 +237,15 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
     ):
         pass
 
-    def type_of_column_name(self, sa_type, table_name: str, column_name: str):
-        return sa_type
-
-    def standardize_schema_table_names(
-        self, schema: str, table: str
-    ) -> Tuple[str, str]:
+    @staticmethod
+    def standardize_schema_table_names(schema: str, table: str) -> Tuple[str, str]:
         return schema, table
 
-    def fetch_sample_data(self, schema: str, table: str):
+    def fetch_sample_data(self, schema: str, table: str) -> Optional[TableData]:
+        """
+        Get some sample data from the source to be added
+        to the Table Entities
+        """
         try:
             query = self.config.query.format(schema, table)
             logger.info(query)
@@ -221,14 +254,14 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
             for col in results.keys():
                 cols.append(col.replace(".", "_DOT_"))
             rows = []
-            for r in results:
-                row = list(r)
+            for res in results:
+                row = list(res)
                 rows.append(row)
             return TableData(columns=cols, rows=rows)
-        except Exception as err:
-            logger.error(
-                "Failed to generate sample data for {} - {}".format(table, err)
-            )
+        # Catch any errors and continue the ingestion
+        except Exception as err:  # pylint: disable=broad-except
+            logger.error(f"Failed to generate sample data for {table} - {err}")
+            return None
 
     def next_record(self) -> Iterable[OMetaDatabaseAndTable]:
         inspector = inspect(self.engine)
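
fetch_sample_data now returns Optional[TableData] and ends every path with an explicit return, in line with pylint's inconsistent-return-statements check (R1710). A generic sketch of the pattern (the lookup functions are illustrative only):

from typing import Optional


def lookup_implicit(data: dict, key: str):
    if key in data:
        return data[key]
    # Falling off the end silently returns None -> R1710 flags the mixed style


def lookup_explicit(data: dict, key: str) -> Optional[str]:
    if key in data:
        return data[key]
    return None  # every path returns explicitly, and the hint documents it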
@@ -236,7 +269,7 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
             if not self.sql_config.schema_filter_pattern.included(schema):
                 self.status.filter(schema, "Schema pattern not allowed")
                 continue
-            logger.debug("total tables {}".format(inspector.get_table_names(schema)))
+            logger.debug(f"Total tables {inspector.get_table_names(schema)}")
             if self.config.include_tables:
                 yield from self.fetch_tables(inspector, schema)
             if self.config.include_views:
@@ -245,6 +278,10 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
     def fetch_tables(
         self, inspector: Inspector, schema: str
     ) -> Iterable[OMetaDatabaseAndTable]:
+        """
+        Scrape an SQL schema and prepare Database and Table
+        OpenMetadata Entities
+        """
         for table_name in inspector.get_table_names(schema):
             try:
                 schema, table_name = self.standardize_schema_table_names(
@@ -252,13 +289,11 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                 )
                 if not self.sql_config.table_filter_pattern.included(table_name):
                     self.status.filter(
-                        "{}.{}".format(self.config.get_service_name(), table_name),
+                        f"{self.config.get_service_name()}.{table_name}",
                         "Table pattern not allowed",
                     )
                     continue
-                self.status.scanned(
-                    "{}.{}".format(self.config.get_service_name(), table_name)
-                )
+                self.status.scanned(f"{self.config.get_service_name()}.{table_name}")
 
                 description = _get_table_description(schema, table_name, inspector)
                 fqn = f"{self.config.service_name}.{self.config.database}.{schema}.{table_name}"
@@ -275,7 +310,8 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                 if self.sql_config.generate_sample_data:
                     table_data = self.fetch_sample_data(schema, table_name)
                     table_entity.sampleData = table_data
 
-            except Exception as err:
+            # Catch any errors during the ingestion and continue
+            except Exception as err:  # pylint: disable=broad-except
                 logger.error(repr(err))
                 logger.error(err)
@@ -291,16 +327,19 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                     table=table_entity, database=self._get_database(schema)
                 )
                 yield table_and_db
-            except Exception as err:
+            # Catch any errors during the ingestion and continue
+            except Exception as err:  # pylint: disable=broad-except
                 logger.error(err)
-                self.status.warnings.append(
-                    "{}.{}".format(self.config.service_name, table_name)
-                )
+                self.status.warnings.append(f"{self.config.service_name}.{table_name}")
                 continue
 
     def fetch_views(
         self, inspector: Inspector, schema: str
     ) -> Iterable[OMetaDatabaseAndTable]:
+        """
+        Get all views in the SQL schema and prepare
+        Database & Table OpenMetadata Entities
+        """
         for view_name in inspector.get_view_names(schema):
             try:
                 if self.config.scheme == "bigquery":
@@ -309,12 +348,11 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                     )
                 if not self.sql_config.table_filter_pattern.included(view_name):
                     self.status.filter(
-                        "{}.{}".format(self.config.get_service_name(), view_name),
+                        f"{self.config.get_service_name()}.{view_name}",
                         "View pattern not allowed",
                     )
                     continue
-
                 try:
                     if self.config.scheme == "bigquery":
                         view_definition = inspector.get_view_definition(
                             f"{self.config.project_id}.{schema}.{view_name}"
@@ -329,17 +367,15 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                 except NotImplementedError:
                     view_definition = ""
 
-                description = _get_table_description(schema, view_name, inspector)
-                table_columns = self._get_columns(schema, view_name, inspector)
-                view_name = view_name.replace(".", "_DOT_")
-                fqn = f"{self.config.service_name}.{self.config.database}.{schema}.{view_name}"
                 table = Table(
                     id=uuid.uuid4(),
-                    name=view_name,
+                    name=view_name.replace(".", "_DOT_"),
                     tableType="View",
-                    description=description if description is not None else " ",
-                    fullyQualifiedName=fqn,
-                    columns=table_columns,
+                    description=_get_table_description(schema, view_name, inspector)
+                    or "",
+                    # This will be generated in the backend!! #1673
+                    fullyQualifiedName=view_name,
+                    columns=self._get_columns(schema, view_name, inspector),
                     viewDefinition=view_definition,
                 )
                 if self.sql_config.generate_sample_data:
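
The description handling collapses the conditional expression into `description or ""`. The two forms are not strictly identical: `or` also substitutes when the left side is an empty string, not only when it is None, which is acceptable here because the fallback is an empty description anyway. A quick sketch:

description = None

# Old style: only substitute when the value is exactly None
value = description if description is not None else " "

# New style: substitute whenever the value is falsy (None or "")
value = description or ""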
@@ -350,31 +386,34 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                     table=table, database=self._get_database(schema)
                 )
                 yield table_and_db
-            except Exception as err:
+            # Catch any errors and continue the ingestion
+            except Exception as err:  # pylint: disable=broad-except
                 logger.error(err)
-                self.status.warnings.append(
-                    "{}.{}".format(self.config.service_name, view_name)
-                )
+                self.status.warnings.append(f"{self.config.service_name}.{view_name}")
                 continue
 
     def _parse_data_model(self):
+        """
+        Get all the DBT information and feed it to the Table Entity
+        """
         if self.config.dbt_manifest_file and self.config.dbt_catalog_file:
             logger.info("Parsing Data Models")
-            manifest_nodes = self.dbt_manifest["nodes"]
-            manifest_sources = self.dbt_manifest["sources"]
-            manifest_entities = {**manifest_nodes, **manifest_sources}
-            catalog_nodes = self.dbt_catalog["nodes"]
-            catalog_sources = self.dbt_catalog["sources"]
-            catalog_entities = {**catalog_nodes, **catalog_sources}
+            manifest_entities = {
+                **self.dbt_manifest["nodes"],
+                **self.dbt_manifest["sources"],
+            }
+            catalog_entities = {
+                **self.dbt_catalog["nodes"],
+                **self.dbt_catalog["sources"],
+            }
 
             for key, mnode in manifest_entities.items():
                 name = mnode["alias"] if "alias" in mnode.keys() else mnode["name"]
                 cnode = catalog_entities.get(key)
-                if cnode is not None:
-                    columns = self._parse_data_model_columns(name, mnode, cnode)
-                else:
-                    columns = []
+                columns = (
+                    self._parse_data_model_columns(name, mnode, cnode) if cnode else []
+                )
 
                 if mnode["resource_type"] == "test":
                     continue
                 upstream_nodes = self._parse_data_model_upstream(mnode)
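
The manifest and catalog lookups are now built with dictionary unpacking instead of intermediate variables; the ** operator merges both mappings, with keys from the later dict overriding earlier ones. A tiny sketch with made-up dbt node keys:

nodes = {"model.jaffle_shop.orders": {"schema": "dbt"}}
sources = {"source.jaffle_shop.raw_orders": {"schema": "raw"}}

manifest_entities = {
    **nodes,
    **sources,
}
print(list(manifest_entities))
# ['model.jaffle_shop.orders', 'source.jaffle_shop.raw_orders']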
@@ -382,14 +421,12 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                     mnode["alias"] if "alias" in mnode.keys() else mnode["name"]
                 )
                 model_name = model_name.replace(".", "_DOT_")
-                description = mnode.get("description", "")
                 schema = mnode["schema"]
-                path = f"{mnode['root_path']}/{mnode['original_file_path']}"
                 raw_sql = mnode.get("raw_sql", "")
                 model = DataModel(
                     modelType=ModelType.DBT,
-                    description=description,
-                    path=path,
+                    description=mnode.get("description", ""),
+                    path=f"{mnode['root_path']}/{mnode['original_file_path']}",
                     rawSql=raw_sql,
                     sql=mnode.get("compiled_sql", raw_sql),
                     columns=columns,
@@ -403,12 +440,14 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
         if "depends_on" in mnode and "nodes" in mnode["depends_on"]:
             for node in mnode["depends_on"]["nodes"]:
                 try:
-                    node_type, database, table = node.split(".", 2)
+                    _, database, table = node.split(".", 2)
                     table = table.replace(".", "_DOT_")
                     table_fqn = f"{self.config.service_name}.{database}.{table}"
                     upstream_nodes.append(table_fqn)
-                except Exception:
-                    logger.error(f"Failed to parse the node {node} to capture lineage")
+                except Exception as err:  # pylint: disable=broad-except
+                    logger.error(
+                        f"Failed to parse the node {node} to capture lineage {err}"
+                    )
                     continue
         return upstream_nodes
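
In the lineage parsing, the unused node_type is replaced with the conventional `_` placeholder and the caught error is now included in the log message. A short sketch of how the maxsplit argument keeps dots inside the table part intact (the node id is made up):

node = "model.jaffle_shop.stg.orders"

# maxsplit=2 splits on the first two dots only; the remainder stays in `table`
_, database, table = node.split(".", 2)
print(database)  # jaffle_shop
print(table)     # stg.orders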
@@ -443,8 +482,8 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                     ordinalPosition=ccolumn["index"],
                 )
                 columns.append(col)
-            except Exception as e:
-                logger.error(f"Failed to parse column type due to {e}")
+            except Exception as err:  # pylint: disable=broad-except
+                logger.error(f"Failed to parse column type due to {err}")
 
         return columns
@@ -454,33 +493,58 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
             service=EntityReference(id=self.service.id, type=self.config.service_type),
         )
 
-    def parse_raw_data_type(self, raw_data_type):
-        return raw_data_type
+    @staticmethod
+    def _get_column_constraints(
+        column, pk_columns, unique_columns
+    ) -> Optional[Constraint]:
+        """
+        Prepare column constraints for the Table Entity
+        """
+        constraint = None
+        if column["nullable"]:
+            constraint = Constraint.NULL
+        elif not column["nullable"]:
+            constraint = Constraint.NOT_NULL
+        if column["name"] in pk_columns:
+            constraint = Constraint.PRIMARY_KEY
+        elif column["name"] in unique_columns:
+            constraint = Constraint.UNIQUE
+        return constraint
 
     def _get_columns(
         self, schema: str, table: str, inspector: Inspector
-    ) -> List[Column]:
+    ) -> Optional[List[Column]]:
+        """
+        Get columns types and constraints information
+        """
+        # Get inspector information:
         pk_constraints = inspector.get_pk_constraint(table, schema)
+        try:
+            unique_constraints = inspector.get_unique_constraints(table, schema)
+        except NotImplementedError:
+            logger.warning("Cannot obtain constraints - NotImplementedError")
+            unique_constraints = []
+
         pk_columns = (
             pk_constraints["column_constraints"]
             if len(pk_constraints) > 0 and "column_constraints" in pk_constraints.keys()
             else {}
         )
-        unique_constraints = []
-        try:
-            unique_constraints = inspector.get_unique_constraints(table, schema)
-        except NotImplementedError:
-            pass
-        unique_columns = []
-        for constraint in unique_constraints:
-            if "column_names" in constraint.keys():
-                unique_columns = constraint["column_names"]
+
+        unique_columns = [
+            constraint["column_names"]
+            for constraint in unique_constraints
+            if "column_names" in constraint.keys()
+        ]
+
         dataset_name = f"{schema}.{table}"
-        columns = inspector.get_columns(table, schema)
         table_columns = []
-        row_order = 1
+
         try:
-            for column in columns:
+            for row_order, column in enumerate(inspector.get_columns(table, schema)):
                 if "." in column["name"]:
                     logger.info(f"Found '.' in {column['name']}")
                     column["name"] = column["name"].replace(".", "_DOT_")
@@ -489,9 +553,6 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                 col_data_length = None
                 arr_data_type = None
                 if "raw_data_type" in column and column["raw_data_type"] is not None:
-                    column["raw_data_type"] = self.parse_raw_data_type(
-                        column["raw_data_type"]
-                    )
                     (
                         col_type,
                         data_type_display,
@@ -514,26 +575,16 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                         r"(?:\w*)(?:[(]*)(\w*)(?:.*)", str(column["type"])
                     ).groups()
                     data_type_display = column["type"]
-                col_constraint = None
-                if column["nullable"]:
-                    col_constraint = Constraint.NULL
-                elif not column["nullable"]:
-                    col_constraint = Constraint.NOT_NULL
-                if column["name"] in pk_columns:
-                    col_constraint = Constraint.PRIMARY_KEY
-                elif column["name"] in unique_columns:
-                    col_constraint = Constraint.UNIQUE
-                if col_type.upper() in ["CHAR", "VARCHAR", "BINARY", "VARBINARY"]:
+
+                col_constraint = self._get_column_constraints(
+                    column, pk_columns, unique_columns
+                )
+
+                if col_type.upper() in {"CHAR", "VARCHAR", "BINARY", "VARBINARY"}:
                     col_data_length = column["type"].length
                     if col_data_length is None:
                         col_data_length = 1
                 try:
-                    if col_type == "NULL":
-                        col_type = self.type_of_column_name(
-                            col_type,
-                            column_name=column["name"],
-                            table_name=dataset_name,
-                        )
                     if col_type == "NULL":
                         col_type = "VARCHAR"
                         data_type_display = "varchar"
@@ -544,27 +595,32 @@ class SQLSource(Source[OMetaDatabaseAndTable]):
                         name=column["name"],
                         description=column.get("comment", None),
                         dataType=col_type,
-                        dataTypeDisplay="{}({})".format(col_type, col_data_length)
+                        dataTypeDisplay=f"{col_type}({col_data_length})"
                         if data_type_display is None
                         else f"{data_type_display}",
                         dataLength=col_data_length,
                         constraint=col_constraint,
-                        ordinalPosition=row_order,
-                        children=children if children is not None else None,
+                        ordinalPosition=row_order + 1,  # enumerate starts at 0
+                        children=children,
                         arrayDataType=arr_data_type,
                     )
-                except Exception as err:
+                except Exception as err:  # pylint: disable=broad-except
                     logger.error(traceback.format_exc())
                     logger.error(traceback.print_exc())
                     logger.error(f"{err} : {column}")
                     continue
                 table_columns.append(om_column)
-                row_order = row_order + 1
             return table_columns
-        except Exception as err:
-            logger.error("{}: {} {}".format(repr(err), table, err))
+        except Exception as err:  # pylint: disable=broad-except
+            logger.error(f"{repr(err)}: {table} {err}")
+            return None
 
     def run_data_profiler(self, table: str, schema: str) -> TableProfile:
+        """
+        Run the profiler for a table in a schema.
+        Prepare specific namings for different sources, e.g. bigquery
+        """
         dataset_name = f"{schema}.{table}"
         self.status.scanned(f"profile of {dataset_name}")
         logger.info(