Metadata Source Lint (#8018)

This commit is contained in:
Mayur Singal 2022-10-10 10:16:59 +05:30 committed by GitHub
parent b8e989af6c
commit 13f6f79f30
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 133 additions and 86 deletions

View File

@ -9,12 +9,15 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""
Amundsen source to extract metadata
"""
import traceback import traceback
from typing import Iterable, List, Optional from typing import Iterable, List, Optional
from pydantic import SecretStr from pydantic import SecretStr
from sqlalchemy.engine.url import make_url from sqlalchemy.engine.url import make_url
from tomlkit import table
from metadata.clients.neo4j_client import Neo4JConfig, Neo4jHelper from metadata.clients.neo4j_client import Neo4JConfig, Neo4jHelper
from metadata.config.common import ConfigModel from metadata.config.common import ConfigModel
@ -100,25 +103,31 @@ SUPERSET_DEFAULT_CONFIG = {
class AmundsenStatus(SourceStatus): class AmundsenStatus(SourceStatus):
success: List[str] = list() success: List[str] = []
failures: List[str] = list() failures: List[str] = []
warnings: List[str] = list() warnings: List[str] = []
filtered: List[str] = list() filtered: List[str] = []
def scanned(self, entity_name: str) -> None: def scanned(self, record: str) -> None:
self.success.append(entity_name) self.success.append(record)
logger.info("Entity Scanned: {}".format(entity_name)) logger.info(f"Entity Scanned: {record}")
def failure(self, key: str, reason: str) -> None: def failure(self, key: str, reason: str) -> None:
self.failures.append({key: reason}) self.failures.append({key: reason})
class AmundsenSource(Source[Entity]): class AmundsenSource(Source[Entity]):
"""
Amundsen source class
"""
dashboard_service: DashboardService dashboard_service: DashboardService
def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection): def __init__(self, config: WorkflowSource, metadata_config: OpenMetadataConnection):
self.config = config self.config = config
self.metadata_config = metadata_config self.metadata_config = metadata_config
self.database_schema_object = None
self.database_object = None
self.metadata = OpenMetadata(self.metadata_config) self.metadata = OpenMetadata(self.metadata_config)
self.service_connection = self.config.serviceConnection.__root__.config self.service_connection = self.config.serviceConnection.__root__.config
neo4j_config = Neo4JConfig( neo4j_config = Neo4JConfig(
@ -207,13 +216,14 @@ class AmundsenSource(Source[Entity]):
entity=DatabaseService, fqn=service_url.get_backend_name() entity=DatabaseService, fqn=service_url.get_backend_name()
) )
if service_entity: if service_entity:
table_fqn = "{service}.{database_schema}.{table}".format( service = service_url.get_backend_name()
service=service_url.get_backend_name(), database_schema = (
database_schema=service_url.host service_url.host
if hasattr(service_entity.connection.config, "supportsDatabase") if hasattr(service_entity.connection.config, "supportsDatabase")
else f"default.{service_url.host.split('.')[-1]}", else f"default.{service_url.host.split('.')[-1]}"
table=service_url.database,
) )
table = service_url.database
table_fqn = f"{service}.{database_schema}.{table}"
table_entity: Table = self.metadata.get_by_name( table_entity: Table = self.metadata.get_by_name(
entity=Table, fqn=table_fqn entity=Table, fqn=table_fqn
) )
@ -245,11 +255,9 @@ class AmundsenSource(Source[Entity]):
yield tag_category yield tag_category
logger.info(f"Tag Category {tag_category}, Primary Tag {tag} Ingested") logger.info(f"Tag Category {tag_category}, Primary Tag {tag} Ingested")
def create_table_entity(self, table): def _yield_create_database(self, table):
try: try:
service_name = table["database"] service_entity = self.get_database_service(table["database"])
# TODO: use metadata.get_service_or_create
service_entity = self.get_database_service(service_name)
database_request = CreateDatabaseRequest( database_request = CreateDatabaseRequest(
name=table["cluster"] name=table["cluster"]
@ -263,32 +271,47 @@ class AmundsenSource(Source[Entity]):
database_fqn = fqn.build( database_fqn = fqn.build(
self.metadata, self.metadata,
entity_type=Database, entity_type=Database,
service_name=service_name, service_name=table["database"],
database_name=database_request.name.__root__, database_name=table["cluster"],
) )
database_object = self.metadata.get_by_name( self.database_object = self.metadata.get_by_name(
entity=Database, fqn=database_fqn entity=Database, fqn=database_fqn
) )
except Exception as err:
logger.error(f"Failed to Ingest database due to - {err}")
logger.debug(traceback.format_exc())
def _yield_create_database_schema(self, table):
try:
database_schema_request = CreateDatabaseSchemaRequest( database_schema_request = CreateDatabaseSchemaRequest(
name=table["schema"], name=table["schema"],
database=EntityReference(id=database_object.id, type="database"), database=EntityReference(id=self.database_object.id, type="database"),
) )
yield database_schema_request yield database_schema_request
database_schema_fqn = fqn.build( database_schema_fqn = fqn.build(
self.metadata, self.metadata,
entity_type=DatabaseSchema, entity_type=DatabaseSchema,
service_name=service_name, service_name=table["database"],
database_name=database_request.name.__root__, database_name=table["cluster"],
schema_name=database_schema_request.name.__root__, schema_name=database_schema_request.name.__root__,
) )
database_schema_object = self.metadata.get_by_name( self.database_schema_object = self.metadata.get_by_name(
entity=DatabaseSchema, fqn=database_schema_fqn entity=DatabaseSchema, fqn=database_schema_fqn
) )
except Exception as err:
logger.error(f"Failed to Ingest database due to - {err}")
logger.debug(traceback.format_exc())
def create_table_entity(self, table):
"""
Process table details and return CreateTableRequest
"""
try:
yield from self._yield_create_database(table)
yield from self._yield_create_database_schema(table)
columns: List[Column] = [] columns: List[Column] = []
if len(table["column_names"]) == len(table["column_descriptions"]): if len(table["column_names"]) == len(table["column_descriptions"]):
# zipping on column_descriptions can cause incorrect or no ingestion # zipping on column_descriptions can cause incorrect or no ingestion
@ -308,7 +331,9 @@ class AmundsenSource(Source[Entity]):
# Amundsen merges the length into type itself. Instead of making changes to our generic type builder # Amundsen merges the length into type itself. Instead of making changes to our generic type builder
# we will do a type match and see if it matches any primitive types and return a type # we will do a type match and see if it matches any primitive types and return a type
data_type = self.get_type_primitive_type(data_type) data_type = self.get_type_primitive_type(data_type)
parsed_string = ColumnTypeParser._parse_datatype_string(data_type) parsed_string = ColumnTypeParser._parse_datatype_string( # pylint: disable=protected-access
data_type
) # pylint: disable=protected-access
parsed_string["name"] = name parsed_string["name"] = name
parsed_string["dataLength"] = 1 parsed_string["dataLength"] = 1
parsed_string["description"] = description parsed_string["description"] = description
@ -383,7 +408,7 @@ class AmundsenSource(Source[Entity]):
tableType="Regular", tableType="Regular",
description=table["description"], description=table["description"],
databaseSchema=EntityReference( databaseSchema=EntityReference(
id=database_schema_object.id, type="databaseSchema" id=self.database_schema_object.id, type="databaseSchema"
), ),
tags=tags, tags=tags,
columns=columns, columns=columns,
@ -396,7 +421,6 @@ class AmundsenSource(Source[Entity]):
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.warning(f"Failed to create table entity [{table}]: {exc}") logger.warning(f"Failed to create table entity [{table}]: {exc}")
self.status.failure(table["name"], str(exc)) self.status.failure(table["name"], str(exc))
return None
def create_dashboard_service(self, dashboard: dict): def create_dashboard_service(self, dashboard: dict):
service_name = dashboard["cluster"] service_name = dashboard["cluster"]
@ -412,6 +436,9 @@ class AmundsenSource(Source[Entity]):
) )
def create_dashboard_entity(self, dashboard): def create_dashboard_entity(self, dashboard):
"""
Method to process dashboard and return CreateDashboardRequest
"""
try: try:
self.status.scanned(dashboard["name"]) self.status.scanned(dashboard["name"])
yield CreateDashboardRequest( yield CreateDashboardRequest(
@ -432,7 +459,6 @@ class AmundsenSource(Source[Entity]):
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.warning(f"Failed to create dashboard entity [{dashboard}]: {exc}") logger.warning(f"Failed to create dashboard entity [{dashboard}]: {exc}")
self.status.failure(dashboard["name"], str(exc)) self.status.failure(dashboard["name"], str(exc))
return None
def create_chart_entity(self, dashboard): def create_chart_entity(self, dashboard):
for (name, chart_id, chart_type, url) in zip( for (name, chart_id, chart_type, url) in zip(
@ -471,8 +497,8 @@ class AmundsenSource(Source[Entity]):
service = self.metadata.get_by_name(entity=DatabaseService, fqn=service_name) service = self.metadata.get_by_name(entity=DatabaseService, fqn=service_name)
if service is not None: if service is not None:
return service return service
else:
logger.error(f"Please create a service with name {service_name}") logger.error(f"Please create a service with name {service_name}")
return None
def test_connection(self) -> None: def test_connection(self) -> None:
pass pass

View File

@ -1,3 +1,18 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Atlas source to extract metadata
"""
import traceback import traceback
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
@ -40,8 +55,8 @@ logger = ingestion_logger()
class AtlasSourceStatus(SourceStatus): class AtlasSourceStatus(SourceStatus):
tables_scanned: List[str] = list() tables_scanned: List[str] = []
filtered: List[str] = list() filtered: List[str] = []
def table_scanned(self, table: str) -> None: def table_scanned(self, table: str) -> None:
self.tables_scanned.append(table) self.tables_scanned.append(table)
@ -52,6 +67,10 @@ class AtlasSourceStatus(SourceStatus):
@dataclass @dataclass
class AtlasSource(Source): class AtlasSource(Source):
"""
Atlas source class
"""
config: WorkflowSource config: WorkflowSource
atlas_client: AtlasClient atlas_client: AtlasClient
status: AtlasSourceStatus status: AtlasSourceStatus
@ -63,7 +82,6 @@ class AtlasSource(Source):
config: WorkflowSource, config: WorkflowSource,
metadata_config: OpenMetadataConnection, metadata_config: OpenMetadataConnection,
): ):
super().__init__()
self.config = config self.config = config
self.metadata_config = metadata_config self.metadata_config = metadata_config
self.metadata = OpenMetadata(metadata_config) self.metadata = OpenMetadata(metadata_config)
@ -78,11 +96,18 @@ class AtlasSource(Source):
if not path.is_file(): if not path.is_file():
logger.error(f"File not found {self.service_connection.entityTypes}") logger.error(f"File not found {self.service_connection.entityTypes}")
raise FileNotFoundError() raise FileNotFoundError()
with open(self.service_connection.entityTypes, "r") as f: with open(
self.service_connection.entityTypes = yaml.load(f, Loader=yaml.SafeLoader) self.service_connection.entityTypes, "r", encoding="utf-8"
) as entity_types_file:
self.service_connection.entityTypes = yaml.load(
entity_types_file, Loader=yaml.SafeLoader
)
self.tables: Dict[str, Any] = {} self.tables: Dict[str, Any] = {}
self.topics: Dict[str, Any] = {} self.topics: Dict[str, Any] = {}
self.service = None
self.message_service = None
@classmethod @classmethod
def create(cls, config_dict, metadata_config: OpenMetadataConnection): def create(cls, config_dict, metadata_config: OpenMetadataConnection):
@ -98,7 +123,6 @@ class AtlasSource(Source):
""" """
Not required to implement Not required to implement
""" """
pass
def next_record(self): def next_record(self):
for key in self.service_connection.entityTypes["Table"].keys(): for key in self.service_connection.entityTypes["Table"].keys():
@ -121,13 +145,15 @@ class AtlasSource(Source):
yield from self._parse_topic_entity(topic) yield from self._parse_topic_entity(topic)
def close(self): def close(self):
return super().close() """
Not required to implement
"""
def get_status(self) -> SourceStatus: def get_status(self) -> SourceStatus:
return self.status return self.status
def _parse_topic_entity(self, name): def _parse_topic_entity(self, name):
for key in self.topics.keys(): for key in self.topics:
topic_entity = self.atlas_client.get_entity(self.topics[key]) topic_entity = self.atlas_client.get_entity(self.topics[key])
tpc_entities = topic_entity["entities"] tpc_entities = topic_entity["entities"]
for tpc_entity in tpc_entities: for tpc_entity in tpc_entities:
@ -164,55 +190,45 @@ class AtlasSource(Source):
db_entity = tbl_entity["relationshipAttributes"][ db_entity = tbl_entity["relationshipAttributes"][
self.service_connection.entityTypes["Table"][name]["db"] self.service_connection.entityTypes["Table"][name]["db"]
] ]
database_request = self.get_database_entity( yield self.get_database_entity(db_entity["displayText"])
db_entity["displayText"]
)
table_name = tbl_attrs["name"]
tbl_description = tbl_attrs["description"]
yield database_request
database_fqn = fqn.build( database_fqn = fqn.build(
self.metadata, self.metadata,
entity_type=Database, entity_type=Database,
service_name=self.service_connection.dbService, service_name=self.service_connection.dbService,
database_name=database_request.name.__root__, database_name=db_entity["displayText"],
) )
database_object = self.metadata.get_by_name( database_object = self.metadata.get_by_name(
entity=Database, fqn=database_fqn entity=Database, fqn=database_fqn
) )
tbl_attrs = tbl_entity["attributes"] yield CreateDatabaseSchemaRequest(
db_entity = tbl_entity["relationshipAttributes"]["db"]
database_schema_request = CreateDatabaseSchemaRequest(
name=db_entity["displayText"], name=db_entity["displayText"],
database=EntityReference( database=EntityReference(
id=database_object.id, type="database" id=database_object.id, type="database"
), ),
) )
yield database_schema_request
database_schema_fqn = fqn.build( database_schema_fqn = fqn.build(
self.metadata, self.metadata,
entity_type=DatabaseSchema, entity_type=DatabaseSchema,
service_name=self.config.serviceConnection.__root__.config.dbService, service_name=self.config.serviceConnection.__root__.config.dbService,
database_name=database_request.name.__root__, database_name=db_entity["displayText"],
schema_name=database_schema_request.name.__root__, schema_name=db_entity["displayText"],
) )
database_schema_object = self.metadata.get_by_name( database_schema_object = self.metadata.get_by_name(
entity=DatabaseSchema, fqn=database_schema_fqn entity=DatabaseSchema, fqn=database_schema_fqn
) )
table_request = CreateTableRequest( yield CreateTableRequest(
name=table_name, name=tbl_attrs["name"],
databaseSchema=EntityReference( databaseSchema=EntityReference(
id=database_schema_object.id, type="databaseSchema" id=database_schema_object.id, type="databaseSchema"
), ),
description=tbl_description, description=tbl_attrs["description"],
columns=tbl_columns, columns=tbl_columns,
) )
yield table_request
yield from self.ingest_lineage(tbl_entity["guid"], name) yield from self.ingest_lineage(tbl_entity["guid"], name)
@ -239,7 +255,7 @@ class AtlasSource(Source):
dataType=ColumnTypeParser.get_column_type( dataType=ColumnTypeParser.get_column_type(
column["dataType"].upper() column["dataType"].upper()
), ),
dataTypeDisplay="{}({})".format(column["dataType"], "1"), dataTypeDisplay=column["dataType"],
dataLength=col_data_length, dataLength=col_data_length,
ordinalPosition=ordinal_pos, ordinalPosition=ordinal_pos,
) )
@ -257,6 +273,9 @@ class AtlasSource(Source):
) )
def ingest_lineage(self, source_guid, name) -> Iterable[AddLineageRequest]: def ingest_lineage(self, source_guid, name) -> Iterable[AddLineageRequest]:
"""
Fetch and ingest lineage
"""
lineage_response = self.atlas_client.get_lineage(source_guid) lineage_response = self.atlas_client.get_lineage(source_guid)
lineage_relations = lineage_response["relations"] lineage_relations = lineage_response["relations"]
tbl_entity = self.atlas_client.get_entity(lineage_response["baseEntityGuid"]) tbl_entity = self.atlas_client.get_entity(lineage_response["baseEntityGuid"])
@ -321,18 +340,19 @@ class AtlasSource(Source):
) )
yield lineage yield lineage
def get_lineage_entity_ref(self, fqn, metadata_config, type) -> EntityReference: def get_lineage_entity_ref(
self, to_fqn, metadata_config, entity_type
) -> EntityReference:
metadata = OpenMetadata(metadata_config) metadata = OpenMetadata(metadata_config)
if type == "table": if entity_type == "table":
table = metadata.get_by_name(entity=Table, fqn=fqn) table = metadata.get_by_name(entity=Table, fqn=to_fqn)
if not table: if table:
return
return EntityReference(id=table.id.__root__, type="table") return EntityReference(id=table.id.__root__, type="table")
elif type == "pipeline": if entity_type == "pipeline":
pipeline = metadata.get_by_name(entity=Pipeline, fqn=fqn) pipeline = metadata.get_by_name(entity=Pipeline, fqn=to_fqn)
if not pipeline: if pipeline:
return
return EntityReference(id=pipeline.id.__root__, type="pipeline") return EntityReference(id=pipeline.id.__root__, type="pipeline")
return None
def test_connection(self) -> None: def test_connection(self) -> None:
pass pass

View File

@ -42,9 +42,9 @@ logger = ingestion_logger()
class MetadataSourceStatus(SourceStatus): class MetadataSourceStatus(SourceStatus):
success: List[str] = list() success: List[str] = []
failures: List[str] = list() failures: List[str] = []
warnings: List[str] = list() warnings: List[str] = []
def scanned_entity(self, entity_class_name: str, entity_name: str) -> None: def scanned_entity(self, entity_class_name: str, entity_name: str) -> None:
self.success.append(entity_name) self.success.append(entity_name)
@ -59,6 +59,9 @@ class MetadataSourceStatus(SourceStatus):
class MetadataSource(Source[Entity]): class MetadataSource(Source[Entity]):
"""
Metadata Source to Fetch All Entities from backend
"""
config: WorkflowSource config: WorkflowSource
report: SourceStatus report: SourceStatus
@ -85,7 +88,7 @@ class MetadataSource(Source[Entity]):
def create(cls, config_dict, metadata_config: OpenMetadataConnection): def create(cls, config_dict, metadata_config: OpenMetadataConnection):
raise NotImplementedError("Create Method not implemented") raise NotImplementedError("Create Method not implemented")
def next_record(self) -> Iterable[Entity]: def next_record(self) -> Iterable[Entity]: # pylint: disable=too-many-branches
if self.service_connection.includeTables: if self.service_connection.includeTables:
yield from self.fetch_entities( yield from self.fetch_entities(
entity_class=Table, entity_class=Table,

View File

@ -27,6 +27,10 @@ logger = ingestion_logger()
class MetadataElasticsearchSource(MetadataSource): class MetadataElasticsearchSource(MetadataSource):
"""
Metadata Elastic Search Source
Used for metadata to ES pipeline
"""
config: WorkflowSource config: WorkflowSource
report: SourceStatus report: SourceStatus

View File

@ -70,17 +70,14 @@ class DatabaseServiceWrapper:
class MigrateSource(MetadataSource): class MigrateSource(MetadataSource):
"""
Metadata Migrate source module
to migrate from 0.9 from 0.10
"""
config: WorkflowSource config: WorkflowSource
report: SourceStatus report: SourceStatus
def __init__(
self,
config: WorkflowSource,
metadata_config: OpenMetadataConnection,
):
super().__init__(config, metadata_config)
@classmethod @classmethod
def create(cls, config_dict, metadata_config: OpenMetadataConnection): def create(cls, config_dict, metadata_config: OpenMetadataConnection):
config: WorkflowSource = WorkflowSource.parse_obj(config_dict) config: WorkflowSource = WorkflowSource.parse_obj(config_dict)
@ -92,6 +89,9 @@ class MigrateSource(MetadataSource):
return cls(config, metadata_config) return cls(config, metadata_config)
def next_record(self) -> Iterable[Entity]: def next_record(self) -> Iterable[Entity]:
"""
Fetch all relebvent entities
"""
if self.service_connection.includeTables: if self.service_connection.includeTables:
yield from self.fetch_tables( yield from self.fetch_tables(
fields=[ fields=[

View File

@ -25,17 +25,11 @@ logger = ingestion_logger()
class OpenmetadataSource(MetadataSource): class OpenmetadataSource(MetadataSource):
"""Metadata source Class"""
config: WorkflowSource config: WorkflowSource
report: SourceStatus report: SourceStatus
def __init__(
self,
config: WorkflowSource,
metadata_config: OpenMetadataConnection,
):
super().__init__(config, metadata_config)
@classmethod @classmethod
def create(cls, config_dict, metadata_config: OpenMetadataConnection): def create(cls, config_dict, metadata_config: OpenMetadataConnection):
config: WorkflowSource = WorkflowSource.parse_obj(config_dict) config: WorkflowSource = WorkflowSource.parse_obj(config_dict)