Moved view lineage to post process (#6585)

Co-authored-by: Onkar Ravgan <onkarravgan@Onkars-MacBook-Pro.local>
This commit is contained in:
Onkar Ravgan 2022-08-04 23:33:17 +05:30 committed by GitHub
parent b37bca5471
commit 3d93aaad53
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 92 additions and 87 deletions

View File

@ -90,9 +90,10 @@ class TopologyRunnerMixin(Generic[C]):
if node.post_process: if node.post_process:
logger.debug(f"Post processing node {node}") logger.debug(f"Post processing node {node}")
node_post_process = getattr(self, node.post_process) for process in node.post_process:
for entity_request in node_post_process(): node_post_process = getattr(self, process)
yield entity_request for entity_request in node_post_process():
yield entity_request
def next_record(self) -> Iterable[Entity]: def next_record(self) -> Iterable[Entity]:
""" """

View File

@ -66,7 +66,7 @@ class TopologyNode(BaseModel):
children: Optional[List[str]] = None # nodes to call execute next children: Optional[List[str]] = None # nodes to call execute next
post_process: Optional[ post_process: Optional[
str List[str]
] = None # Method to be run after the node has been fully processed ] = None # Method to be run after the node has been fully processed

View File

@ -52,6 +52,7 @@ from metadata.ingestion.source.database.database_service import (
) )
from metadata.ingestion.source.database.sql_column_handler import SqlColumnHandlerMixin from metadata.ingestion.source.database.sql_column_handler import SqlColumnHandlerMixin
from metadata.ingestion.source.database.sqlalchemy_source import SqlAlchemySource from metadata.ingestion.source.database.sqlalchemy_source import SqlAlchemySource
from metadata.utils import fqn
from metadata.utils.connections import get_connection, test_connection from metadata.utils.connections import get_connection, test_connection
from metadata.utils.filters import filter_by_schema, filter_by_table from metadata.utils.filters import filter_by_schema, filter_by_table
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
@ -90,6 +91,7 @@ class CommonDbSourceService(
self.data_models = {} self.data_models = {}
self.table_constraints = None self.table_constraints = None
self.database_source_state = set() self.database_source_state = set()
self.context.table_views = []
super().__init__() super().__init__()
def set_inspector(self, database_name: str) -> None: def set_inspector(self, database_name: str) -> None:
@ -276,6 +278,13 @@ class CommonDbSourceService(
inspector=self.inspector, inspector=self.inspector,
) )
view_definition = self.get_view_definition(
table_type=table_type,
table_name=table_name,
schema_name=schema_name,
inspector=self.inspector,
)
table_request = CreateTableRequest( table_request = CreateTableRequest(
name=table_name, name=table_name,
tableType=table_type, tableType=table_type,
@ -285,12 +294,7 @@ class CommonDbSourceService(
inspector=self.inspector, inspector=self.inspector,
), ),
columns=columns, columns=columns,
viewDefinition=self.get_view_definition( viewDefinition=view_definition,
table_type=table_type,
table_name=table_name,
schema_name=schema_name,
inspector=self.inspector,
),
tableConstraints=table_constraints if table_constraints else None, tableConstraints=table_constraints if table_constraints else None,
databaseSchema=EntityReference( databaseSchema=EntityReference(
id=self.context.database_schema.id, id=self.context.database_schema.id,
@ -301,6 +305,15 @@ class CommonDbSourceService(
), # Pick tags from context info, if any ), # Pick tags from context info, if any
) )
if table_type == TableType.View or view_definition:
table_view = {
"table_name": table_name,
"table_type": table_type,
"schema_name": schema_name,
"db_name": db_name,
}
self.context.table_views.append(table_view)
yield table_request yield table_request
self.register_record(table_request=table_request) self.register_record(table_request=table_request)
@ -312,54 +325,64 @@ class CommonDbSourceService(
"{}.{}".format(self.config.serviceName, table_name) "{}.{}".format(self.config.serviceName, table_name)
) )
def yield_view_lineage( def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
self, table_name_and_type: Tuple[str, str] logger.info(f"Processing Lineage for Views")
) -> Optional[Iterable[AddLineageRequest]]: for view in self.context.table_views:
table_name, table_type = table_name_and_type table_name = view.get("table_name")
table_entity: Table = self.context.table table_type = view.get("table_type")
schema_name = self.context.database_schema.name.__root__ schema_name = view.get("schema_name")
db_name = self.context.database.name.__root__ db_name = view.get("db_name")
view_definition = self.get_view_definition( table_fqn = fqn.build(
table_type=table_type, self.metadata,
table_name=table_name, entity_type=Table,
schema_name=schema_name, service_name=self.context.database_service.name.__root__,
inspector=self.inspector, database_name=db_name,
) schema_name=schema_name,
if table_type != TableType.View or not view_definition: table_name=table_name,
return )
# Prevent sqllineage from modifying the logger config table_entity = self.metadata.get_by_name(
# Disable the DictConfigurator.configure method while importing LineageRunner entity=Table,
configure = DictConfigurator.configure fqn=table_fqn,
DictConfigurator.configure = lambda _: None )
from sqllineage.runner import LineageRunner view_definition = self.get_view_definition(
table_type=table_type,
table_name=table_name,
schema_name=schema_name,
inspector=self.inspector,
)
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner
# Reverting changes after import is done # Reverting changes after import is done
DictConfigurator.configure = configure DictConfigurator.configure = configure
try: try:
result = LineageRunner(view_definition) result = LineageRunner(view_definition)
if result.source_tables and result.target_tables: if result.source_tables and result.target_tables:
yield from get_lineage_by_query( yield from get_lineage_by_query(
self.metadata, self.metadata,
query=view_definition, query=view_definition,
service_name=self.context.database_service.name.__root__, service_name=self.context.database_service.name.__root__,
database_name=db_name, database_name=db_name,
schema_name=schema_name, schema_name=schema_name,
) or [] ) or []
else: else:
yield from get_lineage_via_table_entity( yield from get_lineage_via_table_entity(
self.metadata, self.metadata,
table_entity=table_entity, table_entity=table_entity,
service_name=self.context.database_service.name.__root__, service_name=self.context.database_service.name.__root__,
database_name=db_name, database_name=db_name,
schema_name=schema_name, schema_name=schema_name,
query=view_definition, query=view_definition,
) or [] ) or []
except Exception: except Exception:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.debug(f"Query : {view_definition}") logger.debug(f"Query : {view_definition}")
logger.warning("Could not parse query: Ingesting lineage failed") logger.warning("Could not parse query: Ingesting lineage failed")
def test_connection(self) -> None: def test_connection(self) -> None:
""" """

View File

@ -120,7 +120,7 @@ class DatabaseServiceTopology(ServiceTopology):
), ),
], ],
children=["database"], children=["database"],
post_process="create_dbt_lineage", post_process=["create_dbt_lineage", "yield_view_lineage"],
) )
database = TopologyNode( database = TopologyNode(
producer="get_database_names", producer="get_database_names",
@ -133,7 +133,7 @@ class DatabaseServiceTopology(ServiceTopology):
) )
], ],
children=["databaseSchema"], children=["databaseSchema"],
post_process="mark_tables_as_deleted", post_process=["mark_tables_as_deleted"],
) )
databaseSchema = TopologyNode( databaseSchema = TopologyNode(
producer="get_database_schema_names", producer="get_database_schema_names",
@ -171,13 +171,6 @@ class DatabaseServiceTopology(ServiceTopology):
consumer=["storage_service"], consumer=["storage_service"],
nullable=True, nullable=True,
), ),
NodeStage(
type_=AddLineageRequest,
context="view_lineage",
processor="yield_view_lineage",
ack_sink=False,
nullable=True,
),
NodeStage( NodeStage(
type_=DataModelLink, type_=DataModelLink,
processor="yield_datamodel", processor="yield_datamodel",
@ -317,9 +310,7 @@ class DatabaseServiceSource(DBTMixin, TopologyRunnerMixin, Source, ABC):
yield from self.yield_tag(schema_name) or [] yield from self.yield_tag(schema_name) or []
@abstractmethod @abstractmethod
def yield_view_lineage( def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
self, table_name_and_type: Tuple[str, TableType]
) -> Optional[Iterable[AddLineageRequest]]:
""" """
From topology. From topology.
Parses view definition to get lineage information Parses view definition to get lineage information

View File

@ -322,10 +322,8 @@ class DatalakeSource(DatabaseServiceSource):
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.error(err) logger.error(err)
def yield_view_lineage( def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
self, table_name_and_type: Tuple[str, str] yield from []
) -> Optional[Iterable[AddLineageRequest]]:
pass
def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]: def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]:
pass pass

View File

@ -351,10 +351,8 @@ class DeltalakeSource(DatabaseServiceSource):
return parsed_columns return parsed_columns
def yield_view_lineage( def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
self, table_name_and_type: Tuple[str, str] yield from []
) -> Optional[Iterable[AddLineageRequest]]:
pass
def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]: def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]:
pass pass

View File

@ -189,10 +189,8 @@ class DynamodbSource(DatabaseServiceSource):
"{}.{}".format(self.config.serviceName, table_name) "{}.{}".format(self.config.serviceName, table_name)
) )
def yield_view_lineage( def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
self, table_name_and_type: Tuple[str, str] yield from []
) -> Optional[Iterable[AddLineageRequest]]:
pass
def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]: def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]:
pass pass

View File

@ -341,10 +341,8 @@ class GlueSource(DatabaseServiceSource):
def standardize_table_name(self, schema: str, table: str) -> str: def standardize_table_name(self, schema: str, table: str) -> str:
return table[:128] return table[:128]
def yield_view_lineage( def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
self, table_name_and_type: Tuple[str, str] yield from []
) -> Optional[Iterable[AddLineageRequest]]:
pass
def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]: def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]:
pass pass

View File

@ -219,10 +219,8 @@ class SalesforceSource(DatabaseServiceSource):
return "INT" return "INT"
return "VARCHAR" return "VARCHAR"
def yield_view_lineage( def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]:
self, table_name_and_type: Tuple[str, str] yield from []
) -> Optional[Iterable[AddLineageRequest]]:
pass
def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]: def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndCategory]:
pass pass