Fix#6203: Refactor LineageRunner use (#9073)

* Refactor LineageRunner use

* Address PR comments

* Address pylint errors

* Fix failing test
This commit is contained in:
Nahuel 2022-11-30 16:02:21 +01:00 committed by GitHub
parent 9c8492c7a3
commit 76773e69de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 349 additions and 412 deletions

View File

@ -50,6 +50,7 @@ base_requirements = {
"croniter~=1.3.0", "croniter~=1.3.0",
"requests-aws4auth==1.1.2", "requests-aws4auth==1.1.2",
"pymysql>=1.0.2", "pymysql>=1.0.2",
"cached-property==1.5.2",
} }

View File

@ -14,12 +14,18 @@ Lineage Parser configuration
import traceback import traceback
from collections import defaultdict from collections import defaultdict
from logging.config import DictConfigurator from logging.config import DictConfigurator
from typing import Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from cached_property import cached_property
from sqlparse.sql import Comparison, Identifier, Statement from sqlparse.sql import Comparison, Identifier, Statement
from metadata.generated.schema.type.tableUsageCount import TableColumn, TableColumnJoin from metadata.generated.schema.type.tableUsageCount import TableColumn, TableColumnJoin
from metadata.utils.helpers import find_in_iter, get_formatted_entity_name from metadata.utils.helpers import (
find_in_iter,
get_formatted_entity_name,
insensitive_match,
insensitive_replace,
)
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
# Prevent sqllineage from modifying the logger config # Prevent sqllineage from modifying the logger config
@ -27,7 +33,7 @@ from metadata.utils.logger import ingestion_logger
# pylint: disable=wrong-import-position # pylint: disable=wrong-import-position
configure = DictConfigurator.configure configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None DictConfigurator.configure = lambda _: None
from sqllineage.core import models from sqllineage.core.models import Column, Table
from sqllineage.exceptions import SQLLineageException from sqllineage.exceptions import SQLLineageException
from sqllineage.runner import LineageRunner from sqllineage.runner import LineageRunner
@ -37,70 +43,114 @@ DictConfigurator.configure = configure
logger = ingestion_logger() logger = ingestion_logger()
def get_involved_tables_from_parser( class LineageParser:
parser: LineageRunner, """
) -> Optional[List[models.Table]]: Class that acts like a wrapper for the LineageRunner library usage
"""
parser: LineageRunner
query: str
_clean_query: str
def __init__(self, query: str):
self.query = query
self._clean_query = self.clean_raw_query(query)
self.parser = LineageRunner(query)
@cached_property
def involved_tables(self) -> Optional[List[Table]]:
""" """
Use the LineageRunner parser and combine Use the LineageRunner parser and combine
source and intermediate tables into source and intermediate tables into
a single set. a single set.
:param parser: LineageRunner
:return: List of involved tables :return: List of involved tables
""" """
try: try:
# These are @lazy_property, not properly being picked up by IDEs. Ignore the warning
return list( return list(
set(parser.source_tables) set(self.source_tables)
.union(set(parser.intermediate_tables)) .union(set(self.intermediate_tables))
.union(set(parser.target_tables)) .union(set(self.target_tables))
) )
except SQLLineageException as exc: except SQLLineageException as exc:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.warning( logger.warning(
f"Cannot extract source table information from query [{parser._sql}]: {exc}" # pylint: disable=protected-access f"Cannot extract source table information from query [{self.query}]: {exc}"
) )
return None return None
@cached_property
def intermediate_tables(self) -> List[Table]:
"""
Get a list of intermediate tables
"""
# These are @lazy_property, not properly being picked up by IDEs. Ignore the warning
return self.retrieve_tables(self.parser.intermediate_tables)
def get_clean_parser_table_list(tables: List[models.Table]) -> List[str]: @cached_property
def source_tables(self) -> List[Table]:
"""
Get a list of source tables
"""
# These are @lazy_property, not properly being picked up by IDEs. Ignore the warning
return self.retrieve_tables(self.parser.source_tables)
@cached_property
def target_tables(self) -> List[Table]:
"""
Get a list of target tables
"""
# These are @lazy_property, not properly being picked up by IDEs. Ignore the warning
return self.retrieve_tables(self.parser.target_tables)
@cached_property
def column_lineage(self) -> List[Tuple[Column, Column]]:
"""
Get a list of tuples of column lineage
"""
return self.parser.get_column_lineage()
@cached_property
def clean_table_list(self) -> List[str]:
""" """
Clean the table name if it has <default>. Clean the table name if it has <default>.
:param tables: involved tables
:return: clean table names :return: clean table names
""" """
return [get_formatted_entity_name(str(table)) for table in tables] return [get_formatted_entity_name(str(table)) for table in self.involved_tables]
@cached_property
def get_parser_table_aliases(tables: List[models.Table]) -> Dict[str, str]: def table_aliases(self) -> Dict[str, str]:
""" """
Prepare a dictionary in the shape of {alias: table_name} from Prepare a dictionary in the shape of {alias: table_name} from
the parser tables the parser tables
:param tables: parser tables
:return: alias dict :return: alias dict
""" """
return {table.alias: str(table).replace("<default>.", "") for table in tables} return {
table.alias: str(table).replace("<default>.", "")
for table in self.involved_tables
}
def get_table_name_from_list(
def get_table_name_from_list( self,
database_name: Optional[str], database_name: Optional[str],
schema_name: Optional[str], schema_name: Optional[str],
table_name: str, table_name: str,
tables: List[str], ) -> Optional[str]:
) -> Optional[str]:
""" """
Find the table name (in any format in my come) Find the table name (in any format in my come)
from the list using the given ingredients. from the list using the given ingredients.
:param database_name: db name :param database_name: db name
:param schema_name: schema name :param schema_name: schema name
:param table_name: table name :param table_name: table name
:param tables: Contains all involved tables
:return: table name from parser info :return: table name from parser info
""" """
tables = self.clean_table_list
table = find_in_iter(element=table_name, container=tables) table = find_in_iter(element=table_name, container=tables)
if table: if table:
return table return table
schema_table = find_in_iter(element=f"{schema_name}.{table_name}", container=tables) schema_table = find_in_iter(
element=f"{schema_name}.{table_name}", container=tables
)
if schema_table: if schema_table:
return schema_table return schema_table
@ -113,35 +163,34 @@ def get_table_name_from_list(
logger.debug(f"Cannot find table {db_schema_table} in involved tables") logger.debug(f"Cannot find table {db_schema_table} in involved tables")
return None return None
def get_comparison_elements(
def get_comparison_elements( self, identifier: Identifier
identifier: Identifier, tables: List[str], aliases: Dict[str, str] ) -> Tuple[Optional[str], Optional[str]]:
) -> Optional[Tuple[str, str]]:
""" """
Return the tuple table_name, column_name from each comparison element Return the tuple table_name, column_name from each comparison element
:param identifier: comparison identifier :param identifier: comparison identifier
:param tables: involved tables
:param aliases: table aliases
:return: table name and column name from the identifier :return: table name and column name from the identifier
""" """
aliases = self.table_aliases
values = identifier.value.split(".") values = identifier.value.split(".")
database_name, schema_name, table_or_alias, column_name = ( database_name, schema_name, table_or_alias, column_name = (
[None] * (4 - len(values)) [None] * (4 - len(values))
) + values ) + values
if not table_or_alias or not column_name: if not table_or_alias or not column_name:
logger.debug(f"Cannot obtain comparison elements from identifier {identifier}") logger.debug(
f"Cannot obtain comparison elements from identifier {identifier}"
)
return None, None return None, None
alias_to_table = aliases.get(table_or_alias) alias_to_table = aliases.get(table_or_alias)
if alias_to_table: if alias_to_table:
return alias_to_table, column_name return alias_to_table, column_name
table_from_list = get_table_name_from_list( table_from_list = self.get_table_name_from_list(
database_name=database_name, database_name=database_name,
schema_name=schema_name, schema_name=schema_name,
table_name=table_or_alias, table_name=table_or_alias,
tables=tables,
) )
if not table_from_list: if not table_from_list:
@ -150,12 +199,12 @@ def get_comparison_elements(
return table_from_list, column_name return table_from_list, column_name
@staticmethod
def stateful_add_table_joins( def stateful_add_table_joins(
statement_joins: Dict[str, List[TableColumnJoin]], statement_joins: Dict[str, List[TableColumnJoin]],
source: TableColumn, source: TableColumn,
target: TableColumn, target: TableColumn,
) -> None: ) -> None:
""" """
Update the statement_joins dict with the new table information Update the statement_joins dict with the new table information
:param statement_joins: dict with state info :param statement_joins: dict with state info
@ -173,7 +222,9 @@ def stateful_add_table_joins(
table_columns = [ table_columns = [
join_info.tableColumn for join_info in statement_joins[source.table] join_info.tableColumn for join_info in statement_joins[source.table]
] ]
existing_table_column = find_in_iter(element=source, container=table_columns) existing_table_column = find_in_iter(
element=source, container=table_columns
)
if existing_table_column: if existing_table_column:
existing_join_info = [ existing_join_info = [
join_info join_info
@ -187,19 +238,15 @@ def stateful_add_table_joins(
TableColumnJoin(tableColumn=source, joinedWith=[target]) TableColumnJoin(tableColumn=source, joinedWith=[target])
) )
def stateful_add_joins_from_statement(
def stateful_add_joins_from_statement( self,
join_data: Dict[str, List[TableColumnJoin]], join_data: Dict[str, List[TableColumnJoin]],
statement: Statement, statement: Statement,
tables: List[str], ) -> None:
aliases: Dict[str, str],
) -> None:
""" """
Parse a single statement to pick up join information Parse a single statement to pick up join information
:param join_data: join data from previous statements :param join_data: join data from previous statements
:param statement: Parsed sql statement to process :param statement: Parsed sql statement to process
:param tables: involved tables in the query
:param aliases: table aliases dict
:return: for each table name, list all joins against other tables :return: for each table name, list all joins against other tables
""" """
# Here we want to get tokens such as `tableA.col1 = tableB.col2` # Here we want to get tokens such as `tableA.col1 = tableB.col2`
@ -211,11 +258,11 @@ def stateful_add_joins_from_statement(
logger.debug(f"Ignoring comparison {comparison}") logger.debug(f"Ignoring comparison {comparison}")
continue continue
table_left, column_left = get_comparison_elements( table_left, column_left = self.get_comparison_elements(
identifier=comparison.left, tables=tables, aliases=aliases identifier=comparison.left
) )
table_right, column_right = get_comparison_elements( table_right, column_right = self.get_comparison_elements(
identifier=comparison.right, tables=tables, aliases=aliases identifier=comparison.right
) )
if not table_left or not table_right: if not table_left or not table_right:
@ -227,24 +274,58 @@ def stateful_add_joins_from_statement(
# We just send the info once, from Left -> Right. # We just send the info once, from Left -> Right.
# The backend will prepare the symmetric information. # The backend will prepare the symmetric information.
stateful_add_table_joins(join_data, left_table_column, right_table_column) self.stateful_add_table_joins(
join_data, left_table_column, right_table_column
)
@cached_property
def get_table_joins( def table_joins(self) -> Dict[str, List[TableColumnJoin]]:
parser: LineageRunner, tables: List[str], aliases: Dict[str, str]
) -> Dict[str, List[TableColumnJoin]]:
""" """
For each table involved in the query, find its joins against any For each table involved in the query, find its joins against any
other table. other table.
:param parser: LineageRunner parser
:param tables: involved tables in the query
:param aliases: table aliases dict
:return: for each table name, list all joins against other tables :return: for each table name, list all joins against other tables
""" """
join_data = defaultdict(list) join_data = defaultdict(list)
for statement in parser.statements_parsed: # These are @lazy_property, not properly being picked up by IDEs. Ignore the warning
stateful_add_joins_from_statement( for statement in self.parser.statements_parsed:
join_data, statement=statement, tables=tables, aliases=aliases self.stateful_add_joins_from_statement(join_data, statement=statement)
)
return join_data return join_data
def retrieve_tables(self, tables: List[Any]) -> List[Table]:
if not self._clean_query:
return []
return [table for table in tables if isinstance(table, Table)]
@classmethod
def clean_raw_query(cls, raw_query: str) -> Optional[str]:
"""
Given a raw query from any input (e.g., view definition,
query from logs, etc.), perform a cleaning step
before passing it to the LineageRunner
"""
clean_query = insensitive_replace(
raw_str=raw_query,
to_replace=" copy grants ", # snowflake specific
replace_by=" ", # remove it as it does not add any value to lineage
)
clean_query = insensitive_replace(
raw_str=clean_query.strip(),
to_replace="\n", # remove line breaks
replace_by=" ",
)
if insensitive_match(clean_query, ".*merge into .*when matched.*"):
clean_query = insensitive_replace(
raw_str=clean_query,
to_replace="when matched.*", # merge into queries specific
replace_by="", # remove it as LineageRunner is not able to perform the lineage
)
# We remove queries of the type 'COPY table FROM path' since they are data manipulation statement and do not
# provide value for user
if insensitive_match(clean_query, "^COPY.*"):
return None
return clean_query.strip()

View File

@ -12,7 +12,6 @@
Helper functions to handle SQL lineage operations Helper functions to handle SQL lineage operations
""" """
import traceback import traceback
from logging.config import DictConfigurator
from typing import Any, Iterable, Iterator, List, Optional from typing import Any, Iterable, Iterator, List, Optional
from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest from metadata.generated.schema.api.lineage.addLineage import AddLineageRequest
@ -23,64 +22,16 @@ from metadata.generated.schema.type.entityLineage import (
LineageDetails, LineageDetails,
) )
from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.lineage.parser import LineageParser
from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.utils import fqn from metadata.utils import fqn
from metadata.utils.helpers import insensitive_match, insensitive_replace
from metadata.utils.logger import utils_logger from metadata.utils.logger import utils_logger
from metadata.utils.lru_cache import LRUCache from metadata.utils.lru_cache import LRUCache
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner # pylint: disable=wrong-import-position
# Reverting changes after import is done
DictConfigurator.configure = configure
logger = utils_logger() logger = utils_logger()
LRU_CACHE_SIZE = 4096 LRU_CACHE_SIZE = 4096
def clean_raw_query(raw_query: str) -> str:
"""
Given a raw query from any input (e.g., view definition,
query from logs, etc.), perform a cleaning step
before passing it to the LineageRunner
"""
clean_query = insensitive_replace(
raw_str=raw_query,
to_replace=" copy grants ", # snowflake specific
replace_by=" ", # remove it as it does not add any value to lineage
)
clean_query = insensitive_replace(
raw_str=clean_query.strip(),
to_replace="\n", # remove line breaks
replace_by=" ",
)
if insensitive_match(clean_query, ".*merge into .*when matched.*"):
clean_query = insensitive_replace(
raw_str=clean_query,
to_replace="when matched.*", # merge into queries specific
replace_by="", # remove it as LineageRunner is not able to perform the lineage
)
return clean_query.strip()
def split_raw_table_name(database: str, raw_name: str) -> dict:
database_schema = None
if "." in raw_name:
# pylint: disable=unbalanced-tuple-unpacking
database_schema, table = fqn.split(raw_name)[-2:]
# pylint: enable=unbalanced-tuple-unpacking
if database_schema == "<default>":
database_schema = None
return {"database": database, "database_schema": database_schema, "table": table}
def get_column_fqn(table_entity: Table, column: str) -> Optional[str]: def get_column_fqn(table_entity: Table, column: str) -> Optional[str]:
""" """
Get fqn of column if exist in table entity Get fqn of column if exist in table entity
@ -378,13 +329,13 @@ def get_lineage_by_query(
try: try:
logger.debug(f"Running lineage with query: {query}") logger.debug(f"Running lineage with query: {query}")
result = LineageRunner(clean_raw_query(query)) lineage_parser = LineageParser(query)
raw_column_lineage = result.get_column_lineage() raw_column_lineage = lineage_parser.column_lineage
column_lineage.update(populate_column_lineage_map(raw_column_lineage)) column_lineage.update(populate_column_lineage_map(raw_column_lineage))
for intermediate_table in result.intermediate_tables: for intermediate_table in lineage_parser.intermediate_tables:
for source_table in result.source_tables: for source_table in lineage_parser.source_tables:
yield from _create_lineage_by_table_name( yield from _create_lineage_by_table_name(
metadata, metadata,
from_table=str(source_table), from_table=str(source_table),
@ -395,7 +346,7 @@ def get_lineage_by_query(
query=query, query=query,
column_lineage_map=column_lineage, column_lineage_map=column_lineage,
) )
for target_table in result.target_tables: for target_table in lineage_parser.target_tables:
yield from _create_lineage_by_table_name( yield from _create_lineage_by_table_name(
metadata, metadata,
from_table=str(intermediate_table), from_table=str(intermediate_table),
@ -406,9 +357,9 @@ def get_lineage_by_query(
query=query, query=query,
column_lineage_map=column_lineage, column_lineage_map=column_lineage,
) )
if not result.intermediate_tables: if not lineage_parser.intermediate_tables:
for target_table in result.target_tables: for target_table in lineage_parser.target_tables:
for source_table in result.source_tables: for source_table in lineage_parser.source_tables:
yield from _create_lineage_by_table_name( yield from _create_lineage_by_table_name(
metadata, metadata,
from_table=str(source_table), from_table=str(source_table),
@ -452,10 +403,10 @@ def get_lineage_via_table_entity(
try: try:
logger.debug(f"Getting lineage via table entity using query: {query}") logger.debug(f"Getting lineage via table entity using query: {query}")
parser = LineageRunner(clean_raw_query(query)) lineage_parser = LineageParser(query)
to_table_name = table_entity.name.__root__ to_table_name = table_entity.name.__root__
for from_table_name in parser.source_tables: for from_table_name in lineage_parser.source_tables:
yield from _create_lineage_by_table_name( yield from _create_lineage_by_table_name(
metadata, metadata,
from_table=str(from_table_name), from_table=str(from_table_name),

View File

@ -14,7 +14,6 @@ Query parser implementation
import datetime import datetime
import traceback import traceback
from logging.config import DictConfigurator
from typing import Optional from typing import Optional
from metadata.config.common import ConfigModel from metadata.config.common import ConfigModel
@ -24,22 +23,9 @@ from metadata.generated.schema.entity.services.connections.metadata.openMetadata
from metadata.generated.schema.type.queryParserData import ParsedData, QueryParserData from metadata.generated.schema.type.queryParserData import ParsedData, QueryParserData
from metadata.generated.schema.type.tableQuery import TableQueries, TableQuery from metadata.generated.schema.type.tableQuery import TableQueries, TableQuery
from metadata.ingestion.api.processor import Processor, ProcessorStatus from metadata.ingestion.api.processor import Processor, ProcessorStatus
from metadata.ingestion.lineage.parser import ( from metadata.ingestion.lineage.parser import LineageParser
get_clean_parser_table_list,
get_involved_tables_from_parser,
get_parser_table_aliases,
get_table_joins,
)
from metadata.ingestion.lineage.sql_lineage import clean_raw_query
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner # pylint: disable=wrong-import-position
# Reverting changes after import is done
DictConfigurator.configure = configure
logger = ingestion_logger() logger = ingestion_logger()
@ -58,19 +44,14 @@ def parse_sql_statement(record: TableQuery) -> Optional[ParsedData]:
str(record.analysisDate), "%Y-%m-%d %H:%M:%S" str(record.analysisDate), "%Y-%m-%d %H:%M:%S"
).date() ).date()
parser = LineageRunner(clean_raw_query(record.query)) lineage_parser = LineageParser(record.query)
tables = get_involved_tables_from_parser(parser) if not lineage_parser.involved_tables:
if not tables:
return None return None
clean_tables = get_clean_parser_table_list(tables)
aliases = get_parser_table_aliases(tables)
return ParsedData( return ParsedData(
tables=clean_tables, tables=lineage_parser.clean_table_list,
joins=get_table_joins(parser=parser, tables=clean_tables, aliases=aliases), joins=lineage_parser.table_joins,
databaseName=record.databaseName, databaseName=record.databaseName,
databaseSchema=record.databaseSchema, databaseSchema=record.databaseSchema,
sql=record.query, sql=record.query,

View File

@ -11,7 +11,6 @@
"""Metabase source module""" """Metabase source module"""
import traceback import traceback
from logging.config import DictConfigurator
from typing import Iterable, List, Optional from typing import Iterable, List, Optional
import requests import requests
@ -34,10 +33,8 @@ from metadata.generated.schema.metadataIngestion.workflow import (
) )
from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.api.source import InvalidSourceException from metadata.ingestion.api.source import InvalidSourceException
from metadata.ingestion.lineage.sql_lineage import ( from metadata.ingestion.lineage.parser import LineageParser
clean_raw_query, from metadata.ingestion.lineage.sql_lineage import search_table_entities
search_table_entities,
)
from metadata.ingestion.source.dashboard.dashboard_service import DashboardServiceSource from metadata.ingestion.source.dashboard.dashboard_service import DashboardServiceSource
from metadata.ingestion.source.database.common_db_source import SQLSourceStatus from metadata.ingestion.source.database.common_db_source import SQLSourceStatus
from metadata.utils import fqn from metadata.utils import fqn
@ -45,12 +42,6 @@ from metadata.utils.filters import filter_by_chart
from metadata.utils.helpers import get_standard_chart_type, replace_special_with from metadata.utils.helpers import get_standard_chart_type, replace_special_with
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner # pylint: disable=C0413
HEADERS = {"Content-Type": "application/json", "Accept": "*/*"} HEADERS = {"Content-Type": "application/json", "Accept": "*/*"}
logger = ingestion_logger() logger = ingestion_logger()
@ -241,10 +232,9 @@ class MetabaseSource(DashboardServiceSource):
resp_database = self.req_get(f"/api/database/{chart_details['database_id']}") resp_database = self.req_get(f"/api/database/{chart_details['database_id']}")
if resp_database.status_code == 200: if resp_database.status_code == 200:
database = resp_database.json() database = resp_database.json()
query = chart_details["dataset_query"]["native"]["query"] query = chart_details["dataset_query"]["native"]["query"]
table_list = LineageRunner(clean_raw_query(query)) lineage_parser = LineageParser(query)
for table in table_list.source_tables: for table in lineage_parser.source_tables:
database_schema_name, table = fqn.split(str(table))[-2:] database_schema_name, table = fqn.split(str(table))[-2:]
database_schema_name = ( database_schema_name = (
None None

View File

@ -11,7 +11,6 @@
"""Mode source module""" """Mode source module"""
import traceback import traceback
from logging.config import DictConfigurator
from typing import Iterable, List, Optional from typing import Iterable, List, Optional
from metadata.clients import mode_client from metadata.clients import mode_client
@ -33,24 +32,13 @@ from metadata.generated.schema.metadataIngestion.workflow import (
) )
from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.api.source import InvalidSourceException from metadata.ingestion.api.source import InvalidSourceException
from metadata.ingestion.lineage.sql_lineage import ( from metadata.ingestion.lineage.parser import LineageParser
clean_raw_query, from metadata.ingestion.lineage.sql_lineage import search_table_entities
search_table_entities,
)
from metadata.ingestion.source.dashboard.dashboard_service import DashboardServiceSource from metadata.ingestion.source.dashboard.dashboard_service import DashboardServiceSource
from metadata.utils import fqn from metadata.utils import fqn
from metadata.utils.filters import filter_by_chart from metadata.utils.filters import filter_by_chart
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner # pylint: disable=C0413
# Reverting changes after import is done
DictConfigurator.configure = configure
logger = ingestion_logger() logger = ingestion_logger()
@ -140,8 +128,8 @@ class ModeSource(DashboardServiceSource):
data_source = self.data_sources.get(query.get("data_source_id")) data_source = self.data_sources.get(query.get("data_source_id"))
if not data_source: if not data_source:
continue continue
table_list = LineageRunner(clean_raw_query(query.get("raw_query"))) lineage_parser = LineageParser(query.get("raw_query"))
for table in table_list.source_tables: for table in lineage_parser.source_tables:
database_schema_name, table = fqn.split(str(table))[-2:] database_schema_name, table = fqn.split(str(table))[-2:]
database_schema_name = ( database_schema_name = (
None None

View File

@ -12,7 +12,6 @@
Redash source module Redash source module
""" """
import traceback import traceback
from logging.config import DictConfigurator
from typing import Iterable, List, Optional from typing import Iterable, List, Optional
from metadata.generated.schema.api.data.createChart import CreateChartRequest from metadata.generated.schema.api.data.createChart import CreateChartRequest
@ -33,23 +32,13 @@ from metadata.generated.schema.metadataIngestion.workflow import (
) )
from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.api.source import InvalidSourceException from metadata.ingestion.api.source import InvalidSourceException
from metadata.ingestion.lineage.sql_lineage import clean_raw_query from metadata.ingestion.lineage.parser import LineageParser
from metadata.ingestion.source.dashboard.dashboard_service import DashboardServiceSource from metadata.ingestion.source.dashboard.dashboard_service import DashboardServiceSource
from metadata.utils import fqn from metadata.utils import fqn
from metadata.utils.filters import filter_by_chart from metadata.utils.filters import filter_by_chart
from metadata.utils.helpers import get_standard_chart_type from metadata.utils.helpers import get_standard_chart_type
from metadata.utils.logger import ingestion_logger from metadata.utils.logger import ingestion_logger
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner # pylint: disable=C0413
# Reverting changes after import is done
DictConfigurator.configure = configure
logger = ingestion_logger() logger = ingestion_logger()
@ -142,10 +131,8 @@ class RedashSource(DashboardServiceSource):
if not visualization: if not visualization:
continue continue
if visualization.get("query", {}).get("query"): if visualization.get("query", {}).get("query"):
parser = LineageRunner( lineage_parser = LineageParser(visualization["query"]["query"])
clean_raw_query(visualization["query"]["query"]) for table in lineage_parser.source_tables:
)
for table in parser.source_tables:
table_name = str(table) table_name = str(table)
database_schema_table = fqn.split_table_name(table_name) database_schema_table = fqn.split_table_name(table_name)
from_fqn = fqn.build( from_fqn = fqn.build(

View File

@ -124,7 +124,8 @@ class TableauSource(DashboardServiceSource):
"\nSomething went wrong while connecting to Tableau Metadata APIs\n" "\nSomething went wrong while connecting to Tableau Metadata APIs\n"
"Please check if the Tableau Metadata APIs are enabled for you Tableau instance\n" "Please check if the Tableau Metadata APIs are enabled for you Tableau instance\n"
"For more information on enabling the Tableau Metadata APIs follow the link below\n" "For more information on enabling the Tableau Metadata APIs follow the link below\n"
"https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html#enable-the-tableau-metadata-api-for-tableau-server\n" # pylint: disable=line-too-long "https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html"
"#enable-the-tableau-metadata-api-for-tableau-server\n"
) )
return super().prepare() return super().prepare()

View File

@ -14,7 +14,6 @@ Generic source to build SQL connectors.
import traceback import traceback
from abc import ABC from abc import ABC
from copy import deepcopy from copy import deepcopy
from logging.config import DictConfigurator
from typing import Iterable, Optional, Tuple from typing import Iterable, Optional, Tuple
from sqlalchemy.engine import Connection from sqlalchemy.engine import Connection
@ -40,8 +39,8 @@ from metadata.generated.schema.metadataIngestion.workflow import (
Source as WorkflowSource, Source as WorkflowSource,
) )
from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.lineage.parser import LineageParser
from metadata.ingestion.lineage.sql_lineage import ( from metadata.ingestion.lineage.sql_lineage import (
clean_raw_query,
get_lineage_by_query, get_lineage_by_query,
get_lineage_via_table_entity, get_lineage_via_table_entity,
) )
@ -399,20 +398,10 @@ class CommonDbSourceService(
schema_name=schema_name, schema_name=schema_name,
inspector=self.inspector, inspector=self.inspector,
) )
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import ( # pylint: disable=import-outside-toplevel
LineageRunner,
)
# Reverting changes after import is done
DictConfigurator.configure = configure
try: try:
result = LineageRunner(clean_raw_query(view_definition)) lineage_parser = LineageParser(view_definition)
if result.source_tables and result.target_tables: if lineage_parser.source_tables and lineage_parser.target_tables:
yield from get_lineage_by_query( yield from get_lineage_by_query(
self.metadata, self.metadata,
query=view_definition, query=view_definition,

View File

@ -421,8 +421,7 @@ class ProfilerWorkflow(WorkflowStatusMixin):
raise ValueError( raise ValueError(
"databaseFilterPattern returned 0 result. At least 1 database must be returned by the filter pattern." "databaseFilterPattern returned 0 result. At least 1 database must be returned by the filter pattern."
f"\n\t- includes: {self.source_config.databaseFilterPattern.includes if self.source_config.databaseFilterPattern else None}" # pylint: disable=line-too-long f"\n\t- includes: {self.source_config.databaseFilterPattern.includes if self.source_config.databaseFilterPattern else None}" # pylint: disable=line-too-long
f"\n\t- excludes: {self.source_config.databaseFilterPattern.excludes if self.source_config.databaseFilterPattern else None}" f"\n\t- excludes: {self.source_config.databaseFilterPattern.excludes if self.source_config.databaseFilterPattern else None}" # pylint: disable=line-too-long
# pylint: disable=line-too-long
) )
for database in databases: for database in databases:

View File

@ -13,26 +13,10 @@
Validate query parser logic Validate query parser logic
""" """
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
from logging.config import DictConfigurator
from unittest import TestCase from unittest import TestCase
from metadata.generated.schema.type.tableUsageCount import TableColumn, TableColumnJoin from metadata.generated.schema.type.tableUsageCount import TableColumn, TableColumnJoin
from metadata.ingestion.lineage.parser import ( from metadata.ingestion.lineage.parser import LineageParser
get_clean_parser_table_list,
get_involved_tables_from_parser,
get_parser_table_aliases,
get_table_joins,
)
from metadata.ingestion.lineage.sql_lineage import clean_raw_query
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
from sqllineage.runner import LineageRunner
# Reverting changes after import is done
DictConfigurator.configure = configure
class QueryParserTests(TestCase): class QueryParserTests(TestCase):
@ -58,22 +42,20 @@ class QueryParserTests(TestCase):
WHERE a.col3 = 'abc' WHERE a.col3 = 'abc'
""" """
parser = LineageRunner(col_lineage) parser = LineageParser(col_lineage)
def test_involved_tables(self): def test_involved_tables(self):
tables = {str(table) for table in get_involved_tables_from_parser(self.parser)} tables = {str(table) for table in self.parser.involved_tables}
self.assertEqual( self.assertEqual(
tables, {"db.grault", "db.holis", "<default>.foo", "db.random"} tables, {"db.grault", "db.holis", "<default>.foo", "db.random"}
) )
def test_clean_parser_table_list(self): def test_clean_parser_table_list(self):
tables = get_involved_tables_from_parser(self.parser) clean_tables = set(self.parser.clean_table_list)
clean_tables = set(get_clean_parser_table_list(tables))
self.assertEqual(clean_tables, {"db.grault", "db.holis", "foo", "db.random"}) self.assertEqual(clean_tables, {"db.grault", "db.holis", "foo", "db.random"})
def test_parser_table_aliases(self): def test_parser_table_aliases(self):
tables = get_involved_tables_from_parser(self.parser) aliases = self.parser.table_aliases
aliases = get_parser_table_aliases(tables)
self.assertEqual( self.assertEqual(
aliases, {"b": "db.grault", "c": "db.holis", "a": "foo", "d": "db.random"} aliases, {"b": "db.grault", "c": "db.holis", "a": "foo", "d": "db.random"}
) )
@ -82,14 +64,7 @@ class QueryParserTests(TestCase):
""" """
main logic point main logic point
""" """
tables = get_involved_tables_from_parser(self.parser) joins = self.parser.table_joins
clean_tables = get_clean_parser_table_list(tables)
aliases = get_parser_table_aliases(tables)
joins = get_table_joins(
parser=self.parser, tables=clean_tables, aliases=aliases
)
self.assertEqual( self.assertEqual(
joins["foo"], joins["foo"],
@ -126,14 +101,9 @@ class QueryParserTests(TestCase):
; ;
""" """
parser = LineageRunner(query) parser = LineageParser(query)
tables = get_involved_tables_from_parser(parser) joins = parser.table_joins
clean_tables = get_clean_parser_table_list(tables)
aliases = get_parser_table_aliases(tables)
joins = get_table_joins(parser=parser, tables=clean_tables, aliases=aliases)
self.assertEqual( self.assertEqual(
joins["testdb.public.users"], joins["testdb.public.users"],
@ -153,23 +123,33 @@ class QueryParserTests(TestCase):
def test_clean_raw_query_copy_grants(self): def test_clean_raw_query_copy_grants(self):
""" """
Validate query cleaning logic Validate COPY GRANT query cleaning logic
""" """
query = "create or replace view my_view copy grants as select * from my_table" query = "create or replace view my_view copy grants as select * from my_table"
self.assertEqual( self.assertEqual(
clean_raw_query(query), LineageParser.clean_raw_query(query),
"create or replace view my_view as select * from my_table", "create or replace view my_view as select * from my_table",
) )
def test_clean_raw_query_merge_into(self): def test_clean_raw_query_merge_into(self):
""" """
Validate query cleaning logic Validate MERGE INTO query cleaning logic
""" """
query = """ query = """
/* comment */ merge into table_1 using (select a, b from table_2) when matched update set t.a = 'value' /* comment */ merge into table_1 using (select a, b from table_2) when matched update set t.a = 'value'
when not matched then insert (table_1.a, table_2.b) values ('value1', 'value2') when not matched then insert (table_1.a, table_2.b) values ('value1', 'value2')
""" """
self.assertEqual( self.assertEqual(
clean_raw_query(query), LineageParser.clean_raw_query(query),
"/* comment */ merge into table_1 using (select a, b from table_2)", "/* comment */ merge into table_1 using (select a, b from table_2)",
) )
def test_clean_raw_query_copy_from(self):
"""
Validate COPY FROM query cleaning logic
"""
query = "COPY my_schema.my_table FROM 's3://bucket/path/object.csv';"
self.assertEqual(
LineageParser.clean_raw_query(query),
None,
)

View File

@ -12,22 +12,11 @@
""" """
sql lineage utils tests sql lineage utils tests
""" """
from logging.config import DictConfigurator
from unittest import TestCase from unittest import TestCase
from sqllineage.runner import LineageRunner from metadata.ingestion.lineage.parser import LineageParser
from metadata.ingestion.lineage.sql_lineage import populate_column_lineage_map from metadata.ingestion.lineage.sql_lineage import populate_column_lineage_map
# Prevent sqllineage from modifying the logger config
# Disable the DictConfigurator.configure method while importing LineageRunner
configure = DictConfigurator.configure
DictConfigurator.configure = lambda _: None
# Reverting changes after import is done
DictConfigurator.configure = configure
QUERY = [ QUERY = [
"CREATE TABLE MYTABLE2 AS SELECT * FROM MYTABLE1;", "CREATE TABLE MYTABLE2 AS SELECT * FROM MYTABLE1;",
"CREATE TABLE MYTABLE3 AS SELECT ID, NAME FROM MYTABLE1", "CREATE TABLE MYTABLE3 AS SELECT ID, NAME FROM MYTABLE1",
@ -50,7 +39,7 @@ class SqlLineageTest(TestCase):
def test_populate_column_lineage_map(self): def test_populate_column_lineage_map(self):
for i in range(len(QUERY)): for i in range(len(QUERY)):
result = LineageRunner(QUERY[i]) lineage_parser = LineageParser(QUERY[i])
raw_column_lineage = result.get_column_lineage() raw_column_lineage = lineage_parser.column_lineage
lineage_map = populate_column_lineage_map(raw_column_lineage) lineage_map = populate_column_lineage_map(raw_column_lineage)
self.assertEqual(lineage_map, EXPECTED_LINEAGE_MAP[i]) self.assertEqual(lineage_map, EXPECTED_LINEAGE_MAP[i])