fix(ingest): bigquery-beta - Additional fixes for Bigquery beta (#6051)

This commit is contained in:
Tamas Nemeth 2022-09-26 23:08:09 +02:00 committed by GitHub
parent 60928757e0
commit 32b8bef92e
11 changed files with 128 additions and 70 deletions

View File

@ -150,7 +150,7 @@ redshift_common = {
"sqlalchemy-redshift",
"psycopg2-binary",
"GeoAlchemy2",
"sqllineage==1.3.5",
"sqllineage==1.3.6",
*path_spec_common,
}
@ -216,18 +216,18 @@ plugins: Dict[str, Set[str]] = {
"gql>=3.3.0",
"gql[requests]>=3.3.0",
},
"great-expectations": sql_common | {"sqllineage==1.3.5"},
"great-expectations": sql_common | {"sqllineage==1.3.6"},
# Source plugins
# PyAthena is pinned with exact version because we use private method in PyAthena
"athena": sql_common | {"PyAthena[SQLAlchemy]==2.4.1"},
"azure-ad": set(),
"bigquery": sql_common
| bigquery_common
| {"sqlalchemy-bigquery>=1.4.1", "sqllineage==1.3.5", "sqlparse"},
| {"sqlalchemy-bigquery>=1.4.1", "sqllineage==1.3.6", "sqlparse"},
"bigquery-usage": bigquery_common | usage_common | {"cachetools"},
"bigquery-beta": sql_common
| bigquery_common
| {"sqllineage==1.3.5", "sql_metadata"},
| {"sqllineage==1.3.6", "sql_metadata"},
"clickhouse": sql_common | {"clickhouse-sqlalchemy==0.1.8"},
"clickhouse-usage": sql_common
| usage_common
@ -269,9 +269,9 @@ plugins: Dict[str, Set[str]] = {
"looker": looker_common,
# lkml>=1.1.2 is required to support the sql_preamble expression in LookML
"lookml": looker_common
| {"lkml>=1.1.2", "sql-metadata==2.2.2", "sqllineage==1.3.5", "GitPython>2"},
"metabase": {"requests", "sqllineage==1.3.5"},
"mode": {"requests", "sqllineage==1.3.5", "tenacity>=8.0.1"},
| {"lkml>=1.1.2", "sql-metadata==2.2.2", "sqllineage==1.3.6", "GitPython>2"},
"metabase": {"requests", "sqllineage==1.3.6"},
"mode": {"requests", "sqllineage==1.3.6", "tenacity>=8.0.1"},
"mongodb": {"pymongo[srv]>=3.11", "packaging"},
"mssql": sql_common | {"sqlalchemy-pytds>=0.3"},
"mssql-odbc": sql_common | {"pyodbc"},
@ -284,7 +284,7 @@ plugins: Dict[str, Set[str]] = {
"presto-on-hive": sql_common
| {"psycopg2-binary", "acryl-pyhive[hive]>=0.6.12", "pymysql>=1.0.2"},
"pulsar": {"requests"},
"redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.5"},
"redash": {"redash-toolbelt", "sql-metadata", "sqllineage==1.3.6"},
"redshift": sql_common | redshift_common,
"redshift-usage": sql_common | usage_common | redshift_common,
"s3": {*s3_base, *data_lake_profiling},

View File

@ -571,9 +571,10 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
view.columns = self.get_columns_for_table(conn, table_identifier)
lineage_info: Optional[Tuple[UpstreamLineage, Dict[str, str]]] = None
lineage_info = self.lineage_extractor.get_upstream_lineage_info(
table_identifier, self.platform
)
if self.config.include_table_lineage:
lineage_info = self.lineage_extractor.get_upstream_lineage_info(
table_identifier, self.platform
)
view_workunits = self.gen_view_dataset_workunits(
view, project_id, dataset_name, lineage_info
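The view path now honors `include_table_lineage`: `lineage_info` starts as `None` and the lineage extractor is only invoked when the config flag is on, instead of unconditionally. A minimal sketch of that guarded-call shape, with a stub standing in for the real lineage extractor:

```python
from typing import Dict, Optional, Tuple


class StubLineageExtractor:
    """Stand-in for the real BigQuery lineage extractor."""

    def get_upstream_lineage_info(
        self, table: str, platform: str
    ) -> Tuple[str, Dict[str, str]]:
        return (f"upstreams-of-{table}", {"via": "audit-log"})


def lineage_for_view(
    table: str, platform: str, include_table_lineage: bool
) -> Optional[Tuple[str, Dict[str, str]]]:
    # Default to no lineage; only pay the extraction cost when configured to.
    lineage_info: Optional[Tuple[str, Dict[str, str]]] = None
    if include_table_lineage:
        lineage_info = StubLineageExtractor().get_upstream_lineage_info(table, platform)
    return lineage_info


print(lineage_for_view("project.dataset.my_view", "bigquery", include_table_lineage=False))
```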

View File

@ -15,7 +15,7 @@ class BigQueryV2Report(SQLSourceReport):
num_total_lineage_entries: Optional[int] = None
num_skipped_lineage_entries_missing_data: Optional[int] = None
num_skipped_lineage_entries_not_allowed: Optional[int] = None
num_skipped_lineage_entries_sql_parser_failure: Optional[int] = None
num_lineage_entries_sql_parser_failure: Optional[int] = None
num_skipped_lineage_entries_other: Optional[int] = None
num_total_log_entries: Optional[int] = None
num_parsed_log_entires: Optional[int] = None

View File

@ -362,7 +362,7 @@ timestamp < "{end_time}"
self.report.num_skipped_lineage_entries_missing_data = 0
self.report.num_skipped_lineage_entries_not_allowed = 0
self.report.num_skipped_lineage_entries_other = 0
self.report.num_skipped_lineage_entries_sql_parser_failure = 0
self.report.num_lineage_entries_sql_parser_failure = 0
for e in entries:
self.report.num_total_lineage_entries += 1
if e.destinationTable is None or not (
@ -400,10 +400,10 @@ timestamp < "{end_time}"
map(lambda x: x.split(".")[-1], parser.get_tables())
)
except Exception as ex:
logger.warning(
f"Sql Parser failed on query: {e.query}. It will be skipped from lineage. The error was {ex}"
logger.debug(
f"Sql Parser failed on query: {e.query}. It won't cause any issue except table/view lineage can't be detected reliably. The error was {ex}."
)
self.report.num_skipped_lineage_entries_sql_parser_failure += 1
self.report.num_lineage_entries_sql_parser_failure += 1
continue
curr_lineage_str = lineage_map[destination_table_str]
new_lineage_str = set()
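Together with the counter rename in the report, this means a SQL-parser failure is no longer treated as a skipped lineage entry: the failure is logged at debug level, counted, and processing continues with whatever lineage was already resolved. A compact sketch of that count-and-continue pattern, where `parse_tables` is a hypothetical stand-in for the real parser:

```python
import logging
from typing import Dict, List

logger = logging.getLogger(__name__)


def parse_tables(query: str) -> List[str]:
    """Hypothetical stand-in for the SQL parser used during lineage extraction."""
    if query.count("(") != query.count(")"):
        raise ValueError("unbalanced parentheses")
    return [tok.strip(",") for tok in query.split() if "." in tok]


def refine_lineage(lineage_map: Dict[str, List[str]], queries: Dict[str, str]) -> int:
    """Refine each entry's upstreams from its query; count, but don't abort on, parser failures."""
    num_lineage_entries_sql_parser_failure = 0
    for destination, query in queries.items():
        try:
            upstreams = parse_tables(query)
        except Exception as ex:
            logger.debug(f"Sql Parser failed on query: {query}. The error was {ex}.")
            num_lineage_entries_sql_parser_failure += 1
            continue  # keep the coarse lineage already recorded for this destination
        lineage_map[destination] = upstreams
    return num_lineage_entries_sql_parser_failure
```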

View File

@ -13,7 +13,10 @@ from datahub.ingestion.api.common import WorkUnit
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigqueryTable
from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
BigqueryColumn,
BigqueryTable,
)
from datahub.ingestion.source.ge_data_profiler import (
DatahubGEProfiler,
GEProfilerRequest,
@ -78,39 +81,57 @@ class BigqueryProfiler:
partition = table.max_partition_id
if partition:
partition_where_clause: str
logger.debug(f"{table} is partitioned and partition column is {partition}")
try:
(
partition_datetime,
upper_bound_partition_datetime,
) = self.get_partition_range_from_partition_id(
partition, partition_datetime
)
except ValueError as e:
logger.error(
f"Unable to get partition range for partition id: {partition} it failed with exception {e}"
)
self.report.invalid_partition_ids[f"{schema}.{table}"] = partition
return None, None
if table.time_partitioning.type_ in ("DAY", "MONTH", "YEAR"):
partition_where_clause = "{column_name} BETWEEN DATE('{partition_id}') AND DATE('{upper_bound_partition_id}')".format(
column_name=table.time_partitioning.field,
partition_id=partition_datetime,
upper_bound_partition_id=upper_bound_partition_datetime,
)
elif table.time_partitioning.type_ in ("HOUR"):
partition_where_clause = "{column_name} BETWEEN '{partition_id}' AND '{upper_bound_partition_id}'".format(
column_name=table.time_partitioning.field,
partition_id=partition_datetime,
upper_bound_partition_id=upper_bound_partition_datetime,
)
if not table.time_partitioning:
partition_column: Optional[BigqueryColumn] = None
for column in table.columns:
if column.is_partition_column:
partition_column = column
break
if partition_column:
partition_where_clause = f"{partition_column.name} >= {partition}"
else:
logger.warning(
f"Partitioned table {table.name} without partiton column"
)
return None, None
else:
logger.warning(
f"Not supported partition type {table.time_partitioning.type_}"
logger.debug(
f"{table.name} is partitioned and partition column is {partition}"
)
return None, None
try:
(
partition_datetime,
upper_bound_partition_datetime,
) = self.get_partition_range_from_partition_id(
partition, partition_datetime
)
except ValueError as e:
logger.error(
f"Unable to get partition range for partition id: {partition} it failed with exception {e}"
)
self.report.invalid_partition_ids[
f"{schema}.{table.name}"
] = partition
return None, None
if table.time_partitioning.type_ in ("DAY", "MONTH", "YEAR"):
partition_where_clause = "{column_name} BETWEEN DATE('{partition_id}') AND DATE('{upper_bound_partition_id}')".format(
column_name=table.time_partitioning.field,
partition_id=partition_datetime,
upper_bound_partition_id=upper_bound_partition_datetime,
)
elif table.time_partitioning.type_ in ("HOUR"):
partition_where_clause = "{column_name} BETWEEN '{partition_id}' AND '{upper_bound_partition_id}'".format(
column_name=table.time_partitioning.field,
partition_id=partition_datetime,
upper_bound_partition_id=upper_bound_partition_datetime,
)
else:
logger.warning(
f"Not supported partition type {table.time_partitioning.type_}"
)
return None, None
custom_sql = """
SELECT
*
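The rewritten profiler block separates two partitioning shapes: tables with a BigQuery time-partitioning spec (DAY/MONTH/YEAR vs HOUR), which get a BETWEEN clause over the partition's datetime range, and partitioned tables that only expose a column flagged `is_partition_column` with no `time_partitioning`, which get a simple lower bound on the partition id. A self-contained, simplified sketch of that branching; the dataclasses below are toy stand-ins, not the BigQuery client types:

```python
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional


@dataclass
class Column:
    name: str
    is_partition_column: bool = False


@dataclass
class TimePartitioning:
    type_: str   # "DAY", "MONTH", "YEAR" or "HOUR"
    field: str   # column the table is partitioned on


@dataclass
class Table:
    name: str
    columns: List[Column] = field(default_factory=list)
    time_partitioning: Optional[TimePartitioning] = None


def partition_where_clause(
    table: Table, partition_id: str, lower: datetime, upper: datetime
) -> Optional[str]:
    if not table.time_partitioning:
        # Partitioning is only visible through a flagged column
        # (e.g. integer-range partitioning): use a simple lower bound.
        partition_column = next(
            (c for c in table.columns if c.is_partition_column), None
        )
        if partition_column is None:
            return None  # partitioned table without a partition column
        return f"{partition_column.name} >= {partition_id}"

    if table.time_partitioning.type_ in ("DAY", "MONTH", "YEAR"):
        return (
            f"{table.time_partitioning.field} BETWEEN "
            f"DATE('{lower}') AND DATE('{upper}')"
        )
    if table.time_partitioning.type_ == "HOUR":
        return f"{table.time_partitioning.field} BETWEEN '{lower}' AND '{upper}'"

    return None  # unsupported partition type


print(
    partition_where_clause(
        Table("events", time_partitioning=TimePartitioning("DAY", "event_date")),
        "20220926",
        datetime(2022, 9, 26),
        datetime(2022, 9, 27),
    )
)
```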

View File

@ -546,10 +546,25 @@ class LookerView:
def _get_sql_info(cls, sql: str, sql_parser_path: str) -> SQLInfo:
parser_cls = cls._import_sql_parser_cls(sql_parser_path)
parser_instance: SQLParser = parser_cls(sql)
try:
parser_instance: SQLParser = parser_cls(sql)
except Exception as e:
logger.warning(f"Sql parser failed on {sql} with {e}")
return SQLInfo(table_names=[], column_names=[])
sql_table_names: List[str]
try:
sql_table_names = parser_instance.get_tables()
except Exception as e:
logger.warning(f"Sql parser failed on {sql} with {e}")
sql_table_names = []
try:
column_names: List[str] = parser_instance.get_columns()
except Exception as e:
logger.warning(f"Sql parser failed on {sql} with {e}")
column_names = []
sql_table_names: List[str] = parser_instance.get_tables()
column_names: List[str] = parser_instance.get_columns()
logger.debug(f"Column names parsed = {column_names}")
# Drop table names with # in them
sql_table_names = [t for t in sql_table_names if "#" not in t]
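`_get_sql_info` now wraps parser construction, `get_tables`, and `get_columns` separately, so one failing step degrades to an empty list instead of aborting view processing; the Redash source and the Great Expectations action below apply the same try/warn/fallback idea. A small sketch of the pattern as a reusable helper; the helper and the toy parser are illustrative, not the module's actual API:

```python
import logging
from typing import Callable, List, TypeVar

logger = logging.getLogger(__name__)
T = TypeVar("T")


def try_parse(step: str, func: Callable[[], T], default: T) -> T:
    """Run one parsing step, falling back to a default when the parser raises."""
    try:
        return func()
    except Exception as e:
        logger.warning(f"Sql parser failed on {step} with {e}")
        return default


class FlakyParser:
    """Toy parser whose column extraction always fails."""

    def __init__(self, sql: str) -> None:
        self.sql = sql

    def get_tables(self) -> List[str]:
        return ["my_db.my_table", "#temp_table"]

    def get_columns(self) -> List[str]:
        raise ValueError("unsupported syntax")


parser = FlakyParser("SELECT * FROM my_db.my_table")
tables = try_parse("get_tables", parser.get_tables, [])
columns = try_parse("get_columns", parser.get_columns, [])
# Drop table names containing '#', as the LookML source does after parsing.
tables = [t for t in tables if "#" not in t]
print(tables, columns)  # ['my_db.my_table'] []
```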

View File

@ -390,7 +390,11 @@ class RedashSource(Source):
def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
parser_cls = cls._import_sql_parser_cls(sql_parser_path)
sql_table_names: List[str] = parser_cls(sql).get_tables()
try:
sql_table_names: List[str] = parser_cls(sql).get_tables()
except Exception as e:
logger.warning(f"Sql parser failed on {sql} with {e}")
return []
# Remove quotes from table names
sql_table_names = [t.replace('"', "") for t in sql_table_names]

View File

@ -642,7 +642,12 @@ class DataHubValidationAction(ValidationAction):
query=query,
customProperties=batchSpecProperties,
)
tables = DefaultSQLParser(query).get_tables()
try:
tables = DefaultSQLParser(query).get_tables()
except Exception as e:
logger.warning(f"Sql parser failed on {query} with {e}")
tables = []
if len(set(tables)) != 1:
warn(
"DataHubValidationAction does not support cross dataset assertions."

View File

@ -8,6 +8,8 @@ from typing import Dict, List, Optional, Set
from sqllineage.core.holders import Column, SQLLineageHolder
from sqllineage.exceptions import SQLLineageException
from datahub.utilities.sql_parser_base import SQLParser, SqlParserException
with contextlib.suppress(ImportError):
import sqlparse
from networkx import DiGraph
@ -17,7 +19,7 @@ with contextlib.suppress(ImportError):
logger = logging.getLogger(__name__)
class SqlLineageSQLParserImpl:
class SqlLineageSQLParserImpl(SQLParser):
_DATE_SWAP_TOKEN = "__d_a_t_e"
_HOUR_SWAP_TOKEN = "__h_o_u_r"
_TIMESTAMP_SWAP_TOKEN = "__t_i_m_e_s_t_a_m_p"
@ -27,6 +29,7 @@ class SqlLineageSQLParserImpl:
_MYVIEW_LOOKER_TOKEN = "my_view.SQL_TABLE_NAME"
def __init__(self, sql_query: str) -> None:
super().__init__(sql_query)
original_sql_query = sql_query
# SqlLineageParser makes mistakes on lateral flatten queries, use the prefix
@ -97,7 +100,9 @@ class SqlLineageSQLParserImpl:
]
self._sql_holder = SQLLineageHolder.of(*self._stmt_holders)
except SQLLineageException as e:
logger.error(f"SQL lineage analyzer error '{e}' for query: '{self._sql}")
raise SqlParserException(
f"SQL lineage analyzer error '{e}' for query: '{self._sql}"
) from e
def get_tables(self) -> List[str]:
result: List[str] = []
@ -123,8 +128,7 @@ class SqlLineageSQLParserImpl:
def get_columns(self) -> List[str]:
if self._sql_holder is None:
logger.error("sql holder not present so cannot get columns")
return []
raise SqlParserException("sql holder not present so cannot get columns")
graph: DiGraph = self._sql_holder.graph # For mypy attribute checking
column_nodes = [n for n in graph.nodes if isinstance(n, Column)]
column_graph = graph.subgraph(column_nodes)
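Because `SqlLineageSQLParserImpl` now subclasses `SQLParser` and raises `SqlParserException` instead of logging and returning empty results, callers get one well-defined failure signal to catch. A hedged usage sketch; it assumes acryl-datahub with this change installed, and whether a given query parses cleanly is up to sqllineage:

```python
import logging

from datahub.utilities.sql_lineage_parser_impl import SqlLineageSQLParserImpl
from datahub.utilities.sql_parser_base import SqlParserException

logger = logging.getLogger(__name__)

query = "CREATE VIEW reporting.daily AS SELECT id, amount FROM raw.payments"

try:
    parser = SqlLineageSQLParserImpl(query)
    tables = parser.get_tables()
    columns = parser.get_columns()
except SqlParserException as e:
    # Failures now surface as a dedicated exception instead of a silent empty list.
    logger.warning(f"Sql parser failed on {query} with {e}")
    tables, columns = [], []

print(tables, columns)
```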

View File

@ -4,30 +4,17 @@ import multiprocessing
import re
import sys
import traceback
from abc import ABCMeta, abstractmethod
from multiprocessing import Process, Queue
from typing import List, Optional, Tuple, Type
from datahub.utilities.sql_lineage_parser_impl import SqlLineageSQLParserImpl
from datahub.utilities.sql_parser_base import SQLParser
with contextlib.suppress(ImportError):
from sql_metadata import Parser as MetadataSQLParser
logger = logging.getLogger(__name__)
class SQLParser(metaclass=ABCMeta):
def __init__(self, sql_query: str) -> None:
self._sql_query = sql_query
@abstractmethod
def get_tables(self) -> List[str]:
pass
@abstractmethod
def get_columns(self) -> List[str]:
pass
class MetadataSQLSQLParser(SQLParser):
_DATE_SWAP_TOKEN = "__d_a_t_e"
@ -104,7 +91,7 @@ def sql_lineage_parser_impl_func_wrapper(
exc_info = sys.exc_info()
exc_msg: str = str(exc_info[1]) + "".join(traceback.format_tb(exc_info[2]))
exception_details = (exc_info[0], exc_msg)
logger.error(exc_msg)
logger.debug(exc_msg)
finally:
queue.put((tables, columns, exception_details))
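The wrapper in this file runs the sqllineage parser in a child process and returns `(tables, columns, exception_details)` over a `Queue`, with the captured traceback now logged at debug level since the caller decides how to surface it. A simplified sketch of that process-isolation pattern; the worker body is a toy, not the module's real wrapper:

```python
import logging
import multiprocessing
import sys
import traceback
from multiprocessing import Process
from typing import Any, List, Optional, Tuple

logger = logging.getLogger(__name__)


def parse_in_child(queue: Any, sql: str) -> None:
    """Toy worker: parse in a child process so a parser crash can't take down the caller."""
    tables: List[str] = []
    columns: List[str] = []
    exception_details: Optional[Tuple[Any, str]] = None
    try:
        tables = [tok for tok in sql.split() if "." in tok]  # stand-in for the real parser
        columns = ["id"]
    except Exception:
        exc_info = sys.exc_info()
        exc_msg = str(exc_info[1]) + "".join(traceback.format_tb(exc_info[2]))
        exception_details = (exc_info[0], exc_msg)
        logger.debug(exc_msg)  # demoted from error: the caller decides how loudly to report
    finally:
        queue.put((tables, columns, exception_details))


if __name__ == "__main__":
    queue = multiprocessing.Queue()
    proc = Process(target=parse_in_child, args=(queue, "SELECT id FROM raw.payments"))
    proc.start()
    tables, columns, exception_details = queue.get()  # read before join to avoid a full-pipe deadlock
    proc.join()
    print(tables, columns, exception_details)
```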

View File

@ -0,0 +1,21 @@
from abc import ABCMeta, abstractmethod
from typing import List
class SqlParserException(Exception):
"""Raised when sql parser fails"""
pass
class SQLParser(metaclass=ABCMeta):
def __init__(self, sql_query: str) -> None:
self._sql_query = sql_query
@abstractmethod
def get_tables(self) -> List[str]:
pass
@abstractmethod
def get_columns(self) -> List[str]:
pass
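The new `sql_parser_base` module carries just the contract, so concrete parsers and their optional heavy imports can live elsewhere without import cycles. A minimal sketch of implementing it; `RegexParser` is a hypothetical toy, not one of DataHub's parsers:

```python
import re
from typing import List

from datahub.utilities.sql_parser_base import SQLParser, SqlParserException


class RegexParser(SQLParser):
    """Toy SQLParser implementation: enough to show the contract, nothing more."""

    def get_tables(self) -> List[str]:
        tables = re.findall(r"(?:FROM|JOIN)\s+([\w.]+)", self._sql_query, re.IGNORECASE)
        if not tables:
            raise SqlParserException(f"No tables found in: {self._sql_query}")
        return tables

    def get_columns(self) -> List[str]:
        match = re.search(
            r"SELECT\s+(.*?)\s+FROM", self._sql_query, re.IGNORECASE | re.DOTALL
        )
        if not match:
            raise SqlParserException(f"No column list found in: {self._sql_query}")
        return [c.strip() for c in match.group(1).split(",")]


print(RegexParser("SELECT id, amount FROM raw.payments").get_tables())  # ['raw.payments']
```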