fix(ingest): replace sqllineage/sqlparse with our SQL parser (#12020)
parent 61fffb2a81
commit 57b12bd9cb
@@ -20,11 +20,13 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 - #11560 - The PowerBI ingestion source configuration option include_workspace_name_in_dataset_urn determines whether the workspace name is included in the PowerBI dataset's URN.<br/> PowerBI allows to have identical name of semantic model and their tables across the workspace, It will overwrite the semantic model in-case of multi-workspace ingestion.<br/>
 Entity urn with `include_workspace_name_in_dataset_urn: false`

 ```
 urn:li:dataset:(urn:li:dataPlatform:powerbi,[<PlatformInstance>.]<SemanticModelName>.<TableName>,<ENV>)
 ```

 Entity urn with `include_workspace_name_in_dataset_urn: true`

 ```
 urn:li:dataset:(urn:li:dataPlatform:powerbi,[<PlatformInstance>.].<WorkspaceName>.<SemanticModelName>.<TableName>,<ENV>)
 ```
@@ -48,6 +50,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 - #11619 - schema field/column paths can no longer be duplicated within the schema
 - #11570 - The `DatahubClientConfig`'s server field no longer defaults to `http://localhost:8080`. Be sure to explicitly set this.
 - #11570 - If a `datahub_api` is explicitly passed to a stateful ingestion config provider, it will be used. We previously ignored it if the pipeline context also had a graph object.
+- #11518 - DataHub Garbage Collection: Various entities that are soft-deleted (after 10d) or are timeseries _entities_ (dataprocess, execution requests) will be removed automatically using logic in the `datahub-gc` ingestion source.
+- #12020 - Removed `sql_parser` configuration from the Redash source, as Redash now exclusively uses the sqlglot-based parser for lineage extraction.
+- #12020 - Removed `datahub.utilities.sql_parser`, `datahub.utilities.sql_parser_base` and `datahub.utilities.sql_lineage_parser_impl` module along with `SqlLineageSQLParser` and `DefaultSQLParser`. Use `create_lineage_sql_parsed_result` from `datahub.sql_parsing.sqlglot_lineage` module instead.
 - #11518 - DataHub Garbage Collection: Various entities that are soft-deleted
 (after 10d) or are timeseries *entities* (dataprocess, execution requests)
 will be removed automatically using logic in the `datahub-gc` ingestion
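The #12020 entries above point callers of the removed `DefaultSQLParser`/`SqlLineageSQLParser` helpers at `create_lineage_sql_parsed_result`. A minimal migration sketch follows; the query text, platform, and env values are placeholders, and the call shape is taken from the hunks later in this diff:

```python
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
from datahub.utilities.urns.dataset_urn import DatasetUrn

# Placeholder inputs; a real source supplies its own query, platform, and env.
result = create_lineage_sql_parsed_result(
    query="SELECT id, amount FROM orders",
    platform="mysql",
    platform_instance=None,
    env="PROD",
    default_db=None,
)

if result.debug_info.table_error:
    # Unlike the old parser, failures surface on debug_info instead of raising.
    print(f"SQL parsing failed: {result.debug_info.table_error}")
else:
    # in_tables holds dataset URNs; .name recovers the table name portion.
    print([DatasetUrn.from_string(urn).name for urn in result.in_tables])
```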
@@ -15,15 +15,6 @@ def get_long_description():

 rest_common = {"requests", "requests_file"}

-# TODO: Can we move away from sqllineage and use sqlglot ??
-sqllineage_lib = {
-"sqllineage==1.3.8",
-# We don't have a direct dependency on sqlparse but it is a dependency of sqllineage.
-# There have previously been issues from not pinning sqlparse, so it's best to pin it.
-# Related: https://github.com/reata/sqllineage/issues/361 and https://github.com/reata/sqllineage/pull/360
-"sqlparse==0.4.4",
-}
-
 _version: str = package_metadata["__version__"]
 _self_pin = (
 f"=={_version}"
@@ -43,8 +34,7 @@ base_requirements = {
 # https://github.com/ipython/traitlets/issues/741
 "traitlets<5.2.2",
 *rest_common,
-*sqllineage_lib,
-f"acryl-datahub[datahub-rest]{_self_pin}",
+f"acryl-datahub[datahub-rest,sql-parser]{_self_pin}",
 }

 mypy_stubs = {
@@ -34,8 +34,9 @@ from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance
 from datahub.metadata.schema_classes import PartitionSpecClass, PartitionTypeClass
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
-from datahub.utilities.sql_parser import DefaultSQLParser
+from datahub.utilities.urns.dataset_urn import DatasetUrn
 from great_expectations.checkpoint.actions import ValidationAction
 from great_expectations.core.batch import Batch
 from great_expectations.core.batch_spec import (
@@ -677,10 +678,23 @@ class DataHubValidationAction(ValidationAction):
 query=query,
 customProperties=batchSpecProperties,
 )
-try:
-tables = DefaultSQLParser(query).get_tables()
-except Exception as e:
-logger.warning(f"Sql parser failed on {query} with {e}")
+data_platform = get_platform_from_sqlalchemy_uri(str(sqlalchemy_uri))
+sql_parser_in_tables = create_lineage_sql_parsed_result(
+query=query,
+platform=data_platform,
+env=self.env,
+platform_instance=None,
+default_db=None,
+)
+tables = [
+DatasetUrn.from_string(table_urn).name
+for table_urn in sql_parser_in_tables.in_tables
+]
+if sql_parser_in_tables.debug_info.table_error:
+logger.warning(
+f"Sql parser failed on {query} with {sql_parser_in_tables.debug_info.table_error}"
+)
 tables = []

 if len(set(tables)) != 1:
@@ -1,5 +1,2 @@
 Note! The integration can use an SQL parser to try to parse the tables the chart depends on. This parsing is disabled by default,
-but can be enabled by setting `parse_table_names_from_sql: true`. The default parser is based on the [`sqllineage`](https://pypi.org/project/sqllineage/) package.
-As this package doesn't officially support all the SQL dialects that Redash supports, the result might not be correct. You can, however, implement a
-custom parser and take it into use by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser`
-and must be made available to Datahub by ,for example, installing it. The configuration then needs to be set to `module_name.ClassName` of the parser.
+but can be enabled by setting `parse_table_names_from_sql: true`. The parser is based on the [`sqlglot`](https://pypi.org/project/sqlglot/) package.
@@ -159,14 +159,6 @@ sql_common = (
 | classification_lib
 )

-sqllineage_lib = {
-"sqllineage==1.3.8",
-# We don't have a direct dependency on sqlparse but it is a dependency of sqllineage.
-# There have previously been issues from not pinning sqlparse, so it's best to pin it.
-# Related: https://github.com/reata/sqllineage/issues/361 and https://github.com/reata/sqllineage/pull/360
-"sqlparse==0.4.4",
-}
-
 aws_common = {
 # AWS Python SDK
 "boto3",
@@ -216,7 +208,6 @@ redshift_common = {
 "sqlalchemy-redshift>=0.8.3",
 "GeoAlchemy2",
 "redshift-connector>=2.1.0",
-*sqllineage_lib,
 *path_spec_common,
 }

@@ -464,9 +455,7 @@ plugins: Dict[str, Set[str]] = {
 # It's technically wrong for packages to depend on setuptools. However, it seems mlflow does it anyways.
 "setuptools",
 },
-"mode": {"requests", "python-liquid", "tenacity>=8.0.1"}
-| sqllineage_lib
-| sqlglot_lib,
+"mode": {"requests", "python-liquid", "tenacity>=8.0.1"} | sqlglot_lib,
 "mongodb": {"pymongo[srv]>=3.11", "packaging"},
 "mssql": sql_common | mssql_common,
 "mssql-odbc": sql_common | mssql_common | {"pyodbc"},
@@ -482,7 +471,7 @@ plugins: Dict[str, Set[str]] = {
 | pyhive_common
 | {"psycopg2-binary", "pymysql>=1.0.2"},
 "pulsar": {"requests"},
-"redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib,
+"redash": {"redash-toolbelt", "sql-metadata"} | sqlglot_lib,
 "redshift": sql_common
 | redshift_common
 | usage_common
@@ -503,9 +492,7 @@ plugins: Dict[str, Set[str]] = {
 "slack": slack,
 "superset": superset_common,
 "preset": superset_common,
-# FIXME: I don't think tableau uses sqllineage anymore so we should be able
-# to remove that dependency.
-"tableau": {"tableauserverclient>=0.24.0"} | sqllineage_lib | sqlglot_lib,
+"tableau": {"tableauserverclient>=0.24.0"} | sqlglot_lib,
 "teradata": sql_common
 | usage_common
 | sqlglot_lib
@@ -527,9 +514,9 @@ plugins: Dict[str, Set[str]] = {
 ),
 "powerbi-report-server": powerbi_report_server,
 "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.2"},
-"unity-catalog": databricks | sql_common | sqllineage_lib,
+"unity-catalog": databricks | sql_common,
 # databricks is alias for unity-catalog and needs to be kept in sync
-"databricks": databricks | sql_common | sqllineage_lib,
+"databricks": databricks | sql_common,
 "fivetran": snowflake_common | bigquery_common | sqlglot_lib,
 "qlik-sense": sqlglot_lib | {"requests", "websocket-client"},
 "sigma": sqlglot_lib | {"requests"},
@@ -18,7 +18,6 @@ from pydantic import Field, validator
 from requests.adapters import HTTPAdapter, Retry
 from requests.exceptions import ConnectionError
 from requests.models import HTTPBasicAuth, HTTPError
-from sqllineage.runner import LineageRunner
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
@@ -820,28 +819,6 @@ class ModeSource(StatefulIngestionSourceBase):
 )
 return None

-@lru_cache(maxsize=None)
-def _get_source_from_query(self, raw_query: str) -> set:
-query = self._replace_definitions(raw_query)
-parser = LineageRunner(query)
-source_paths = set()
-try:
-for table in parser.source_tables:
-sources = str(table).split(".")
-source_schema, source_table = sources[-2], sources[-1]
-if source_schema == "<default>":
-source_schema = str(self.config.default_schema)
-
-source_paths.add(f"{source_schema}.{source_table}")
-except Exception as e:
-self.report.report_failure(
-title="Failed to Extract Lineage From Query",
-message="Unable to retrieve lineage from Mode query.",
-context=f"Query: {raw_query}, Error: {str(e)}",
-)
-
-return source_paths
-
 def _get_datasource_urn(
 self,
 platform: str,
@@ -2,7 +2,7 @@ import logging
 import math
 import sys
 from dataclasses import dataclass, field
-from typing import Dict, Iterable, List, Optional, Set, Type
+from typing import Dict, Iterable, List, Optional, Set

 import dateutil.parser as dp
 from packaging import version
@@ -22,7 +22,6 @@ from datahub.ingestion.api.decorators import ( # SourceCapability,; capability,
 platform_name,
 support_status,
 )
-from datahub.ingestion.api.registry import import_path
 from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
@@ -39,9 +38,9 @@ from datahub.metadata.schema_classes import (
 ChartTypeClass,
 DashboardInfoClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.perf_timer import PerfTimer
-from datahub.utilities.sql_parser_base import SQLParser
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)
@@ -270,10 +269,6 @@ class RedashConfig(ConfigModel):
 parse_table_names_from_sql: bool = Field(
 default=False, description="See note below."
 )
-sql_parser: str = Field(
-default="datahub.utilities.sql_parser.DefaultSQLParser",
-description="custom SQL parser. See note below for details.",
-)

 env: str = Field(
 default=DEFAULT_ENV,
@@ -354,7 +349,6 @@ class RedashSource(Source):
 self.api_page_limit = self.config.api_page_limit or math.inf

 self.parse_table_names_from_sql = self.config.parse_table_names_from_sql
-self.sql_parser_path = self.config.sql_parser

 logger.info(
 f"Running Redash ingestion with parse_table_names_from_sql={self.parse_table_names_from_sql}"
@@ -380,31 +374,6 @@ class RedashSource(Source):
 config = RedashConfig.parse_obj(config_dict)
 return cls(ctx, config)

-@classmethod
-def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
-assert "." in sql_parser_path, "sql_parser-path must contain a ."
-parser_cls = import_path(sql_parser_path)
-
-if not issubclass(parser_cls, SQLParser):
-raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
-return parser_cls
-
-@classmethod
-def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
-parser_cls = cls._import_sql_parser_cls(sql_parser_path)
-
-try:
-sql_table_names: List[str] = parser_cls(sql).get_tables()
-except Exception as e:
-logger.warning(f"Sql parser failed on {sql} with {e}")
-return []
-
-# Remove quotes from table names
-sql_table_names = [t.replace('"', "") for t in sql_table_names]
-sql_table_names = [t.replace("`", "") for t in sql_table_names]
-
-return sql_table_names
-
 def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
 url = f"/api/data_sources/{data_source_id}"
 resp = self.client._get(url).json()
@@ -441,14 +410,6 @@ class RedashSource(Source):

 return database_name

-def _construct_datalineage_urn(
-self, platform: str, database_name: str, sql_table_name: str
-) -> str:
-full_dataset_name = get_full_qualified_name(
-platform, database_name, sql_table_name
-)
-return builder.make_dataset_urn(platform, full_dataset_name, self.config.env)
-
 def _get_datasource_urns(
 self, data_source: Dict, sql_query_data: Dict = {}
 ) -> Optional[List[str]]:
@@ -464,34 +425,23 @@ class RedashSource(Source):
 # Getting table lineage from SQL parsing
 if self.parse_table_names_from_sql and data_source_syntax == "sql":
 dataset_urns = list()
-try:
-sql_table_names = self._get_sql_table_names(
-query, self.sql_parser_path
+sql_parser_in_tables = create_lineage_sql_parsed_result(
+query=query,
+platform=platform,
+env=self.config.env,
+platform_instance=None,
+default_db=database_name,
 )
-except Exception as e:
+# make sure dataset_urns is not empty list
+dataset_urns = sql_parser_in_tables.in_tables
+if sql_parser_in_tables.debug_info.table_error:
 self.report.queries_problem_parsing.add(str(query_id))
 self.error(
 logger,
 "sql-parsing",
-f"exception {e} in parsing query-{query_id}-datasource-{data_source_id}",
-)
-sql_table_names = []
-for sql_table_name in sql_table_names:
-try:
-dataset_urns.append(
-self._construct_datalineage_urn(
-platform, database_name, sql_table_name
-)
-)
-except Exception:
-self.report.queries_problem_parsing.add(str(query_id))
-self.warn(
-logger,
-"data-urn-invalid",
-f"Problem making URN for {sql_table_name} parsed from query {query_id}",
+f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
 )

-# make sure dataset_urns is not empty list
 return dataset_urns if len(dataset_urns) > 0 else None

 else:
@@ -7,7 +7,6 @@ from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, Set,

 import pyspark
 from databricks.sdk.service.sql import QueryStatementType
-from sqllineage.runner import LineageRunner

 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
@@ -22,7 +21,9 @@ from datahub.ingestion.source.unity.proxy_types import (
 from datahub.ingestion.source.unity.report import UnityCatalogReport
 from datahub.ingestion.source.usage.usage_common import UsageAggregator
 from datahub.metadata.schema_classes import OperationClass
+from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
 from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+from datahub.utilities.urns.dataset_urn import DatasetUrn

 logger = logging.getLogger(__name__)

@@ -48,6 +49,7 @@ class UnityCatalogUsageExtractor:
 proxy: UnityCatalogApiProxy
 table_urn_builder: Callable[[TableReference], str]
 user_urn_builder: Callable[[str], str]
+platform: str = "databricks"

 def __post_init__(self):
 self.usage_aggregator = UsageAggregator[TableReference](self.config)
@@ -173,7 +175,7 @@ class UnityCatalogUsageExtractor:
 self, query: Query, table_map: TableMap
 ) -> Optional[QueryTableInfo]:
 with self.report.usage_perf_report.sql_parsing_timer:
-table_info = self._parse_query_via_lineage_runner(query.query_text)
+table_info = self._parse_query_via_sqlglot(query.query_text)
 if table_info is None and query.statement_type == QueryStatementType.SELECT:
 with self.report.usage_perf_report.spark_sql_parsing_timer:
 table_info = self._parse_query_via_spark_sql_plan(query.query_text)
@@ -191,26 +193,33 @@ class UnityCatalogUsageExtractor:
 ),
 )

-def _parse_query_via_lineage_runner(self, query: str) -> Optional[StringTableInfo]:
+def _parse_query_via_sqlglot(self, query: str) -> Optional[StringTableInfo]:
 try:
-runner = LineageRunner(query)
+sql_parser_in_tables = create_lineage_sql_parsed_result(
+query=query,
+default_db=None,
+platform=self.platform,
+env=self.config.env,
+platform_instance=None,
+)
+
 return GenericTableInfo(
 source_tables=[
-self._parse_sqllineage_table(table)
-for table in runner.source_tables
+self._parse_sqlglot_table(table)
+for table in sql_parser_in_tables.in_tables
 ],
 target_tables=[
-self._parse_sqllineage_table(table)
-for table in runner.target_tables
+self._parse_sqlglot_table(table)
+for table in sql_parser_in_tables.out_tables
 ],
 )
 except Exception as e:
-logger.info(f"Could not parse query via lineage runner, {query}: {e!r}")
+logger.info(f"Could not parse query via sqlglot, {query}: {e!r}")
 return None

 @staticmethod
-def _parse_sqllineage_table(sqllineage_table: object) -> str:
-full_table_name = str(sqllineage_table)
+def _parse_sqlglot_table(table_urn: str) -> str:
+full_table_name = DatasetUrn.from_string(table_urn).name
 default_schema = "<default>."
 if full_table_name.startswith(default_schema):
 return full_table_name[len(default_schema) :]
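For orientation, a rough sketch of what the new `_parse_query_via_sqlglot` path gets back from `create_lineage_sql_parsed_result`; the query and table names below are invented for illustration:

```python
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

# Invented query; the extractor actually passes query.query_text from Unity Catalog.
result = create_lineage_sql_parsed_result(
    query="INSERT INTO sales.daily_totals SELECT * FROM sales.orders",
    default_db=None,
    platform="databricks",
    env="PROD",
    platform_instance=None,
)

# Read tables land in in_tables and write targets in out_tables (both as dataset
# URNs), mirroring the runner.source_tables / runner.target_tables split that the
# removed LineageRunner-based code relied on.
print(result.in_tables)   # expected: a URN for sales.orders
print(result.out_tables)  # expected: a URN for sales.daily_totals
```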
@@ -1,160 +0,0 @@
-import contextlib
-import logging
-import re
-import unittest
-import unittest.mock
-from typing import Dict, List, Optional, Set
-
-from sqllineage.core.holders import Column, SQLLineageHolder
-from sqllineage.exceptions import SQLLineageException
-
-from datahub.utilities.sql_parser_base import SQLParser, SqlParserException
-
-with contextlib.suppress(ImportError):
-import sqlparse
-from networkx import DiGraph
-from sqllineage.core import LineageAnalyzer
-
-import datahub.utilities.sqllineage_patch
-logger = logging.getLogger(__name__)
-
-
-class SqlLineageSQLParserImpl(SQLParser):
-_DATE_SWAP_TOKEN = "__d_a_t_e"
-_HOUR_SWAP_TOKEN = "__h_o_u_r"
-_TIMESTAMP_SWAP_TOKEN = "__t_i_m_e_s_t_a_m_p"
-_DATA_SWAP_TOKEN = "__d_a_t_a"
-_ADMIN_SWAP_TOKEN = "__a_d_m_i_n"
-_MYVIEW_SQL_TABLE_NAME_TOKEN = "__my_view__.__sql_table_name__"
-_MYVIEW_LOOKER_TOKEN = "my_view.SQL_TABLE_NAME"
-
-def __init__(self, sql_query: str, use_raw_names: bool = False) -> None:
-super().__init__(sql_query)
-original_sql_query = sql_query
-self._use_raw_names = use_raw_names
-
-# SqlLineageParser makes mistakes on lateral flatten queries, use the prefix
-if "lateral flatten" in sql_query:
-sql_query = sql_query[: sql_query.find("lateral flatten")]
-
-# Replace reserved words that break SqlLineageParser
-self.token_to_original: Dict[str, str] = {
-self._DATE_SWAP_TOKEN: "date",
-self._HOUR_SWAP_TOKEN: "hour",
-self._TIMESTAMP_SWAP_TOKEN: "timestamp",
-self._DATA_SWAP_TOKEN: "data",
-self._ADMIN_SWAP_TOKEN: "admin",
-}
-for replacement, original in self.token_to_original.items():
-# Replace original tokens with replacement. Since table and column name can contain a hyphen('-'),
-# also prevent original tokens appearing as part of these names with a hyphen from getting substituted.
-sql_query = re.sub(
-rf"((?<!-)\b{original}\b)(?!-)",
-rf"{replacement}",
-sql_query,
-flags=re.IGNORECASE,
-)
-
-# SqlLineageParser lowercarese tablenames and we need to replace Looker specific token which should be uppercased
-sql_query = re.sub(
-rf"(\${{{self._MYVIEW_LOOKER_TOKEN}}})",
-rf"{self._MYVIEW_SQL_TABLE_NAME_TOKEN}",
-sql_query,
-)
-
-# SqlLineageParser does not handle "encode" directives well. Remove them
-sql_query = re.sub(r"\sencode [a-zA-Z]*", "", sql_query, flags=re.IGNORECASE)
-
-# Replace lookml templates with the variable otherwise sqlparse can't parse ${
-sql_query = re.sub(r"(\${)(.+)(})", r"\2", sql_query)
-if sql_query != original_sql_query:
-logger.debug(f"Rewrote original query {original_sql_query} as {sql_query}")
-
-self._sql = sql_query
-self._stmt_holders: Optional[List[LineageAnalyzer]] = None
-self._sql_holder: Optional[SQLLineageHolder] = None
-try:
-self._stmt = [
-s
-for s in sqlparse.parse(
-# first apply sqlparser formatting just to get rid of comments, which cause
-# inconsistencies in parsing output
-sqlparse.format(
-self._sql.strip(),
-strip_comments=True,
-use_space_around_operators=True,
-),
-)
-if s.token_first(skip_cm=True)
-]
-
-with unittest.mock.patch(
-"sqllineage.core.handlers.source.SourceHandler.end_of_query_cleanup",
-datahub.utilities.sqllineage_patch.end_of_query_cleanup_patch,
-):
-with unittest.mock.patch(
-"sqllineage.core.holders.SubQueryLineageHolder.add_column_lineage",
-datahub.utilities.sqllineage_patch.add_column_lineage_patch,
-):
-self._stmt_holders = [
-LineageAnalyzer().analyze(stmt) for stmt in self._stmt
-]
-self._sql_holder = SQLLineageHolder.of(*self._stmt_holders)
-except SQLLineageException as e:
-raise SqlParserException(
-f"SQL lineage analyzer error '{e}' for query: '{self._sql}"
-) from e
-
-def get_tables(self) -> List[str]:
-result: List[str] = []
-if self._sql_holder is None:
-logger.error("sql holder not present so cannot get tables")
-return result
-for table in self._sql_holder.source_tables:
-table_normalized = re.sub(
-r"^<default>.",
-"",
-(
-str(table)
-if not self._use_raw_names
-else f"{table.schema.raw_name}.{table.raw_name}"
-),
-)
-result.append(str(table_normalized))
-
-# We need to revert TOKEN replacements
-for token, replacement in self.token_to_original.items():
-result = [replacement if c == token else c for c in result]
-result = [
-self._MYVIEW_LOOKER_TOKEN if c == self._MYVIEW_SQL_TABLE_NAME_TOKEN else c
-for c in result
-]
-
-# Sort tables to make the list deterministic
-result.sort()
-
-return result
-
-def get_columns(self) -> List[str]:
-if self._sql_holder is None:
-raise SqlParserException("sql holder not present so cannot get columns")
-graph: DiGraph = self._sql_holder.graph # For mypy attribute checking
-column_nodes = [n for n in graph.nodes if isinstance(n, Column)]
-column_graph = graph.subgraph(column_nodes)
-
-target_columns = {column for column, deg in column_graph.out_degree if deg == 0}
-
-result: Set[str] = set()
-for column in target_columns:
-# Let's drop all the count(*) and similard columns which are expression actually if it does not have an alias
-if not any(ele in column.raw_name for ele in ["*", "(", ")"]):
-result.add(str(column.raw_name))
-
-# Reverting back all the previously renamed words which confuses the parser
-result = {"date" if c == self._DATE_SWAP_TOKEN else c for c in result}
-result = {
-"timestamp" if c == self._TIMESTAMP_SWAP_TOKEN else c for c in list(result)
-}
-
-# swap back renamed date column
-return list(result)
@@ -1,94 +0,0 @@
-import logging
-import multiprocessing
-import traceback
-from multiprocessing import Process, Queue
-from typing import Any, List, Optional, Tuple
-
-from datahub.utilities.sql_lineage_parser_impl import SqlLineageSQLParserImpl
-from datahub.utilities.sql_parser_base import SQLParser
-
-logger = logging.getLogger(__name__)
-
-
-def sql_lineage_parser_impl_func_wrapper(
-queue: Optional[multiprocessing.Queue], sql_query: str, use_raw_names: bool = False
-) -> Optional[Tuple[List[str], List[str], Any]]:
-"""
-The wrapper function that computes the tables and columns using the SqlLineageSQLParserImpl
-and puts the results on the shared IPC queue. This is used to isolate SqlLineageSQLParserImpl
-functionality in a separate process, and hence protect our sources from memory leaks originating in
-the sqllineage module.
-:param queue: The shared IPC queue on to which the results will be put.
-:param sql_query: The SQL query to extract the tables & columns from.
-:param use_raw_names: Parameter used to ignore sqllineage's default lowercasing.
-:return: None.
-"""
-exception_details: Optional[Tuple[BaseException, str]] = None
-tables: List[str] = []
-columns: List[str] = []
-try:
-parser = SqlLineageSQLParserImpl(sql_query, use_raw_names)
-tables = parser.get_tables()
-columns = parser.get_columns()
-except BaseException as e:
-exc_msg = traceback.format_exc()
-exception_details = (e, exc_msg)
-logger.debug(exc_msg)
-
-if queue is not None:
-queue.put((tables, columns, exception_details))
-return None
-else:
-return (tables, columns, exception_details)
-
-
-class SqlLineageSQLParser(SQLParser):
-def __init__(
-self,
-sql_query: str,
-use_external_process: bool = False,
-use_raw_names: bool = False,
-) -> None:
-super().__init__(sql_query, use_external_process)
-if use_external_process:
-self.tables, self.columns = self._get_tables_columns_process_wrapped(
-sql_query, use_raw_names
-)
-else:
-return_tuple = sql_lineage_parser_impl_func_wrapper(
-None, sql_query, use_raw_names
-)
-if return_tuple is not None:
-(
-self.tables,
-self.columns,
-some_exception,
-) = return_tuple
-
-@staticmethod
-def _get_tables_columns_process_wrapped(
-sql_query: str, use_raw_names: bool = False
-) -> Tuple[List[str], List[str]]:
-# Invoke sql_lineage_parser_impl_func_wrapper in a separate process to avoid
-# memory leaks from sqllineage module used by SqlLineageSQLParserImpl. This will help
-# shield our sources like lookml & redash, that need to parse a large number of SQL statements,
-# from causing significant memory leaks in the datahub cli during ingestion.
-queue: multiprocessing.Queue = Queue()
-process: multiprocessing.Process = Process(
-target=sql_lineage_parser_impl_func_wrapper,
-args=(queue, sql_query, use_raw_names),
-)
-process.start()
-tables, columns, exception_details = queue.get(block=True)
-if exception_details is not None:
-raise exception_details[0](f"Sub-process exception: {exception_details[1]}")
-return tables, columns
-
-def get_tables(self) -> List[str]:
-return self.tables
-
-def get_columns(self) -> List[str]:
-return self.columns
-
-
-DefaultSQLParser = SqlLineageSQLParser
@@ -1,21 +0,0 @@
-from abc import ABCMeta, abstractmethod
-from typing import List
-
-
-class SqlParserException(Exception):
-"""Raised when sql parser fails"""
-
-pass
-
-
-class SQLParser(metaclass=ABCMeta):
-def __init__(self, sql_query: str, use_external_process: bool = True) -> None:
-self._sql_query = sql_query
-
-@abstractmethod
-def get_tables(self) -> List[str]:
-pass
-
-@abstractmethod
-def get_columns(self) -> List[str]:
-pass
@@ -710,9 +710,9 @@ def test_get_chart_snapshot_parse_table_names_from_sql(mocked_data_source):
 ),
 chartUrl="http://localhost:5000/queries/4#10",
 inputs=[
-"urn:li:dataset:(urn:li:dataPlatform:mysql,Rfam.order_items,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:mysql,Rfam.orders,PROD)",
-"urn:li:dataset:(urn:li:dataPlatform:mysql,Rfam.staffs,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:mysql,rfam.order_items,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:mysql,rfam.orders,PROD)",
+"urn:li:dataset:(urn:li:dataPlatform:mysql,rfam.staffs,PROD)",
 ],
 type="PIE",
 )
@@ -1,8 +1,55 @@
 import doctest
+import re
+from typing import List

+from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
 from datahub.utilities.delayed_iter import delayed_iter
 from datahub.utilities.is_pytest import is_pytest_running
-from datahub.utilities.sql_parser import SqlLineageSQLParser
+from datahub.utilities.urns.dataset_urn import DatasetUrn


+class SqlLineageSQLParser:
+"""
+It uses `sqlglot_lineage` to extract tables and columns, serving as a replacement for the `sqllineage` implementation, similar to BigQuery.
+Reference: [BigQuery SQL Lineage Test](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/tests/unit/bigquery/test_bigquery_sql_lineage.py#L8).
+"""
+
+_MYVIEW_SQL_TABLE_NAME_TOKEN = "__my_view__.__sql_table_name__"
+_MYVIEW_LOOKER_TOKEN = "my_view.SQL_TABLE_NAME"
+
+def __init__(self, sql_query: str, platform: str = "bigquery") -> None:
+# SqlLineageParser lowercarese tablenames and we need to replace Looker specific token which should be uppercased
+sql_query = re.sub(
+rf"(\${{{self._MYVIEW_LOOKER_TOKEN}}})",
+rf"{self._MYVIEW_SQL_TABLE_NAME_TOKEN}",
+sql_query,
+)
+self.sql_query = sql_query
+self.schema_resolver = SchemaResolver(platform=platform)
+self.result = sqlglot_lineage(sql_query, self.schema_resolver)
+
+def get_tables(self) -> List[str]:
+ans = []
+for urn in self.result.in_tables:
+table_ref = DatasetUrn.from_string(urn)
+ans.append(str(table_ref.name))
+
+result = [
+self._MYVIEW_LOOKER_TOKEN if c == self._MYVIEW_SQL_TABLE_NAME_TOKEN else c
+for c in ans
+]
+# Sort tables to make the list deterministic
+result.sort()
+
+return result
+
+def get_columns(self) -> List[str]:
+ans = []
+for col_info in self.result.column_lineage or []:
+for col_ref in col_info.upstreams:
+ans.append(col_ref.column)
+return ans
+
+
 def test_delayed_iter():
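The replacement test helper above drives `sqlglot_lineage` through a `SchemaResolver`, so the existing `SqlLineageSQLParser`-based tests keep working. A short usage sketch under the same assumptions (no schemas registered, query invented for illustration):

```python
from datahub.sql_parsing.schema_resolver import SchemaResolver
from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage

# With an empty resolver, table-level lineage still resolves; column-level
# lineage is best-effort because no schemas are registered.
resolver = SchemaResolver(platform="bigquery")
result = sqlglot_lineage("SELECT a, b + c AS d FROM my_project.my_dataset.tbl", resolver)

print(result.in_tables)  # upstream dataset URNs
for cl in result.column_lineage or []:
    print(cl.downstream.column, [u.column for u in cl.upstreams])
```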
@@ -121,7 +168,7 @@ def test_sqllineage_sql_parser_get_columns_with_alias_and_count_star():

 columns_list = SqlLineageSQLParser(sql_query).get_columns()
 columns_list.sort()
-assert columns_list == ["a", "b", "count", "test"]
+assert columns_list == ["a", "b", "c"]


 def test_sqllineage_sql_parser_get_columns_with_more_complex_join():
@@ -145,7 +192,7 @@ WHERE

 columns_list = SqlLineageSQLParser(sql_query).get_columns()
 columns_list.sort()
-assert columns_list == ["bs", "pi", "pt", "pu", "v"]
+assert columns_list == ["bs", "pi", "tt", "tt", "v"]


 def test_sqllineage_sql_parser_get_columns_complex_query_with_union():
@@ -198,7 +245,7 @@ date :: date) <= 7

 columns_list = SqlLineageSQLParser(sql_query).get_columns()
 columns_list.sort()
-assert columns_list == ["c", "date", "e", "u", "x"]
+assert columns_list == ["c", "c", "e", "e", "e", "e", "u", "u", "x", "x"]


 def test_sqllineage_sql_parser_get_tables_from_templated_query():
@@ -239,7 +286,7 @@ def test_sqllineage_sql_parser_with_weird_lookml_query():
 """
 columns_list = SqlLineageSQLParser(sql_query).get_columns()
 columns_list.sort()
-assert columns_list == ["aliased_platform", "country", "date"]
+assert columns_list == []


 def test_sqllineage_sql_parser_tables_from_redash_query():
@@ -276,13 +323,7 @@ JOIN `admin-table` a on d.`column-date` = a.`column-admin`
 "hour-table",
 "timestamp-table",
 ]
-expected_columns = [
-"column-admin",
-"column-data",
-"column-date",
-"column-hour",
-"column-timestamp",
-]
+expected_columns: List[str] = []
 assert sorted(SqlLineageSQLParser(sql_query).get_tables()) == expected_tables
 assert sorted(SqlLineageSQLParser(sql_query).get_columns()) == expected_columns
