Mirror of https://github.com/datahub-project/datahub.git, synced 2025-08-20 15:18:03 +00:00
feat(ingest/snowflake): add config option to specify deny patterns for upstreams (#7962)
Co-authored-by: Andrew Sikowitz <andrew.sikowitz@acryl.io>
parent 13b1d66170
commit c845c75a2d
@@ -1,5 +1,7 @@
 from datahub.configuration.common import AllowDenyPattern
 
+UUID_REGEX = r"[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}"
+
 
 def is_schema_allowed(
     schema_pattern: AllowDenyPattern,
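The shared UUID_REGEX accepts both hyphen- and underscore-delimited UUIDs, which is what lets the Segment and Stitch deny patterns below reuse it. A quick sanity check (illustrative only, not part of the commit; the example UUIDs are made up):

    import re

    UUID_REGEX = r"[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}"

    # Standard hyphenated UUID.
    assert re.fullmatch(UUID_REGEX, "123e4567-e89b-12d3-a456-426614174000")
    # Underscore-delimited variant, as seen in some staging table names.
    assert re.fullmatch(UUID_REGEX, "123e4567_e89b_12d3_a456_426614174000")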
@@ -1,10 +1,11 @@
 import logging
 from enum import Enum
-from typing import Dict, Optional, cast
+from typing import Dict, List, Optional, cast
 
 from pydantic import Field, SecretStr, root_validator, validator
 
 from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.glossary.classifier import ClassificationConfig
@@ -20,6 +21,18 @@ from datahub.ingestion.source_config.usage.snowflake_usage import SnowflakeUsage
 
 logger = logging.Logger(__name__)
 
+# FIVETRAN creates temporary tables in a schema named FIVETRAN_xxx_STAGING.
+# Ref - https://support.fivetran.com/hc/en-us/articles/1500003507122-Why-Is-There-an-Empty-Schema-Named-Fivetran-staging-in-the-Destination-
+#
+# DBT incremental models create temporary tables ending with __dbt_tmp
+# Ref - https://discourse.getdbt.com/t/handling-bigquery-incremental-dbt-tmp-tables/7540
+DEFAULT_UPSTREAMS_DENY_LIST = [
+    r".*\.FIVETRAN_.*_STAGING\..*",  # fivetran
+    r".*__DBT_TMP$",  # dbt
+    rf".*\.SEGMENT_{UUID_REGEX}",  # segment
+    rf".*\.STAGING_.*_{UUID_REGEX}",  # stitch
+]
+
 
 class TagOption(str, Enum):
     with_lineage = "with_lineage"
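To illustrate what the defaults catch (hypothetical table names, not from the commit): upstream names are matched in DATABASE.SCHEMA.TABLE form, and the SQL filter introduced below applies them case-insensitively against the whole name:

    import re

    from datahub.ingestion.source.snowflake.snowflake_config import (
        DEFAULT_UPSTREAMS_DENY_LIST,
    )

    def denied(name: str) -> bool:
        # Mirrors the case-insensitive, whole-name matching done in SQL via RLIKE.
        return any(
            re.fullmatch(p, name, re.IGNORECASE) for p in DEFAULT_UPSTREAMS_DENY_LIST
        )

    assert denied("DB.FIVETRAN_ABC_STAGING.SOME_TABLE")  # fivetran staging table
    assert denied("DB.SCHEMA.ORDERS__DBT_TMP")           # dbt incremental temp table
    assert not denied("DB.SCHEMA.ORDERS")                # ordinary tables pass through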
@@ -89,6 +102,16 @@ class SnowflakeV2Config(
         description="Whether to validate upstream snowflake tables against allow-deny patterns",
     )
 
+    tag_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
+    )
+
+    upstreams_deny_pattern: List[str] = Field(
+        default=DEFAULT_UPSTREAMS_DENY_LIST,
+        description="[Advanced] Regex patterns for upstream tables to filter in ingestion. Specify a regex that matches the full table name in database.schema.table format. The defaults are set to ignore the temporary staging tables created by known ETL tools. Not used if `use_legacy_lineage_method=True`.",
+    )
+
     @validator("include_column_lineage")
     def validate_include_column_lineage(cls, v, values):
         if not values.get("include_table_lineage") and v:
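For example (hypothetical override, not from the commit), a user who also wants to drop an in-house staging schema could extend the defaults rather than replace them:

    from datahub.ingestion.source.snowflake.snowflake_config import (
        DEFAULT_UPSTREAMS_DENY_LIST,
    )

    # MY_ETL_STAGING is a made-up schema name for illustration.
    custom_deny = DEFAULT_UPSTREAMS_DENY_LIST + [r".*\.MY_ETL_STAGING\..*"]

    # In a recipe this would go under source.config:
    config_overrides = {"upstreams_deny_pattern": custom_deny}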
@@ -97,11 +120,6 @@ class SnowflakeV2Config(
             )
         return v
 
-    tag_pattern: AllowDenyPattern = Field(
-        default=AllowDenyPattern.allow_all(),
-        description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
-    )
-
     @root_validator(pre=False)
     def validate_unsupported_configs(cls, values: Dict) -> Dict:
         value = values.get("include_read_operational_stats")
@@ -140,15 +158,14 @@ class SnowflakeV2Config(
             and values["stateful_ingestion"].enabled
             and values["stateful_ingestion"].remove_stale_metadata
         )
-        include_table_lineage = values.get("include_table_lineage")
 
         # TODO: Allow lineage extraction and profiling irrespective of basic schema extraction,
-        # as it seems possible with some refractor
+        # as it seems possible with some refactor
         if not include_technical_schema and any(
-            [include_profiles, delete_detection_enabled, include_table_lineage]
+            [include_profiles, delete_detection_enabled]
         ):
             raise ValueError(
-                "Can not perform Deletion Detection, Lineage Extraction, Profiling without extracting snowflake technical schema. Set `include_technical_schema` to True or disable Deletion Detection, Lineage Extraction, Profiling."
+                "Cannot perform Deletion Detection or Profiling without extracting snowflake technical schema. Set `include_technical_schema` to True or disable Deletion Detection and Profiling."
             )
 
         return values
@@ -340,6 +340,7 @@ class SnowflakeLineageExtractor(
             if not self.config.ignore_start_time_lineage
             else 0,
             end_time_millis=int(self.config.end_time.timestamp() * 1000),
+            upstreams_deny_pattern=self.config.upstreams_deny_pattern,
             include_view_lineage=self.config.include_view_lineage,
             include_column_lineage=self.config.include_column_lineage,
         )
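For context, the millisecond bounds are plain Unix epochs; e.g. the constant 1654586220000 used in the tests below corresponds to 2022-06-07 07:17:00 UTC:

    from datetime import datetime, timezone

    end_time = datetime(2022, 6, 7, 7, 17, tzinfo=timezone.utc)
    assert int(end_time.timestamp() * 1000) == 1654586220000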
@@ -1,6 +1,27 @@
-from typing import Optional
+from typing import List, Optional
 
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
+from datahub.ingestion.source.snowflake.snowflake_config import (
+    DEFAULT_UPSTREAMS_DENY_LIST,
+)
+
+
+def create_deny_regex_sql_filter(
+    upstreams_deny_pattern: List[str], filter_cols: List[str]
+) -> str:
+    upstream_sql_filter = (
+        " AND ".join(
+            [
+                (f"NOT RLIKE({col_name},'{regexp}','i')")
+                for col_name in filter_cols
+                for regexp in upstreams_deny_pattern
+            ]
+        )
+        if upstreams_deny_pattern
+        else ""
+    )
+
+    return upstream_sql_filter
 
 
 class SnowflakeQuery:
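A note on semantics: Snowflake's RLIKE matches the pattern against the entire subject string, and the 'i' parameter makes it case-insensitive; that is why every deny pattern carries leading `.*` wildcards. A rough local emulation for intuition (a sketch under those assumptions, not part of the commit):

    import re

    def rlike(subject: str, pattern: str, params: str = "") -> bool:
        # Snowflake RLIKE implicitly anchors the pattern to the whole
        # subject, like re.fullmatch; 'i' maps to re.IGNORECASE.
        flags = re.IGNORECASE if "i" in params else 0
        return re.fullmatch(pattern, subject, flags) is not None

    assert rlike("DB.SCHEMA.ORDERS__DBT_TMP", r".*__DBT_TMP$", "i")
    assert not rlike("DB.SCHEMA.ORDERS__DBT_TMP_BACKUP", r".*__DBT_TMP$", "i")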
@@ -426,20 +447,30 @@ class SnowflakeQuery:
             ) = 1
             """
 
+    # Note on use of `upstreams_deny_pattern` to ignore temporary tables:
+    # Snowflake access history may include temporary tables in DIRECT_OBJECTS_ACCESSED and
+    # OBJECTS_MODIFIED->columns->directSources. We do not need these temporary tables and filter them out in the query.
     @staticmethod
     def table_to_table_lineage_history_v2(
         start_time_millis: int,
         end_time_millis: int,
         include_view_lineage: bool = True,
         include_column_lineage: bool = True,
+        upstreams_deny_pattern: List[str] = DEFAULT_UPSTREAMS_DENY_LIST,
     ) -> str:
         if include_column_lineage:
             return SnowflakeQuery.table_upstreams_with_column_lineage(
-                start_time_millis, end_time_millis, include_view_lineage
+                start_time_millis,
+                end_time_millis,
+                upstreams_deny_pattern,
+                include_view_lineage,
             )
         else:
             return SnowflakeQuery.table_upstreams_only(
-                start_time_millis, end_time_millis, include_view_lineage
+                start_time_millis,
+                end_time_millis,
+                upstreams_deny_pattern,
+                include_view_lineage,
             )
 
     @staticmethod
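For context (an illustrative call, not taken from the commit), the lineage extractor shown earlier ends up invoking this dispatcher roughly as follows; with include_column_lineage=True it returns the column-lineage SQL:

    sql = SnowflakeQuery.table_to_table_lineage_history_v2(
        start_time_millis=1654499820000,
        end_time_millis=1654586220000,
        include_view_lineage=True,
        include_column_lineage=True,
        upstreams_deny_pattern=DEFAULT_UPSTREAMS_DENY_LIST,
    )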
@@ -664,19 +695,11 @@ class SnowflakeQuery:
             basic_usage_counts.bucket_start_time
         """
 
-    # Note on temporary tables:
-    # Snowflake access history may include temporary tables in DIRECT_OBJECTS_ACCESSED and
-    # OBJECTS_MODIFIED->columns->directSources. We do not need these temporary tables and filter these in the query.
-    #
-    # FIVETRAN creates temporary tables in schema named FIVETRAN_xxx_STAGING.
-    # Ref - https://support.fivetran.com/hc/en-us/articles/1500003507122-Why-Is-There-an-Empty-Schema-Named-Fivetran-staging-in-the-Destination-
-    #
-    # DBT incremental models create temporary tables ending with __dbt_tmp
-    # Ref - https://discourse.getdbt.com/t/handling-bigquery-incremental-dbt-tmp-tables/7540
     @staticmethod
     def table_upstreams_with_column_lineage(
         start_time_millis: int,
         end_time_millis: int,
+        upstreams_deny_pattern: List[str],
         include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
@@ -684,6 +707,12 @@ class SnowflakeQuery:
             if include_view_lineage
             else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )
+
+        upstream_sql_filter = create_deny_regex_sql_filter(
+            upstreams_deny_pattern,
+            ["upstream_table_name", "upstream_column_table_name"],
+        )
+
         return f"""
         WITH column_lineage_history AS (
             SELECT
@@ -712,10 +741,7 @@ class SnowflakeQuery:
             AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
             AND upstream_table_domain in {allowed_upstream_table_domains}
             AND downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}'
-            AND upstream_column_table_name NOT LIKE '%.FIVETRAN\\_%\\_STAGING.%'
-            AND upstream_column_table_name NOT LIKE '%\\_\\_DBT\\_TMP'
-            AND upstream_table_name NOT LIKE '%.FIVETRAN\\_%\\_STAGING.%'
-            AND upstream_table_name NOT LIKE '%\\_\\_DBT\\_TMP'
+            {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
         ),
         column_upstream_jobs AS (
             SELECT
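To see what actually lands in the generated SQL, the default filter can be rendered locally (illustrative, not part of the commit; output elided after the first two clauses):

    from datahub.ingestion.source.snowflake.snowflake_config import (
        DEFAULT_UPSTREAMS_DENY_LIST,
    )
    from datahub.ingestion.source.snowflake.snowflake_query import (
        create_deny_regex_sql_filter,
    )

    upstream_sql_filter = create_deny_regex_sql_filter(
        DEFAULT_UPSTREAMS_DENY_LIST, ["upstream_table_name"]
    )
    # AND NOT RLIKE(upstream_table_name,'.*\.FIVETRAN_.*_STAGING\..*','i')
    # AND NOT RLIKE(upstream_table_name,'.*__DBT_TMP$','i') ...
    print(("AND " + upstream_sql_filter) if upstream_sql_filter else "")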
@@ -781,6 +807,7 @@ class SnowflakeQuery:
     def table_upstreams_only(
         start_time_millis: int,
         end_time_millis: int,
+        upstreams_deny_pattern: List[str],
         include_view_lineage: bool = True,
     ) -> str:
         allowed_upstream_table_domains = (
@@ -788,6 +815,11 @@ class SnowflakeQuery:
             if include_view_lineage
             else SnowflakeQuery.ACCESS_HISTORY_TABLE_DOMAINS_FILTER
         )
+
+        upstream_sql_filter = create_deny_regex_sql_filter(
+            upstreams_deny_pattern,
+            ["upstream_table_name"],
+        )
         return f"""
         WITH table_lineage_history AS (
             SELECT
@@ -810,8 +842,7 @@ class SnowflakeQuery:
             AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
             AND upstream_table_domain in {allowed_upstream_table_domains}
             AND downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}'
-            AND upstream_table_name NOT LIKE '%.FIVETRAN\\_%\\_STAGING.%'
-            AND upstream_table_name NOT LIKE '%\\_\\_DBT\\_TMP'
+            {("AND " + upstream_sql_filter) if upstream_sql_filter else ""}
         )
         SELECT
             downstream_table_name AS "DOWNSTREAM_TABLE_NAME",
@@ -312,7 +312,10 @@ def default_query_results(query):  # noqa: C901
         ]
     elif query in (
         snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2(
-            1654499820000, 1654586220000, True, True
+            start_time_millis=1654499820000,
+            end_time_millis=1654586220000,
+            include_view_lineage=True,
+            include_column_lineage=True,
         ),
     ):
         return [
@@ -381,7 +384,10 @@ def default_query_results(query):  # noqa: C901
         ]
     elif query in (
         snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2(
-            1654499820000, 1654586220000, False, False
+            start_time_millis=1654499820000,
+            end_time_millis=1654586220000,
+            include_view_lineage=False,
+            include_column_lineage=False,
        ),
    ):
        return [
@@ -228,7 +228,10 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure(
         default_query_results,
         [
             snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2(
-                1654499820000, 1654586220000, False, True
+                start_time_millis=1654499820000,
+                end_time_millis=1654586220000,
+                include_view_lineage=False,
+                include_column_lineage=True,
             )
         ],
         "Database 'SNOWFLAKE' does not exist or not authorized.",
@@ -4,13 +4,20 @@ import pytest
 from pydantic import ValidationError
 
 from datahub.configuration.common import OauthConfiguration
+from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.ingestion.api.source import SourceCapability
 from datahub.ingestion.source.snowflake.constants import (
     CLIENT_PREFETCH_THREADS,
     CLIENT_SESSION_KEEP_ALIVE,
     SnowflakeCloudProvider,
 )
-from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
+from datahub.ingestion.source.snowflake.snowflake_config import (
+    DEFAULT_UPSTREAMS_DENY_LIST,
+    SnowflakeV2Config,
+)
+from datahub.ingestion.source.snowflake.snowflake_query import (
+    create_deny_regex_sql_filter,
+)
 from datahub.ingestion.source.snowflake.snowflake_usage_v2 import (
     SnowflakeObjectAccessEntry,
 )
@@ -572,3 +579,28 @@ def test_snowflake_object_access_entry_missing_object_id():
             "objectName": "SOME.OBJECT.NAME",
         }
     )
+
+
+def test_snowflake_query_create_deny_regex_sql():
+    assert create_deny_regex_sql_filter([], ["col"]) == ""
+    assert (
+        create_deny_regex_sql_filter([".*tmp.*"], ["col"])
+        == "NOT RLIKE(col,'.*tmp.*','i')"
+    )
+
+    assert (
+        create_deny_regex_sql_filter([".*tmp.*", UUID_REGEX], ["col"])
+        == "NOT RLIKE(col,'.*tmp.*','i') AND NOT RLIKE(col,'[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i')"
+    )
+
+    assert (
+        create_deny_regex_sql_filter([".*tmp.*", UUID_REGEX], ["col1", "col2"])
+        == "NOT RLIKE(col1,'.*tmp.*','i') AND NOT RLIKE(col1,'[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT RLIKE(col2,'.*tmp.*','i') AND NOT RLIKE(col2,'[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i')"
+    )
+
+    assert (
+        create_deny_regex_sql_filter(
+            DEFAULT_UPSTREAMS_DENY_LIST, ["upstream_table_name"]
+        )
+        == r"NOT RLIKE(upstream_table_name,'.*\.FIVETRAN_.*_STAGING\..*','i') AND NOT RLIKE(upstream_table_name,'.*__DBT_TMP$','i') AND NOT RLIKE(upstream_table_name,'.*\.SEGMENT_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT RLIKE(upstream_table_name,'.*\.STAGING_.*_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i')"
+    )