fix(ingest/bigquery): Fixing lineage filter query (#9114)

This commit is contained in:
Tamas Nemeth 2023-10-26 18:46:10 +02:00 committed by GitHub
parent f402090c1e
commit a96a512166
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 13 additions and 6 deletions

View File

@@ -309,6 +309,7 @@ class BigQueryV2Config(
                 "dataset_pattern is not set but schema_pattern is set, using schema_pattern as dataset_pattern. schema_pattern will be deprecated, please use dataset_pattern instead."
             )
             values["dataset_pattern"] = schema_pattern
+            dataset_pattern = schema_pattern
         elif (
             dataset_pattern != AllowDenyPattern.allow_all()
             and schema_pattern != AllowDenyPattern.allow_all()

View File

@@ -20,6 +20,7 @@ import humanfriendly
 from google.cloud.datacatalog import lineage_v1
 from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -683,8 +684,11 @@ class BigqueryLineageExtractor:
                 self.report.num_skipped_lineage_entries_missing_data[e.project_id] += 1
                 continue
-            if not self.config.dataset_pattern.allowed(
-                destination_table.table_identifier.dataset
+            if not is_schema_allowed(
+                self.config.dataset_pattern,
+                destination_table.table_identifier.dataset,
+                destination_table.table_identifier.project_id,
+                self.config.match_fully_qualified_names,
             ) or not self.config.table_pattern.allowed(
                 destination_table.table_identifier.get_table_name()
             ):

View File

@@ -21,6 +21,7 @@ from typing import (
 import humanfriendly
+from datahub.configuration.pattern_utils import is_schema_allowed
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     get_time_bucket,
@@ -335,10 +336,11 @@ class BigQueryUsageExtractor:
     def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool:
         return (
             table_ref is not None
-            and self.config.dataset_pattern.allowed(
-                f"{table_ref.table_identifier.project_id}.{table_ref.table_identifier.dataset}"
-                if self.config.match_fully_qualified_names
-                else table_ref.table_identifier.dataset
+            and is_schema_allowed(
+                self.config.dataset_pattern,
+                table_ref.table_identifier.dataset,
+                table_ref.table_identifier.project_id,
+                self.config.match_fully_qualified_names,
             )
             and self.config.table_pattern.allowed(str(table_ref.table_identifier))
         )