fix(ingest/bigquery): Fixing lineage filter query (#9114)

This commit is contained in:
Tamas Nemeth 2023-10-26 18:46:10 +02:00 committed by GitHub
parent f402090c1e
commit a96a512166
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 13 additions and 6 deletions

View File

@ -309,6 +309,7 @@ class BigQueryV2Config(
"dataset_pattern is not set but schema_pattern is set, using schema_pattern as dataset_pattern. schema_pattern will be deprecated, please use dataset_pattern instead."
)
values["dataset_pattern"] = schema_pattern
dataset_pattern = schema_pattern
elif (
dataset_pattern != AllowDenyPattern.allow_all()
and schema_pattern != AllowDenyPattern.allow_all()

View File

@ -20,6 +20,7 @@ import humanfriendly
from google.cloud.datacatalog import lineage_v1
from google.cloud.logging_v2.client import Client as GCPLoggingClient
from datahub.configuration.pattern_utils import is_schema_allowed
from datahub.emitter import mce_builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
@ -683,8 +684,11 @@ class BigqueryLineageExtractor:
self.report.num_skipped_lineage_entries_missing_data[e.project_id] += 1
continue
if not self.config.dataset_pattern.allowed(
destination_table.table_identifier.dataset
if not is_schema_allowed(
self.config.dataset_pattern,
destination_table.table_identifier.dataset,
destination_table.table_identifier.project_id,
self.config.match_fully_qualified_names,
) or not self.config.table_pattern.allowed(
destination_table.table_identifier.get_table_name()
):

View File

@ -21,6 +21,7 @@ from typing import (
import humanfriendly
from datahub.configuration.pattern_utils import is_schema_allowed
from datahub.configuration.time_window_config import (
BaseTimeWindowConfig,
get_time_bucket,
@ -335,10 +336,11 @@ class BigQueryUsageExtractor:
def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool:
return (
table_ref is not None
and self.config.dataset_pattern.allowed(
f"{table_ref.table_identifier.project_id}.{table_ref.table_identifier.dataset}"
if self.config.match_fully_qualified_names
else table_ref.table_identifier.dataset
and is_schema_allowed(
self.config.dataset_pattern,
table_ref.table_identifier.dataset,
table_ref.table_identifier.project_id,
self.config.match_fully_qualified_names,
)
and self.config.table_pattern.allowed(str(table_ref.table_identifier))
)