fix(ingest): bigquery - Fix querying of non-date partition columns in profiling (#6554)

This commit is contained in:
Tamas Nemeth 2022-11-26 18:48:33 +01:00 committed by GitHub
parent d424edde41
commit 278c38cae4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 14 additions and 20 deletions

View File

@ -55,6 +55,7 @@ class BigQueryV2Report(SQLSourceReport):
upstream_lineage: LossyDict = field(default_factory=LossyDict)
partition_info: Dict[str, str] = field(default_factory=TopKDict)
profile_table_selection_criteria: Dict[str, str] = field(default_factory=TopKDict)
num_tables_not_eligible_profiling: Dict[str, int] = field(default_factory=TopKDict)
selected_profile_tables: Dict[str, List[str]] = field(default_factory=TopKDict)
invalid_partition_ids: Dict[str, str] = field(default_factory=TopKDict)
allow_pattern: Optional[str] = None

View File

@ -115,18 +115,15 @@ class BigqueryProfiler:
] = partition
return None, None
partition_column_type: str = "DATE"
for c in table.columns:
if c.is_partition_column:
partition_column_type = c.data_type
if table.time_partitioning.type_ in ("DAY", "MONTH", "YEAR"):
partition_where_clause = "{column_name} BETWEEN DATE('{partition_id}') AND DATE('{upper_bound_partition_id}')".format(
column_name=table.time_partitioning.field,
partition_id=partition_datetime,
upper_bound_partition_id=upper_bound_partition_datetime,
)
partition_where_clause = f"`{table.time_partitioning.field}` BETWEEN {partition_column_type}('{partition_datetime}') AND {partition_column_type}('{upper_bound_partition_datetime}')"
elif table.time_partitioning.type_ in ("HOUR"):
partition_where_clause = "{column_name} BETWEEN '{partition_id}' AND '{upper_bound_partition_id}'".format(
column_name=table.time_partitioning.field,
partition_id=partition_datetime,
upper_bound_partition_id=upper_bound_partition_datetime,
)
partition_where_clause = f"`{table.time_partitioning.field}` BETWEEN '{partition_datetime}' AND '{upper_bound_partition_datetime}'"
else:
logger.warning(
f"Not supported partition type {table.time_partitioning.type_}"
@ -216,14 +213,10 @@ WHERE
if not self.is_dataset_eligible_for_profiling(
dataset_name, table.last_altered, table.size_in_bytes, table.rows_count
):
# Profile only table level if dataset is filtered from profiling
# due to size limits alone
if self.is_dataset_eligible_for_profiling(
dataset_name, table.last_altered, 0, 0
):
profile_table_level_only = True
else:
skip_profiling = True
profile_table_level_only = True
self.report.num_tables_not_eligible_profiling[dataset] = (
self.report.num_tables_not_eligible_profiling.get(dataset, 0) + 1
)
if not table.columns:
skip_profiling = True

View File

@ -66,7 +66,7 @@ class OperationalDataMeta:
def bigquery_audit_metadata_query_template(
dataset: str,
use_date_sharded_tables: bool,
table_allow_filter: str = None,
table_allow_filter: Optional[str] = None,
limit: Optional[int] = None,
) -> str:
"""