diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 6311ddf7e2..bc8358b375 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -377,7 +377,7 @@ mypy_stubs = { "types-ujson>=5.2.0", "types-termcolor>=1.0.0", "types-Deprecated", - # Mypy complains with 4.21.0.0 => error: Library stubs not installed for "google.protobuf.descriptor" + # Mypy complains with 4.21.0.0 => error: Library stubs not installed for "google.protobuf.descriptor" "types-protobuf<4.21.0.0", } diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index dde13353f5..683d369973 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -55,6 +55,7 @@ class BigQueryV2Report(SQLSourceReport): upstream_lineage: LossyDict = field(default_factory=LossyDict) partition_info: Dict[str, str] = field(default_factory=TopKDict) profile_table_selection_criteria: Dict[str, str] = field(default_factory=TopKDict) + num_tables_not_eligible_profiling: Dict[str, int] = field(default_factory=TopKDict) selected_profile_tables: Dict[str, List[str]] = field(default_factory=TopKDict) invalid_partition_ids: Dict[str, str] = field(default_factory=TopKDict) allow_pattern: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index 5c0d0f4a6d..cba201431d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -115,18 +115,15 @@ class BigqueryProfiler: ] = partition return None, None + partition_column_type: str = "DATE" + for c in table.columns: + if c.is_partition_column: + partition_column_type = c.data_type + if table.time_partitioning.type_ in ("DAY", "MONTH", "YEAR"): - partition_where_clause = "{column_name} BETWEEN DATE('{partition_id}') AND DATE('{upper_bound_partition_id}')".format( - column_name=table.time_partitioning.field, - partition_id=partition_datetime, - upper_bound_partition_id=upper_bound_partition_datetime, - ) + partition_where_clause = f"`{table.time_partitioning.field}` BETWEEN {partition_column_type}('{partition_datetime}') AND {partition_column_type}('{upper_bound_partition_datetime}')" elif table.time_partitioning.type_ in ("HOUR"): - partition_where_clause = "{column_name} BETWEEN '{partition_id}' AND '{upper_bound_partition_id}'".format( - column_name=table.time_partitioning.field, - partition_id=partition_datetime, - upper_bound_partition_id=upper_bound_partition_datetime, - ) + partition_where_clause = f"`{table.time_partitioning.field}` BETWEEN '{partition_datetime}' AND '{upper_bound_partition_datetime}'" else: logger.warning( f"Not supported partition type {table.time_partitioning.type_}" @@ -216,14 +213,10 @@ WHERE if not self.is_dataset_eligible_for_profiling( dataset_name, table.last_altered, table.size_in_bytes, table.rows_count ): - # Profile only table level if dataset is filtered from profiling - # due to size limits alone - if self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, 0, 0 - ): - profile_table_level_only = True - else: - skip_profiling = True + profile_table_level_only = True + self.report.num_tables_not_eligible_profiling[dataset] = ( + self.report.num_tables_not_eligible_profiling.get(dataset, 0) + 1 + ) if not table.columns: skip_profiling = True diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 97915586e8..22dff7a9d2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -66,7 +66,7 @@ class OperationalDataMeta: def bigquery_audit_metadata_query_template( dataset: str, use_date_sharded_tables: bool, - table_allow_filter: str = None, + table_allow_filter: Optional[str] = None, limit: Optional[int] = None, ) -> str: """