From 427a06cfa82d91fde18a143a728fee5badfab697 Mon Sep 17 00:00:00 2001 From: Ayush Shah Date: Sat, 18 Jan 2025 19:41:30 +0530 Subject: [PATCH] MINOR: BigQuery Improvement, Hive Partitioned Tables, Nonetype issue resolved (#19429) --- .../source/database/bigquery/metadata.py | 25 +++++++++++++++++++ .../database/bigquery/profiler/profiler.py | 1 + ingestion/src/metadata/sampler/partition.py | 3 ++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py b/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py index 078853aca59..310cb203601 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/metadata.py @@ -667,6 +667,31 @@ class BigquerySource(LifeCycleQueryMixin, CommonDbSourceService, MultiDBSource): database = self.context.get().database table = self.client.get_table(fqn._build(database, schema_name, table_name)) columns = inspector.get_columns(table_name, schema_name, db_name=database) + if hasattr(table, "external_data_configuration") and hasattr( + table.external_data_configuration, "hive_partitioning" + ): + # Ingesting External Hive Partitioned Tables + from google.cloud.bigquery.external_config import ( # pylint: disable=import-outside-toplevel + HivePartitioningOptions, + ) + + partition_details: HivePartitioningOptions = ( + table.external_data_configuration.hive_partitioning + ) + return True, TablePartition( + columns=[ + PartitionColumnDetails( + columnName=self._get_partition_column_name( + columns=columns, + partition_field_name=field, + ), + interval=str(partition_details._properties.get("mode")), + intervalType=PartitionIntervalTypes.OTHER, + ) + for field in partition_details._properties.get("fields") + ] + ) + if table.time_partitioning is not None: if table.time_partitioning.field: table_partition = TablePartition( diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py index 2cd0f225b31..750ae61fb20 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py @@ -24,6 +24,7 @@ class BigQueryProfiler(BigQueryProfilerInterface): return self.system_metrics_computer.get_system_metrics( table=runner.dataset, usage_location=self.service_connection_config.usageLocation, + runner=runner, ) def initialize_system_metrics_computer(self) -> BigQuerySystemMetricsComputer: diff --git a/ingestion/src/metadata/sampler/partition.py b/ingestion/src/metadata/sampler/partition.py index e62ad94a049..c2eb76f9833 100644 --- a/ingestion/src/metadata/sampler/partition.py +++ b/ingestion/src/metadata/sampler/partition.py @@ -49,7 +49,7 @@ def validate_athena_injected_partitioning( column_partitions: Optional[List[PartitionColumnDetails]] = table_partitions.columns if not column_partitions: - raise RuntimeError("Table parition is set but no columns are defined.") + raise RuntimeError("Table partition is set but no columns are defined.") for column_partition in column_partitions: if column_partition.intervalType == PartitionIntervalTypes.INJECTED: @@ -163,6 +163,7 @@ def _handle_bigquery_partition( partitionIntegerRangeStart=1, partitionIntegerRangeEnd=10000, ) + # TODO: Allow External Hive Partitioning for profiler raise TypeError( f"Unsupported partition type {partition.intervalType}. Skipping table" )