From dd892dfbb17b75f479a8bcf6e77f39b6a641e671 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:31:17 +0530 Subject: [PATCH] fix(ingest/bigquery): ignore include constraints for biglake datasets (#11874) --- .../source/bigquery_v2/bigquery_schema.py | 47 ++++++++++++++----- .../source/bigquery_v2/bigquery_schema_gen.py | 23 ++++----- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 58317b108b..be85d037af 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -152,6 +152,21 @@ class BigqueryDataset: snapshots: List[BigqueryTableSnapshot] = field(default_factory=list) columns: List[BigqueryColumn] = field(default_factory=list) + # Some INFORMATION_SCHEMA views are not available for BigLake tables + # based on Amazon S3 and Blob Storage data. + # https://cloud.google.com/bigquery/docs/omni-introduction#limitations + # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations + def is_biglake_dataset(self) -> bool: + return self.location is not None and self.location.lower().startswith( + ("aws-", "azure-") + ) + + def supports_table_constraints(self) -> bool: + return not self.is_biglake_dataset() + + def supports_table_partitions(self) -> bool: + return not self.is_biglake_dataset() + @dataclass class BigqueryProject: @@ -541,18 +556,26 @@ class BigQuerySchemaApi: table_name=constraint.table_name, type=constraint.constraint_type, field_path=constraint.column_name, - referenced_project_id=constraint.referenced_catalog - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_dataset=constraint.referenced_schema - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_table_name=constraint.referenced_table - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_column_name=constraint.referenced_column - if constraint.constraint_type == "FOREIGN KEY" - else None, + referenced_project_id=( + constraint.referenced_catalog + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_dataset=( + constraint.referenced_schema + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_table_name=( + constraint.referenced_table + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_column_name=( + constraint.referenced_column + if constraint.constraint_type == "FOREIGN KEY" + else None + ), ) ) self.report.num_get_table_constraints_for_dataset_api_requests += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 6f3008ccfd..788016103d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -498,7 +498,10 @@ class BigQuerySchemaGenerator: report=self.report, rate_limiter=rate_limiter, ) - if self.config.include_table_constraints: + if ( + self.config.include_table_constraints + and bigquery_dataset.supports_table_constraints() + ): constraints = self.schema_api.get_table_constraints_for_dataset( project_id=project_id, dataset_name=dataset_name, report=self.report ) @@ -1157,9 +1160,11 @@ class BigQuerySchemaGenerator: # fields=[], fields=self.gen_schema_fields( columns, - table.constraints - if (isinstance(table, BigqueryTable) and table.constraints) - else [], + ( + table.constraints + if (isinstance(table, BigqueryTable) and table.constraints) + else [] + ), ), foreignKeys=foreign_keys if foreign_keys else None, ) @@ -1180,13 +1185,9 @@ class BigQuerySchemaGenerator: ) -> Iterable[BigqueryTable]: # In bigquery there is no way to query all tables in a Project id with PerfTimer() as timer: - # PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables - # based on Amazon S3 and Blob Storage data. - # https://cloud.google.com/bigquery/docs/omni-introduction#limitations - # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations - with_partitions = self.config.have_table_data_read_permission and not ( - dataset.location - and dataset.location.lower().startswith(("aws-", "azure-")) + with_partitions = ( + self.config.have_table_data_read_permission + and dataset.supports_table_partitions() ) # Partitions view throw exception if we try to query partition info for too many tables