fix(ingest/bigquery): ignore include constraints for biglake datasets (#11874)

Authored by Mayuri Nehate on 2024-11-25 10:31:17 +05:30; committed by GitHub
parent 766d36d164
commit dd892dfbb1
2 changed files with 47 additions and 23 deletions


@@ -152,6 +152,21 @@ class BigqueryDataset:
     snapshots: List[BigqueryTableSnapshot] = field(default_factory=list)
     columns: List[BigqueryColumn] = field(default_factory=list)
 
+    # Some INFORMATION_SCHEMA views are not available for BigLake tables
+    # based on Amazon S3 and Blob Storage data.
+    # https://cloud.google.com/bigquery/docs/omni-introduction#limitations
+    # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
+    def is_biglake_dataset(self) -> bool:
+        return self.location is not None and self.location.lower().startswith(
+            ("aws-", "azure-")
+        )
+
+    def supports_table_constraints(self) -> bool:
+        return not self.is_biglake_dataset()
+
+    def supports_table_partitions(self) -> bool:
+        return not self.is_biglake_dataset()
+
 
 @dataclass
 class BigqueryProject:
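
The new helpers classify a dataset as BigLake-backed purely from its location prefix: BigQuery Omni regions are named "aws-*" and "azure-*". A standalone sketch of the same check, with illustrative sample locations that are not taken from the diff:

    from typing import Optional

    def is_biglake_location(location: Optional[str]) -> bool:
        # Mirrors BigqueryDataset.is_biglake_dataset(): BigQuery Omni
        # (BigLake over Amazon S3 / Azure Blob Storage) locations are
        # prefixed with "aws-" or "azure-".
        return location is not None and location.lower().startswith(("aws-", "azure-"))

    assert is_biglake_location("aws-us-east-1")       # BigLake on Amazon S3
    assert is_biglake_location("AZURE-EASTUS2")       # case-insensitive match
    assert not is_biglake_location("US")              # ordinary BigQuery region
    assert not is_biglake_location(None)              # location not yet known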
@@ -541,18 +556,26 @@ class BigQuerySchemaApi:
                         table_name=constraint.table_name,
                         type=constraint.constraint_type,
                         field_path=constraint.column_name,
-                        referenced_project_id=constraint.referenced_catalog
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_dataset=constraint.referenced_schema
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_table_name=constraint.referenced_table
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_column_name=constraint.referenced_column
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
+                        referenced_project_id=(
+                            constraint.referenced_catalog
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_dataset=(
+                            constraint.referenced_schema
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_table_name=(
+                            constraint.referenced_table
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_column_name=(
+                            constraint.referenced_column
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
                     )
                 )
             self.report.num_get_table_constraints_for_dataset_api_requests += 1
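
The reformatted keyword arguments above all encode the same rule: INFORMATION_SCHEMA constraint rows only carry meaningful referenced_catalog/referenced_schema/referenced_table/referenced_column values for FOREIGN KEY constraints, so every other constraint type maps those fields to None. A condensed, hypothetical sketch of that pattern:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ConstraintRow:
        # Hypothetical stand-in for one constraint row from INFORMATION_SCHEMA.
        constraint_type: str
        referenced_table: Optional[str] = None

    def referenced_table_name(row: ConstraintRow) -> Optional[str]:
        # Only FOREIGN KEY rows reference another table; anything else
        # (PRIMARY KEY, etc.) gets None, as in the diff above.
        return row.referenced_table if row.constraint_type == "FOREIGN KEY" else None

    assert referenced_table_name(ConstraintRow("FOREIGN KEY", "orders")) == "orders"
    assert referenced_table_name(ConstraintRow("PRIMARY KEY")) is None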


@@ -498,7 +498,10 @@ class BigQuerySchemaGenerator:
                 report=self.report,
                 rate_limiter=rate_limiter,
             )
-        if self.config.include_table_constraints:
+        if (
+            self.config.include_table_constraints
+            and bigquery_dataset.supports_table_constraints()
+        ):
             constraints = self.schema_api.get_table_constraints_for_dataset(
                 project_id=project_id, dataset_name=dataset_name, report=self.report
             )
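
With the extra condition, the constraints API call is now skipped in two independent cases: constraint ingestion is disabled in config, or the dataset is BigLake-backed and the underlying INFORMATION_SCHEMA view does not exist. A minimal sketch of the combined guard, with the config and dataset objects flattened into plain parameters (assumed, not from the diff):

    from typing import Optional

    def should_fetch_constraints(
        include_table_constraints: bool, dataset_location: Optional[str]
    ) -> bool:
        # Same shape as the guard above, reduced to one function.
        is_biglake = dataset_location is not None and dataset_location.lower().startswith(
            ("aws-", "azure-")
        )
        return include_table_constraints and not is_biglake

    assert should_fetch_constraints(True, "US")                  # fetch constraints
    assert not should_fetch_constraints(True, "aws-us-east-1")   # BigLake: view missing
    assert not should_fetch_constraints(False, "US")             # disabled in config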
@@ -1157,9 +1160,11 @@ class BigQuerySchemaGenerator:
             # fields=[],
             fields=self.gen_schema_fields(
                 columns,
-                table.constraints
-                if (isinstance(table, BigqueryTable) and table.constraints)
-                else [],
+                (
+                    table.constraints
+                    if (isinstance(table, BigqueryTable) and table.constraints)
+                    else []
+                ),
             ),
             foreignKeys=foreign_keys if foreign_keys else None,
         )
@@ -1180,13 +1185,9 @@ class BigQuerySchemaGenerator:
     ) -> Iterable[BigqueryTable]:
         # In bigquery there is no way to query all tables in a Project id
         with PerfTimer() as timer:
-            # PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables
-            # based on Amazon S3 and Blob Storage data.
-            # https://cloud.google.com/bigquery/docs/omni-introduction#limitations
-            # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
-            with_partitions = self.config.have_table_data_read_permission and not (
-                dataset.location
-                and dataset.location.lower().startswith(("aws-", "azure-"))
-            )
+            with_partitions = (
+                self.config.have_table_data_read_permission
+                and dataset.supports_table_partitions()
+            )
             # Partitions view throw exception if we try to query partition info for too many tables
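
The inline location check that this hunk removes is behaviorally identical to the new supports_table_partitions() helper for every location value, including None and the empty string. A quick standalone check of that equivalence (hypothetical function names):

    def old_allows_partitions(location):
        # Inline expression removed by this commit.
        return not (location and location.lower().startswith(("aws-", "azure-")))

    def new_allows_partitions(location):
        # Mirrors supports_table_partitions() via is_biglake_dataset().
        is_biglake = location is not None and location.lower().startswith(("aws-", "azure-"))
        return not is_biglake

    for loc in (None, "", "US", "us-east1", "aws-us-east-1", "AZURE-EASTUS2"):
        assert old_allows_partitions(loc) == new_allows_partitions(loc)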