fix(ingest/bigquery): ignore include constraints for biglake datasets (#11874)

commit dd892dfbb1
parent 766d36d164
Author: Mayuri Nehate (committed by GitHub)
Date:   2024-11-25 10:31:17 +05:30
2 changed files with 47 additions and 23 deletions

View File

@@ -152,6 +152,21 @@ class BigqueryDataset:
     snapshots: List[BigqueryTableSnapshot] = field(default_factory=list)
     columns: List[BigqueryColumn] = field(default_factory=list)
+
+    # Some INFORMATION_SCHEMA views are not available for BigLake tables
+    # based on Amazon S3 and Blob Storage data.
+    # https://cloud.google.com/bigquery/docs/omni-introduction#limitations
+    # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
+    def is_biglake_dataset(self) -> bool:
+        return self.location is not None and self.location.lower().startswith(
+            ("aws-", "azure-")
+        )
+
+    def supports_table_constraints(self) -> bool:
+        return not self.is_biglake_dataset()
+
+    def supports_table_partitions(self) -> bool:
+        return not self.is_biglake_dataset()
 
 
 @dataclass
 class BigqueryProject:
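
Reviewer note: a minimal, self-contained sketch of the new location check, using a trimmed-down stand-in for `BigqueryDataset` (the `_Dataset` name and the sample locations are illustrative; the `aws-`/`azure-` prefixes come from the Omni locations doc linked above):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Dataset:
    # Illustrative stand-in for BigqueryDataset; only the field the
    # check needs is reproduced here.
    location: Optional[str] = None

    def is_biglake_dataset(self) -> bool:
        # BigQuery Omni (BigLake over S3 / Azure Blob Storage) locations
        # are prefixed with "aws-" or "azure-", e.g. "aws-us-east-1".
        return self.location is not None and self.location.lower().startswith(
            ("aws-", "azure-")
        )


assert _Dataset("aws-us-east-1").is_biglake_dataset()
assert _Dataset("AZURE-eastus2").is_biglake_dataset()    # check is case-insensitive
assert not _Dataset("us-central1").is_biglake_dataset()  # regular BigQuery region
assert not _Dataset().is_biglake_dataset()               # location unset
```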
@@ -541,18 +556,26 @@ class BigQuerySchemaApi:
                         table_name=constraint.table_name,
                         type=constraint.constraint_type,
                         field_path=constraint.column_name,
-                        referenced_project_id=constraint.referenced_catalog
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_dataset=constraint.referenced_schema
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_table_name=constraint.referenced_table
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_column_name=constraint.referenced_column
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
+                        referenced_project_id=(
+                            constraint.referenced_catalog
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_dataset=(
+                            constraint.referenced_schema
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_table_name=(
+                            constraint.referenced_table
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_column_name=(
+                            constraint.referenced_column
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
                     )
                 )
         self.report.num_get_table_constraints_for_dataset_api_requests += 1
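
The parenthesized conditionals above are a pure formatting change (black style); behavior is unchanged. For clarity, a small sketch of the pattern with hypothetical rows: only `FOREIGN KEY` constraints carry `referenced_*` values, so every other constraint type maps to `None`:

```python
# Hypothetical rows mimicking the constraint records used above.
rows = [
    {"constraint_type": "PRIMARY KEY", "referenced_table": None},
    {"constraint_type": "FOREIGN KEY", "referenced_table": "orders"},
]

for row in rows:
    referenced_table_name = (
        row["referenced_table"]
        if row["constraint_type"] == "FOREIGN KEY"
        else None
    )
    print(row["constraint_type"], "->", referenced_table_name)
# PRIMARY KEY -> None
# FOREIGN KEY -> orders
```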

View File

@@ -498,7 +498,10 @@ class BigQuerySchemaGenerator:
                 report=self.report,
                 rate_limiter=rate_limiter,
             )
-        if self.config.include_table_constraints:
+        if (
+            self.config.include_table_constraints
+            and bigquery_dataset.supports_table_constraints()
+        ):
             constraints = self.schema_api.get_table_constraints_for_dataset(
                 project_id=project_id, dataset_name=dataset_name, report=self.report
             )
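
A sketch of the new gate, under the assumption (consistent with the first hunk) that `supports_table_constraints()` returns `False` exactly for BigLake datasets; the class names here are illustrative:

```python
class _Config:
    include_table_constraints = True


class _Dataset:
    def __init__(self, is_biglake: bool) -> None:
        self._is_biglake = is_biglake

    def supports_table_constraints(self) -> bool:
        # BigLake/Omni datasets do not expose the TABLE_CONSTRAINTS view.
        return not self._is_biglake


for dataset in (_Dataset(is_biglake=False), _Dataset(is_biglake=True)):
    if _Config.include_table_constraints and dataset.supports_table_constraints():
        print("fetch constraints")  # regular dataset
    else:
        print("skip constraints")   # BigLake dataset, even though config asks
```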
@@ -1157,9 +1160,11 @@
                 # fields=[],
                 fields=self.gen_schema_fields(
                     columns,
-                    table.constraints
-                    if (isinstance(table, BigqueryTable) and table.constraints)
-                    else [],
+                    (
+                        table.constraints
+                        if (isinstance(table, BigqueryTable) and table.constraints)
+                        else []
+                    ),
                 ),
                 foreignKeys=foreign_keys if foreign_keys else None,
             )
@@ -1180,13 +1185,9 @@
     ) -> Iterable[BigqueryTable]:
         # In bigquery there is no way to query all tables in a Project id
         with PerfTimer() as timer:
-            # PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables
-            # based on Amazon S3 and Blob Storage data.
-            # https://cloud.google.com/bigquery/docs/omni-introduction#limitations
-            # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations
-            with_partitions = self.config.have_table_data_read_permission and not (
-                dataset.location
-                and dataset.location.lower().startswith(("aws-", "azure-"))
-            )
+            with_partitions = (
+                self.config.have_table_data_read_permission
+                and dataset.supports_table_partitions()
+            )
             # Partitions view throw exception if we try to query partition info for too many tables
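
Same pattern for partitions: the flag now reads as "we have data-read permission AND the dataset exposes the PARTITIONS view". A quick truth-table sketch (the function name is illustrative):

```python
def with_partitions(have_read_permission: bool, supports_partitions: bool) -> bool:
    # Mirrors the expression above: partition metadata is only queried
    # when both conditions hold.
    return have_read_permission and supports_partitions


assert with_partitions(True, True) is True    # regular dataset, permitted
assert with_partitions(True, False) is False  # BigLake dataset: skip PARTITIONS view
assert with_partitions(False, True) is False  # no data-read permission
```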