feat(ingest/bigquery): Support for View Labels (#10648)

Co-authored-by: Ethan Cartwright <ethan.cartwright@acryl.io>
Co-authored-by: Andrew Sikowitz <andrew.sikowitz@acryl.io>
This commit is contained in:
ethan-cartwright 2024-06-17 09:06:41 -04:00 committed by GitHub
parent 933d2493d7
commit c58be155f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 86 additions and 44 deletions

View File

@ -1064,11 +1064,19 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
project_id: str, project_id: str,
dataset_name: str, dataset_name: str,
) -> Iterable[MetadataWorkUnit]: ) -> Iterable[MetadataWorkUnit]:
tags_to_add = None
if table.labels and self.config.capture_view_label_as_tag:
tags_to_add = [
make_tag_urn(f"{k}:{v}")
for k, v in table.labels.items()
if is_tag_allowed(self.config.capture_view_label_as_tag, k)
]
yield from self.gen_dataset_workunits( yield from self.gen_dataset_workunits(
table=table, table=table,
columns=columns, columns=columns,
project_id=project_id, project_id=project_id,
dataset_name=dataset_name, dataset_name=dataset_name,
tags_to_add=tags_to_add,
sub_types=[DatasetSubTypes.VIEW], sub_types=[DatasetSubTypes.VIEW],
) )

View File

@ -127,6 +127,11 @@ class BigQueryV2Config(
description="Capture BigQuery table labels as DataHub tag", description="Capture BigQuery table labels as DataHub tag",
) )
capture_view_label_as_tag: Union[bool, AllowDenyPattern] = Field(
default=False,
description="Capture BigQuery view labels as DataHub tag",
)
capture_dataset_label_as_tag: Union[bool, AllowDenyPattern] = Field( capture_dataset_label_as_tag: Union[bool, AllowDenyPattern] = Field(
default=False, default=False,
description="Capture BigQuery dataset labels as DataHub tag", description="Capture BigQuery dataset labels as DataHub tag",

View File

@ -1,4 +1,5 @@
from typing import Optional import re
from typing import Dict, Optional
def unquote_and_decode_unicode_escape_seq( def unquote_and_decode_unicode_escape_seq(
@ -17,3 +18,10 @@ def unquote_and_decode_unicode_escape_seq(
cleaned_string = string.encode().decode("unicode-escape") cleaned_string = string.encode().decode("unicode-escape")
return cleaned_string return cleaned_string
def parse_labels(labels_str: Optional[str]) -> Dict[str, str]:
    """Parse a BigQuery labels option string into a key/value mapping.

    The input is expected to contain zero or more ``STRUCT("key", "value")``
    entries, e.g. ``[STRUCT("env", "prod"), STRUCT("team", "data")]``.

    Args:
        labels_str: Raw labels string, or ``None``/empty when the object has
            no labels (for example when the labels column comes from a LEFT
            JOIN that found no matching row).

    Returns:
        Map of BigQuery label keys to label values. Empty when the input is
        ``None``, empty, or contains no matching entries. Entries whose key
        or value is an empty string are not matched by the pattern and are
        therefore omitted. If a key appears more than once, the last
        occurrence wins.
    """
    # A missing labels option surfaces as None; re.findall would raise
    # TypeError on it, so treat "no labels" as an empty mapping instead.
    if not labels_str:
        return {}

    pattern = r'STRUCT\("([^"]+)", "([^"]+)"\)'
    # Map of BigQuery label keys to label values
    return dict(re.findall(pattern, labels_str))

View File

@ -14,6 +14,7 @@ from google.cloud.bigquery.table import (
) )
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels
from datahub.ingestion.source.bigquery_v2.bigquery_report import ( from datahub.ingestion.source.bigquery_v2.bigquery_report import (
BigQuerySchemaApiPerfReport, BigQuerySchemaApiPerfReport,
BigQueryV2Report, BigQueryV2Report,
@ -54,9 +55,7 @@ class PartitionInfo:
cls, time_partitioning: TimePartitioning cls, time_partitioning: TimePartitioning
) -> "PartitionInfo": ) -> "PartitionInfo":
return cls( return cls(
field=time_partitioning.field field=time_partitioning.field or "_PARTITIONTIME",
if time_partitioning.field
else "_PARTITIONTIME",
type=time_partitioning.type_, type=time_partitioning.type_,
expiration_ms=time_partitioning.expiration_ms, expiration_ms=time_partitioning.expiration_ms,
require_partition_filter=time_partitioning.require_partition_filter, require_partition_filter=time_partitioning.require_partition_filter,
@ -107,6 +106,7 @@ class BigqueryTable(BaseTable):
class BigqueryView(BaseView): class BigqueryView(BaseView):
columns: List[BigqueryColumn] = field(default_factory=list) columns: List[BigqueryColumn] = field(default_factory=list)
materialized: bool = False materialized: bool = False
labels: Optional[Dict[str, str]] = None
@dataclass @dataclass
@ -245,9 +245,11 @@ class BigQuerySchemaApi:
BigqueryQuery.tables_for_dataset.format( BigqueryQuery.tables_for_dataset.format(
project_id=project_id, project_id=project_id,
dataset_name=dataset_name, dataset_name=dataset_name,
table_filter=f" and t.table_name in ({filter_clause})" table_filter=(
f" and t.table_name in ({filter_clause})"
if filter_clause if filter_clause
else "", else ""
),
), ),
) )
else: else:
@ -257,9 +259,11 @@ class BigQuerySchemaApi:
BigqueryQuery.tables_for_dataset_without_partition_data.format( BigqueryQuery.tables_for_dataset_without_partition_data.format(
project_id=project_id, project_id=project_id,
dataset_name=dataset_name, dataset_name=dataset_name,
table_filter=f" and t.table_name in ({filter_clause})" table_filter=(
f" and t.table_name in ({filter_clause})"
if filter_clause if filter_clause
else "", else ""
),
), ),
) )
@ -297,20 +301,22 @@ class BigQuerySchemaApi:
return BigqueryTable( return BigqueryTable(
name=table.table_name, name=table.table_name,
created=table.created, created=table.created,
last_altered=datetime.fromtimestamp( last_altered=(
datetime.fromtimestamp(
table.get("last_altered") / 1000, tz=timezone.utc table.get("last_altered") / 1000, tz=timezone.utc
) )
if table.get("last_altered") is not None if table.get("last_altered") is not None
else None, else None
),
size_in_bytes=table.get("bytes"), size_in_bytes=table.get("bytes"),
rows_count=table.get("row_count"), rows_count=table.get("row_count"),
comment=table.comment, comment=table.comment,
ddl=table.ddl, ddl=table.ddl,
expires=expiration, expires=expiration,
labels=table_basic.labels if table_basic else None, labels=table_basic.labels if table_basic else None,
partition_info=PartitionInfo.from_table_info(table_basic) partition_info=(
if table_basic PartitionInfo.from_table_info(table_basic) if table_basic else None
else None, ),
clustering_fields=table_basic.clustering_fields if table_basic else None, clustering_fields=table_basic.clustering_fields if table_basic else None,
max_partition_id=table.get("max_partition_id"), max_partition_id=table.get("max_partition_id"),
max_shard_id=shard, max_shard_id=shard,
@ -361,16 +367,17 @@ class BigQuerySchemaApi:
return BigqueryView( return BigqueryView(
name=view.table_name, name=view.table_name,
created=view.created, created=view.created,
last_altered=datetime.fromtimestamp( last_altered=(
view.get("last_altered") / 1000, tz=timezone.utc datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
)
if view.get("last_altered") is not None if view.get("last_altered") is not None
else None, else None
),
comment=view.comment, comment=view.comment,
view_definition=view.view_definition, view_definition=view.view_definition,
materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW, materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
size_in_bytes=view.get("size_bytes"), size_in_bytes=view.get("size_bytes"),
rows_count=view.get("row_count"), rows_count=view.get("row_count"),
labels=parse_labels(view.labels) if hasattr(view, "labels") else None,
) )
def get_policy_tags_for_column( def get_policy_tags_for_column(
@ -441,6 +448,7 @@ class BigQuerySchemaApi:
with self.report.get_columns_for_dataset: with self.report.get_columns_for_dataset:
try: try:
cur = self.get_query_result( cur = self.get_query_result(
(
BigqueryQuery.columns_for_dataset.format( BigqueryQuery.columns_for_dataset.format(
project_id=project_id, dataset_name=dataset_name project_id=project_id, dataset_name=dataset_name
) )
@ -449,6 +457,7 @@ class BigQuerySchemaApi:
project_id=project_id, project_id=project_id,
dataset_name=dataset_name, dataset_name=dataset_name,
column_limit=column_limit, column_limit=column_limit,
)
), ),
) )
except Exception as e: except Exception as e:
@ -578,11 +587,13 @@ class BigQuerySchemaApi:
return BigqueryTableSnapshot( return BigqueryTableSnapshot(
name=snapshot.table_name, name=snapshot.table_name,
created=snapshot.created, created=snapshot.created,
last_altered=datetime.fromtimestamp( last_altered=(
datetime.fromtimestamp(
snapshot.get("last_altered") / 1000, tz=timezone.utc snapshot.get("last_altered") / 1000, tz=timezone.utc
) )
if snapshot.get("last_altered") is not None if snapshot.get("last_altered") is not None
else None, else None
),
comment=snapshot.comment, comment=snapshot.comment,
ddl=snapshot.ddl, ddl=snapshot.ddl,
snapshot_time=snapshot.snapshot_time, snapshot_time=snapshot.snapshot_time,

View File

@ -117,7 +117,8 @@ SELECT
t.table_type as table_type, t.table_type as table_type,
t.creation_time as created, t.creation_time as created,
ts.last_modified_time as last_altered, ts.last_modified_time as last_altered,
tos.OPTION_VALUE as comment, tos_description.OPTION_VALUE as comment,
tos_labels.OPTION_VALUE as labels,
t.is_insertable_into, t.is_insertable_into,
t.ddl as view_definition, t.ddl as view_definition,
ts.row_count, ts.row_count,
@ -125,9 +126,12 @@ SELECT
FROM FROM
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_description on t.table_schema = tos_description.table_schema
and t.TABLE_NAME = tos.TABLE_NAME and t.TABLE_NAME = tos_description.TABLE_NAME
and tos.OPTION_NAME = "description" and tos_description.OPTION_NAME = "description"
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_labels on t.table_schema = tos_labels.table_schema
and t.TABLE_NAME = tos_labels.TABLE_NAME
and tos_labels.OPTION_NAME = "labels"
WHERE WHERE
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
order by order by
@ -142,14 +146,18 @@ SELECT
t.table_name as table_name, t.table_name as table_name,
t.table_type as table_type, t.table_type as table_type,
t.creation_time as created, t.creation_time as created,
tos.OPTION_VALUE as comment, tos_description.OPTION_VALUE as comment,
tos_labels.OPTION_VALUE as labels,
t.is_insertable_into, t.is_insertable_into,
t.ddl as view_definition t.ddl as view_definition
FROM FROM
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_description on t.table_schema = tos_description.table_schema
and t.TABLE_NAME = tos.TABLE_NAME and t.TABLE_NAME = tos_description.TABLE_NAME
and tos.OPTION_NAME = "description" and tos_description.OPTION_NAME = "description"
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_labels on t.table_schema = tos_labels.table_schema
and t.TABLE_NAME = tos_labels.TABLE_NAME
and tos_labels.OPTION_NAME = "labels"
WHERE WHERE
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
order by order by

View File

@ -820,6 +820,7 @@ def bigquery_view_1() -> BigqueryView:
comment="comment1", comment="comment1",
view_definition="CREATE VIEW 1", view_definition="CREATE VIEW 1",
materialized=False, materialized=False,
labels=None,
) )
@ -833,6 +834,7 @@ def bigquery_view_2() -> BigqueryView:
comment="comment2", comment="comment2",
view_definition="CREATE VIEW 2", view_definition="CREATE VIEW 2",
materialized=True, materialized=True,
labels=None,
) )