mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-31 18:59:23 +00:00
feat(ingest/bigquery): Support for View Labels (#10648)
Co-authored-by: Ethan Cartwright <ethan.cartwright@acryl.io> Co-authored-by: Andrew Sikowitz <andrew.sikowitz@acryl.io>
This commit is contained in:
parent
933d2493d7
commit
c58be155f3
@ -1064,11 +1064,19 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
||||
project_id: str,
|
||||
dataset_name: str,
|
||||
) -> Iterable[MetadataWorkUnit]:
|
||||
tags_to_add = None
|
||||
if table.labels and self.config.capture_view_label_as_tag:
|
||||
tags_to_add = [
|
||||
make_tag_urn(f"{k}:{v}")
|
||||
for k, v in table.labels.items()
|
||||
if is_tag_allowed(self.config.capture_view_label_as_tag, k)
|
||||
]
|
||||
yield from self.gen_dataset_workunits(
|
||||
table=table,
|
||||
columns=columns,
|
||||
project_id=project_id,
|
||||
dataset_name=dataset_name,
|
||||
tags_to_add=tags_to_add,
|
||||
sub_types=[DatasetSubTypes.VIEW],
|
||||
)
|
||||
|
||||
|
||||
@ -127,6 +127,11 @@ class BigQueryV2Config(
|
||||
description="Capture BigQuery table labels as DataHub tag",
|
||||
)
|
||||
|
||||
capture_view_label_as_tag: Union[bool, AllowDenyPattern] = Field(
|
||||
default=False,
|
||||
description="Capture BigQuery view labels as DataHub tag",
|
||||
)
|
||||
|
||||
capture_dataset_label_as_tag: Union[bool, AllowDenyPattern] = Field(
|
||||
default=False,
|
||||
description="Capture BigQuery dataset labels as DataHub tag",
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from typing import Optional
|
||||
import re
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
def unquote_and_decode_unicode_escape_seq(
|
||||
@ -17,3 +18,10 @@ def unquote_and_decode_unicode_escape_seq(
|
||||
cleaned_string = string.encode().decode("unicode-escape")
|
||||
|
||||
return cleaned_string
|
||||
|
||||
|
||||
def parse_labels(labels_str: str) -> Dict[str, str]:
|
||||
pattern = r'STRUCT\("([^"]+)", "([^"]+)"\)'
|
||||
|
||||
# Map of BigQuery label keys to label values
|
||||
return dict(re.findall(pattern, labels_str))
|
||||
|
||||
@ -14,6 +14,7 @@ from google.cloud.bigquery.table import (
|
||||
)
|
||||
|
||||
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
|
||||
from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels
|
||||
from datahub.ingestion.source.bigquery_v2.bigquery_report import (
|
||||
BigQuerySchemaApiPerfReport,
|
||||
BigQueryV2Report,
|
||||
@ -54,9 +55,7 @@ class PartitionInfo:
|
||||
cls, time_partitioning: TimePartitioning
|
||||
) -> "PartitionInfo":
|
||||
return cls(
|
||||
field=time_partitioning.field
|
||||
if time_partitioning.field
|
||||
else "_PARTITIONTIME",
|
||||
field=time_partitioning.field or "_PARTITIONTIME",
|
||||
type=time_partitioning.type_,
|
||||
expiration_ms=time_partitioning.expiration_ms,
|
||||
require_partition_filter=time_partitioning.require_partition_filter,
|
||||
@ -107,6 +106,7 @@ class BigqueryTable(BaseTable):
|
||||
class BigqueryView(BaseView):
|
||||
columns: List[BigqueryColumn] = field(default_factory=list)
|
||||
materialized: bool = False
|
||||
labels: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -245,9 +245,11 @@ class BigQuerySchemaApi:
|
||||
BigqueryQuery.tables_for_dataset.format(
|
||||
project_id=project_id,
|
||||
dataset_name=dataset_name,
|
||||
table_filter=f" and t.table_name in ({filter_clause})"
|
||||
if filter_clause
|
||||
else "",
|
||||
table_filter=(
|
||||
f" and t.table_name in ({filter_clause})"
|
||||
if filter_clause
|
||||
else ""
|
||||
),
|
||||
),
|
||||
)
|
||||
else:
|
||||
@ -257,9 +259,11 @@ class BigQuerySchemaApi:
|
||||
BigqueryQuery.tables_for_dataset_without_partition_data.format(
|
||||
project_id=project_id,
|
||||
dataset_name=dataset_name,
|
||||
table_filter=f" and t.table_name in ({filter_clause})"
|
||||
if filter_clause
|
||||
else "",
|
||||
table_filter=(
|
||||
f" and t.table_name in ({filter_clause})"
|
||||
if filter_clause
|
||||
else ""
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
@ -297,20 +301,22 @@ class BigQuerySchemaApi:
|
||||
return BigqueryTable(
|
||||
name=table.table_name,
|
||||
created=table.created,
|
||||
last_altered=datetime.fromtimestamp(
|
||||
table.get("last_altered") / 1000, tz=timezone.utc
|
||||
)
|
||||
if table.get("last_altered") is not None
|
||||
else None,
|
||||
last_altered=(
|
||||
datetime.fromtimestamp(
|
||||
table.get("last_altered") / 1000, tz=timezone.utc
|
||||
)
|
||||
if table.get("last_altered") is not None
|
||||
else None
|
||||
),
|
||||
size_in_bytes=table.get("bytes"),
|
||||
rows_count=table.get("row_count"),
|
||||
comment=table.comment,
|
||||
ddl=table.ddl,
|
||||
expires=expiration,
|
||||
labels=table_basic.labels if table_basic else None,
|
||||
partition_info=PartitionInfo.from_table_info(table_basic)
|
||||
if table_basic
|
||||
else None,
|
||||
partition_info=(
|
||||
PartitionInfo.from_table_info(table_basic) if table_basic else None
|
||||
),
|
||||
clustering_fields=table_basic.clustering_fields if table_basic else None,
|
||||
max_partition_id=table.get("max_partition_id"),
|
||||
max_shard_id=shard,
|
||||
@ -361,16 +367,17 @@ class BigQuerySchemaApi:
|
||||
return BigqueryView(
|
||||
name=view.table_name,
|
||||
created=view.created,
|
||||
last_altered=datetime.fromtimestamp(
|
||||
view.get("last_altered") / 1000, tz=timezone.utc
|
||||
)
|
||||
if view.get("last_altered") is not None
|
||||
else None,
|
||||
last_altered=(
|
||||
datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
|
||||
if view.get("last_altered") is not None
|
||||
else None
|
||||
),
|
||||
comment=view.comment,
|
||||
view_definition=view.view_definition,
|
||||
materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
|
||||
size_in_bytes=view.get("size_bytes"),
|
||||
rows_count=view.get("row_count"),
|
||||
labels=parse_labels(view.labels) if hasattr(view, "labels") else None,
|
||||
)
|
||||
|
||||
def get_policy_tags_for_column(
|
||||
@ -441,14 +448,16 @@ class BigQuerySchemaApi:
|
||||
with self.report.get_columns_for_dataset:
|
||||
try:
|
||||
cur = self.get_query_result(
|
||||
BigqueryQuery.columns_for_dataset.format(
|
||||
project_id=project_id, dataset_name=dataset_name
|
||||
)
|
||||
if not run_optimized_column_query
|
||||
else BigqueryQuery.optimized_columns_for_dataset.format(
|
||||
project_id=project_id,
|
||||
dataset_name=dataset_name,
|
||||
column_limit=column_limit,
|
||||
(
|
||||
BigqueryQuery.columns_for_dataset.format(
|
||||
project_id=project_id, dataset_name=dataset_name
|
||||
)
|
||||
if not run_optimized_column_query
|
||||
else BigqueryQuery.optimized_columns_for_dataset.format(
|
||||
project_id=project_id,
|
||||
dataset_name=dataset_name,
|
||||
column_limit=column_limit,
|
||||
)
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
@ -578,11 +587,13 @@ class BigQuerySchemaApi:
|
||||
return BigqueryTableSnapshot(
|
||||
name=snapshot.table_name,
|
||||
created=snapshot.created,
|
||||
last_altered=datetime.fromtimestamp(
|
||||
snapshot.get("last_altered") / 1000, tz=timezone.utc
|
||||
)
|
||||
if snapshot.get("last_altered") is not None
|
||||
else None,
|
||||
last_altered=(
|
||||
datetime.fromtimestamp(
|
||||
snapshot.get("last_altered") / 1000, tz=timezone.utc
|
||||
)
|
||||
if snapshot.get("last_altered") is not None
|
||||
else None
|
||||
),
|
||||
comment=snapshot.comment,
|
||||
ddl=snapshot.ddl,
|
||||
snapshot_time=snapshot.snapshot_time,
|
||||
|
||||
@ -117,7 +117,8 @@ SELECT
|
||||
t.table_type as table_type,
|
||||
t.creation_time as created,
|
||||
ts.last_modified_time as last_altered,
|
||||
tos.OPTION_VALUE as comment,
|
||||
tos_description.OPTION_VALUE as comment,
|
||||
tos_labels.OPTION_VALUE as labels,
|
||||
t.is_insertable_into,
|
||||
t.ddl as view_definition,
|
||||
ts.row_count,
|
||||
@ -125,9 +126,12 @@ SELECT
|
||||
FROM
|
||||
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
|
||||
join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
|
||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
|
||||
and t.TABLE_NAME = tos.TABLE_NAME
|
||||
and tos.OPTION_NAME = "description"
|
||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_description on t.table_schema = tos_description.table_schema
|
||||
and t.TABLE_NAME = tos_description.TABLE_NAME
|
||||
and tos_description.OPTION_NAME = "description"
|
||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_labels on t.table_schema = tos_labels.table_schema
|
||||
and t.TABLE_NAME = tos_labels.TABLE_NAME
|
||||
and tos_labels.OPTION_NAME = "labels"
|
||||
WHERE
|
||||
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
|
||||
order by
|
||||
@ -142,14 +146,18 @@ SELECT
|
||||
t.table_name as table_name,
|
||||
t.table_type as table_type,
|
||||
t.creation_time as created,
|
||||
tos.OPTION_VALUE as comment,
|
||||
tos_description.OPTION_VALUE as comment,
|
||||
tos_labels.OPTION_VALUE as labels,
|
||||
t.is_insertable_into,
|
||||
t.ddl as view_definition
|
||||
FROM
|
||||
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
|
||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
|
||||
and t.TABLE_NAME = tos.TABLE_NAME
|
||||
and tos.OPTION_NAME = "description"
|
||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_description on t.table_schema = tos_description.table_schema
|
||||
and t.TABLE_NAME = tos_description.TABLE_NAME
|
||||
and tos_description.OPTION_NAME = "description"
|
||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_labels on t.table_schema = tos_labels.table_schema
|
||||
and t.TABLE_NAME = tos_labels.TABLE_NAME
|
||||
and tos_labels.OPTION_NAME = "labels"
|
||||
WHERE
|
||||
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
|
||||
order by
|
||||
|
||||
@ -820,6 +820,7 @@ def bigquery_view_1() -> BigqueryView:
|
||||
comment="comment1",
|
||||
view_definition="CREATE VIEW 1",
|
||||
materialized=False,
|
||||
labels=None,
|
||||
)
|
||||
|
||||
|
||||
@ -833,6 +834,7 @@ def bigquery_view_2() -> BigqueryView:
|
||||
comment="comment2",
|
||||
view_definition="CREATE VIEW 2",
|
||||
materialized=True,
|
||||
labels=None,
|
||||
)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user