mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-09 16:03:31 +00:00
feat(ingest/bigquery): Support for View Labels (#10648)
Co-authored-by: Ethan Cartwright <ethan.cartwright@acryl.io> Co-authored-by: Andrew Sikowitz <andrew.sikowitz@acryl.io>
This commit is contained in:
parent
933d2493d7
commit
c58be155f3
@ -1064,11 +1064,19 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
|||||||
project_id: str,
|
project_id: str,
|
||||||
dataset_name: str,
|
dataset_name: str,
|
||||||
) -> Iterable[MetadataWorkUnit]:
|
) -> Iterable[MetadataWorkUnit]:
|
||||||
|
tags_to_add = None
|
||||||
|
if table.labels and self.config.capture_view_label_as_tag:
|
||||||
|
tags_to_add = [
|
||||||
|
make_tag_urn(f"{k}:{v}")
|
||||||
|
for k, v in table.labels.items()
|
||||||
|
if is_tag_allowed(self.config.capture_view_label_as_tag, k)
|
||||||
|
]
|
||||||
yield from self.gen_dataset_workunits(
|
yield from self.gen_dataset_workunits(
|
||||||
table=table,
|
table=table,
|
||||||
columns=columns,
|
columns=columns,
|
||||||
project_id=project_id,
|
project_id=project_id,
|
||||||
dataset_name=dataset_name,
|
dataset_name=dataset_name,
|
||||||
|
tags_to_add=tags_to_add,
|
||||||
sub_types=[DatasetSubTypes.VIEW],
|
sub_types=[DatasetSubTypes.VIEW],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -127,6 +127,11 @@ class BigQueryV2Config(
|
|||||||
description="Capture BigQuery table labels as DataHub tag",
|
description="Capture BigQuery table labels as DataHub tag",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
capture_view_label_as_tag: Union[bool, AllowDenyPattern] = Field(
|
||||||
|
default=False,
|
||||||
|
description="Capture BigQuery view labels as DataHub tag",
|
||||||
|
)
|
||||||
|
|
||||||
capture_dataset_label_as_tag: Union[bool, AllowDenyPattern] = Field(
|
capture_dataset_label_as_tag: Union[bool, AllowDenyPattern] = Field(
|
||||||
default=False,
|
default=False,
|
||||||
description="Capture BigQuery dataset labels as DataHub tag",
|
description="Capture BigQuery dataset labels as DataHub tag",
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
from typing import Optional
|
import re
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
def unquote_and_decode_unicode_escape_seq(
|
def unquote_and_decode_unicode_escape_seq(
|
||||||
@ -17,3 +18,10 @@ def unquote_and_decode_unicode_escape_seq(
|
|||||||
cleaned_string = string.encode().decode("unicode-escape")
|
cleaned_string = string.encode().decode("unicode-escape")
|
||||||
|
|
||||||
return cleaned_string
|
return cleaned_string
|
||||||
|
|
||||||
|
|
||||||
|
def parse_labels(labels_str: Optional[str]) -> Dict[str, str]:
    """Parse a BigQuery ``labels`` option value into a key -> value mapping.

    The input is the textual ``OPTION_VALUE`` of the ``labels`` table option,
    a list of two-element structs such as
    ``[STRUCT("env", "prod"), STRUCT("team", "data")]``.

    Args:
        labels_str: Raw option value. May be ``None`` or empty when the view
            has no ``labels`` option (the column comes from a LEFT JOIN).

    Returns:
        A dict mapping each label key to its value. Returns an empty dict
        when ``labels_str`` is ``None``/empty or contains no label structs.
    """
    # A missing labels option arrives as None; re.findall would raise
    # TypeError on it, so treat "no option" as "no labels".
    if not labels_str:
        return {}

    # Keys must be non-empty, but a label value may legitimately be empty,
    # hence `*` (not `+`) for the value group. Whitespace after the comma is
    # matched loosely so formatting differences do not drop labels.
    pattern = r'STRUCT\("([^"]+)",\s*"([^"]*)"\)'

    # Map of BigQuery label keys to label values
    return dict(re.findall(pattern, labels_str))
|
||||||
|
|||||||
@ -14,6 +14,7 @@ from google.cloud.bigquery.table import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
|
from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
|
||||||
|
from datahub.ingestion.source.bigquery_v2.bigquery_helper import parse_labels
|
||||||
from datahub.ingestion.source.bigquery_v2.bigquery_report import (
|
from datahub.ingestion.source.bigquery_v2.bigquery_report import (
|
||||||
BigQuerySchemaApiPerfReport,
|
BigQuerySchemaApiPerfReport,
|
||||||
BigQueryV2Report,
|
BigQueryV2Report,
|
||||||
@ -54,9 +55,7 @@ class PartitionInfo:
|
|||||||
cls, time_partitioning: TimePartitioning
|
cls, time_partitioning: TimePartitioning
|
||||||
) -> "PartitionInfo":
|
) -> "PartitionInfo":
|
||||||
return cls(
|
return cls(
|
||||||
field=time_partitioning.field
|
field=time_partitioning.field or "_PARTITIONTIME",
|
||||||
if time_partitioning.field
|
|
||||||
else "_PARTITIONTIME",
|
|
||||||
type=time_partitioning.type_,
|
type=time_partitioning.type_,
|
||||||
expiration_ms=time_partitioning.expiration_ms,
|
expiration_ms=time_partitioning.expiration_ms,
|
||||||
require_partition_filter=time_partitioning.require_partition_filter,
|
require_partition_filter=time_partitioning.require_partition_filter,
|
||||||
@ -107,6 +106,7 @@ class BigqueryTable(BaseTable):
|
|||||||
class BigqueryView(BaseView):
|
class BigqueryView(BaseView):
|
||||||
columns: List[BigqueryColumn] = field(default_factory=list)
|
columns: List[BigqueryColumn] = field(default_factory=list)
|
||||||
materialized: bool = False
|
materialized: bool = False
|
||||||
|
labels: Optional[Dict[str, str]] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -245,9 +245,11 @@ class BigQuerySchemaApi:
|
|||||||
BigqueryQuery.tables_for_dataset.format(
|
BigqueryQuery.tables_for_dataset.format(
|
||||||
project_id=project_id,
|
project_id=project_id,
|
||||||
dataset_name=dataset_name,
|
dataset_name=dataset_name,
|
||||||
table_filter=f" and t.table_name in ({filter_clause})"
|
table_filter=(
|
||||||
|
f" and t.table_name in ({filter_clause})"
|
||||||
if filter_clause
|
if filter_clause
|
||||||
else "",
|
else ""
|
||||||
|
),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@ -257,9 +259,11 @@ class BigQuerySchemaApi:
|
|||||||
BigqueryQuery.tables_for_dataset_without_partition_data.format(
|
BigqueryQuery.tables_for_dataset_without_partition_data.format(
|
||||||
project_id=project_id,
|
project_id=project_id,
|
||||||
dataset_name=dataset_name,
|
dataset_name=dataset_name,
|
||||||
table_filter=f" and t.table_name in ({filter_clause})"
|
table_filter=(
|
||||||
|
f" and t.table_name in ({filter_clause})"
|
||||||
if filter_clause
|
if filter_clause
|
||||||
else "",
|
else ""
|
||||||
|
),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -297,20 +301,22 @@ class BigQuerySchemaApi:
|
|||||||
return BigqueryTable(
|
return BigqueryTable(
|
||||||
name=table.table_name,
|
name=table.table_name,
|
||||||
created=table.created,
|
created=table.created,
|
||||||
last_altered=datetime.fromtimestamp(
|
last_altered=(
|
||||||
|
datetime.fromtimestamp(
|
||||||
table.get("last_altered") / 1000, tz=timezone.utc
|
table.get("last_altered") / 1000, tz=timezone.utc
|
||||||
)
|
)
|
||||||
if table.get("last_altered") is not None
|
if table.get("last_altered") is not None
|
||||||
else None,
|
else None
|
||||||
|
),
|
||||||
size_in_bytes=table.get("bytes"),
|
size_in_bytes=table.get("bytes"),
|
||||||
rows_count=table.get("row_count"),
|
rows_count=table.get("row_count"),
|
||||||
comment=table.comment,
|
comment=table.comment,
|
||||||
ddl=table.ddl,
|
ddl=table.ddl,
|
||||||
expires=expiration,
|
expires=expiration,
|
||||||
labels=table_basic.labels if table_basic else None,
|
labels=table_basic.labels if table_basic else None,
|
||||||
partition_info=PartitionInfo.from_table_info(table_basic)
|
partition_info=(
|
||||||
if table_basic
|
PartitionInfo.from_table_info(table_basic) if table_basic else None
|
||||||
else None,
|
),
|
||||||
clustering_fields=table_basic.clustering_fields if table_basic else None,
|
clustering_fields=table_basic.clustering_fields if table_basic else None,
|
||||||
max_partition_id=table.get("max_partition_id"),
|
max_partition_id=table.get("max_partition_id"),
|
||||||
max_shard_id=shard,
|
max_shard_id=shard,
|
||||||
@ -361,16 +367,17 @@ class BigQuerySchemaApi:
|
|||||||
return BigqueryView(
|
return BigqueryView(
|
||||||
name=view.table_name,
|
name=view.table_name,
|
||||||
created=view.created,
|
created=view.created,
|
||||||
last_altered=datetime.fromtimestamp(
|
last_altered=(
|
||||||
view.get("last_altered") / 1000, tz=timezone.utc
|
datetime.fromtimestamp(view.get("last_altered") / 1000, tz=timezone.utc)
|
||||||
)
|
|
||||||
if view.get("last_altered") is not None
|
if view.get("last_altered") is not None
|
||||||
else None,
|
else None
|
||||||
|
),
|
||||||
comment=view.comment,
|
comment=view.comment,
|
||||||
view_definition=view.view_definition,
|
view_definition=view.view_definition,
|
||||||
materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
|
materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW,
|
||||||
size_in_bytes=view.get("size_bytes"),
|
size_in_bytes=view.get("size_bytes"),
|
||||||
rows_count=view.get("row_count"),
|
rows_count=view.get("row_count"),
|
||||||
|
labels=parse_labels(view.labels) if hasattr(view, "labels") else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_policy_tags_for_column(
|
def get_policy_tags_for_column(
|
||||||
@ -441,6 +448,7 @@ class BigQuerySchemaApi:
|
|||||||
with self.report.get_columns_for_dataset:
|
with self.report.get_columns_for_dataset:
|
||||||
try:
|
try:
|
||||||
cur = self.get_query_result(
|
cur = self.get_query_result(
|
||||||
|
(
|
||||||
BigqueryQuery.columns_for_dataset.format(
|
BigqueryQuery.columns_for_dataset.format(
|
||||||
project_id=project_id, dataset_name=dataset_name
|
project_id=project_id, dataset_name=dataset_name
|
||||||
)
|
)
|
||||||
@ -449,6 +457,7 @@ class BigQuerySchemaApi:
|
|||||||
project_id=project_id,
|
project_id=project_id,
|
||||||
dataset_name=dataset_name,
|
dataset_name=dataset_name,
|
||||||
column_limit=column_limit,
|
column_limit=column_limit,
|
||||||
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -578,11 +587,13 @@ class BigQuerySchemaApi:
|
|||||||
return BigqueryTableSnapshot(
|
return BigqueryTableSnapshot(
|
||||||
name=snapshot.table_name,
|
name=snapshot.table_name,
|
||||||
created=snapshot.created,
|
created=snapshot.created,
|
||||||
last_altered=datetime.fromtimestamp(
|
last_altered=(
|
||||||
|
datetime.fromtimestamp(
|
||||||
snapshot.get("last_altered") / 1000, tz=timezone.utc
|
snapshot.get("last_altered") / 1000, tz=timezone.utc
|
||||||
)
|
)
|
||||||
if snapshot.get("last_altered") is not None
|
if snapshot.get("last_altered") is not None
|
||||||
else None,
|
else None
|
||||||
|
),
|
||||||
comment=snapshot.comment,
|
comment=snapshot.comment,
|
||||||
ddl=snapshot.ddl,
|
ddl=snapshot.ddl,
|
||||||
snapshot_time=snapshot.snapshot_time,
|
snapshot_time=snapshot.snapshot_time,
|
||||||
|
|||||||
@ -117,7 +117,8 @@ SELECT
|
|||||||
t.table_type as table_type,
|
t.table_type as table_type,
|
||||||
t.creation_time as created,
|
t.creation_time as created,
|
||||||
ts.last_modified_time as last_altered,
|
ts.last_modified_time as last_altered,
|
||||||
tos.OPTION_VALUE as comment,
|
tos_description.OPTION_VALUE as comment,
|
||||||
|
tos_labels.OPTION_VALUE as labels,
|
||||||
t.is_insertable_into,
|
t.is_insertable_into,
|
||||||
t.ddl as view_definition,
|
t.ddl as view_definition,
|
||||||
ts.row_count,
|
ts.row_count,
|
||||||
@ -125,9 +126,12 @@ SELECT
|
|||||||
FROM
|
FROM
|
||||||
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
|
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
|
||||||
join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
|
join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME
|
||||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
|
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_description on t.table_schema = tos_description.table_schema
|
||||||
and t.TABLE_NAME = tos.TABLE_NAME
|
and t.TABLE_NAME = tos_description.TABLE_NAME
|
||||||
and tos.OPTION_NAME = "description"
|
and tos_description.OPTION_NAME = "description"
|
||||||
|
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_labels on t.table_schema = tos_labels.table_schema
|
||||||
|
and t.TABLE_NAME = tos_labels.TABLE_NAME
|
||||||
|
and tos_labels.OPTION_NAME = "labels"
|
||||||
WHERE
|
WHERE
|
||||||
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
|
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
|
||||||
order by
|
order by
|
||||||
@ -142,14 +146,18 @@ SELECT
|
|||||||
t.table_name as table_name,
|
t.table_name as table_name,
|
||||||
t.table_type as table_type,
|
t.table_type as table_type,
|
||||||
t.creation_time as created,
|
t.creation_time as created,
|
||||||
tos.OPTION_VALUE as comment,
|
tos_description.OPTION_VALUE as comment,
|
||||||
|
tos_labels.OPTION_VALUE as labels,
|
||||||
t.is_insertable_into,
|
t.is_insertable_into,
|
||||||
t.ddl as view_definition
|
t.ddl as view_definition
|
||||||
FROM
|
FROM
|
||||||
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
|
`{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t
|
||||||
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema
|
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_description on t.table_schema = tos_description.table_schema
|
||||||
and t.TABLE_NAME = tos.TABLE_NAME
|
and t.TABLE_NAME = tos_description.TABLE_NAME
|
||||||
and tos.OPTION_NAME = "description"
|
and tos_description.OPTION_NAME = "description"
|
||||||
|
left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos_labels on t.table_schema = tos_labels.table_schema
|
||||||
|
and t.TABLE_NAME = tos_labels.TABLE_NAME
|
||||||
|
and tos_labels.OPTION_NAME = "labels"
|
||||||
WHERE
|
WHERE
|
||||||
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
|
table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}')
|
||||||
order by
|
order by
|
||||||
|
|||||||
@ -820,6 +820,7 @@ def bigquery_view_1() -> BigqueryView:
|
|||||||
comment="comment1",
|
comment="comment1",
|
||||||
view_definition="CREATE VIEW 1",
|
view_definition="CREATE VIEW 1",
|
||||||
materialized=False,
|
materialized=False,
|
||||||
|
labels=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -833,6 +834,7 @@ def bigquery_view_2() -> BigqueryView:
|
|||||||
comment="comment2",
|
comment="comment2",
|
||||||
view_definition="CREATE VIEW 2",
|
view_definition="CREATE VIEW 2",
|
||||||
materialized=True,
|
materialized=True,
|
||||||
|
labels=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user