mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-12 10:35:51 +00:00
feat(ingest/bigquery): add tag to BigQuery clustering columns (#8495)
Co-authored-by: Andrew Sikowitz <andrew.sikowitz@acryl.io>
This commit is contained in:
parent
836e2f49ea
commit
c0addf6eef
@ -129,6 +129,7 @@ logger: logging.Logger = logging.getLogger(__name__)
|
||||
# Handle table snapshots
|
||||
# See https://cloud.google.com/bigquery/docs/table-snapshots-intro.
|
||||
SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$")
|
||||
CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN"
|
||||
|
||||
|
||||
# We can't use close as it is not called if the ingestion is not successful
|
||||
@ -1151,6 +1152,21 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
||||
field.description = col.comment
|
||||
schema_fields[idx] = field
|
||||
else:
|
||||
tags = []
|
||||
if col.is_partition_column:
|
||||
tags.append(
|
||||
TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY))
|
||||
)
|
||||
|
||||
if col.cluster_column_position is not None:
|
||||
tags.append(
|
||||
TagAssociationClass(
|
||||
make_tag_urn(
|
||||
f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}"
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
field = SchemaField(
|
||||
fieldPath=col.name,
|
||||
type=SchemaFieldDataType(
|
||||
@ -1160,15 +1176,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
|
||||
nativeDataType=col.data_type,
|
||||
description=col.comment,
|
||||
nullable=col.is_nullable,
|
||||
globalTags=GlobalTagsClass(
|
||||
tags=[
|
||||
TagAssociationClass(
|
||||
make_tag_urn(Constants.TAG_PARTITION_KEY)
|
||||
)
|
||||
]
|
||||
)
|
||||
if col.is_partition_column
|
||||
else GlobalTagsClass(tags=[]),
|
||||
globalTags=GlobalTagsClass(tags=tags),
|
||||
)
|
||||
schema_fields.append(field)
|
||||
last_id = col.ordinal_position
|
||||
|
||||
@ -33,6 +33,7 @@ class BigqueryTableType:
|
||||
class BigqueryColumn(BaseColumn):
|
||||
field_path: str
|
||||
is_partition_column: bool
|
||||
cluster_column_position: Optional[int]
|
||||
|
||||
|
||||
RANGE_PARTITION_NAME: str = "RANGE"
|
||||
@ -285,7 +286,8 @@ select
|
||||
CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
|
||||
description as comment,
|
||||
c.is_hidden as is_hidden,
|
||||
c.is_partitioning_column as is_partitioning_column
|
||||
c.is_partitioning_column as is_partitioning_column,
|
||||
c.clustering_ordinal_position as clustering_ordinal_position,
|
||||
from
|
||||
`{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c
|
||||
join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
|
||||
@ -307,6 +309,7 @@ select * from
|
||||
description as comment,
|
||||
c.is_hidden as is_hidden,
|
||||
c.is_partitioning_column as is_partitioning_column,
|
||||
c.clustering_ordinal_position as clustering_ordinal_position,
|
||||
-- We count the columns to be able to limit them later
|
||||
row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num,
|
||||
-- Getting the maximum shard for each table
|
||||
@ -333,6 +336,7 @@ select
|
||||
CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
|
||||
c.is_hidden as is_hidden,
|
||||
c.is_partitioning_column as is_partitioning_column,
|
||||
c.clustering_ordinal_position as clustering_ordinal_position,
|
||||
description as comment
|
||||
from
|
||||
`{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c
|
||||
@ -583,6 +587,7 @@ class BigQueryDataDictionary:
|
||||
data_type=column.data_type,
|
||||
comment=column.comment,
|
||||
is_partition_column=column.is_partitioning_column == "YES",
|
||||
cluster_column_position=column.clustering_ordinal_position,
|
||||
)
|
||||
)
|
||||
|
||||
@ -621,6 +626,7 @@ class BigQueryDataDictionary:
|
||||
data_type=column.data_type,
|
||||
comment=column.comment,
|
||||
is_partition_column=column.is_partitioning_column == "YES",
|
||||
cluster_column_position=column.clustering_ordinal_position,
|
||||
)
|
||||
)
|
||||
last_seen_table = column.table_name
|
||||
|
||||
@ -37,6 +37,7 @@ def test_generate_day_partitioned_partition_profiler_query():
|
||||
ordinal_position=1,
|
||||
data_type="TIMESTAMP",
|
||||
is_partition_column=True,
|
||||
cluster_column_position=None,
|
||||
comment=None,
|
||||
is_nullable=False,
|
||||
)
|
||||
@ -79,6 +80,7 @@ def test_generate_day_partitioned_partition_profiler_query_with_set_partition_ti
|
||||
ordinal_position=1,
|
||||
data_type="TIMESTAMP",
|
||||
is_partition_column=True,
|
||||
cluster_column_position=None,
|
||||
comment=None,
|
||||
is_nullable=False,
|
||||
)
|
||||
@ -120,6 +122,7 @@ def test_generate_hour_partitioned_partition_profiler_query():
|
||||
ordinal_position=1,
|
||||
data_type="TIMESTAMP",
|
||||
is_partition_column=True,
|
||||
cluster_column_position=None,
|
||||
comment=None,
|
||||
is_nullable=False,
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user