From c0addf6eefad58042d78de00aa6f7c86ee11b463 Mon Sep 17 00:00:00 2001
From: Alexander <alexj.nich@hotmail.com>
Date: Thu, 17 Aug 2023 12:44:15 -0400
Subject: [PATCH] feat(ingest/bigquery): add tag to BigQuery clustering columns
 (#8495)

Co-authored-by: Andrew Sikowitz <andrew.sikowitz@acryl.io>
---
 .../ingestion/source/bigquery_v2/bigquery.py  | 26 ++++++++++++-------
 .../source/bigquery_v2/bigquery_schema.py     |  8 +++++-
 .../tests/unit/test_bigquery_profiler.py      |  3 +++
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index d1f39a3ba1..7725d63ce0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -129,6 +129,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 # Handle table snapshots
 # See https://cloud.google.com/bigquery/docs/table-snapshots-intro.
 SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$")
+CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN"
 
 
 # We can't use close as it is not called if the ingestion is not successful
@@ -1151,6 +1152,21 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
                             field.description = col.comment
                             schema_fields[idx] = field
             else:
+                tags = []
+                if col.is_partition_column:
+                    tags.append(
+                        TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY))
+                    )
+
+                if col.cluster_column_position is not None:
+                    tags.append(
+                        TagAssociationClass(
+                            make_tag_urn(
+                                f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}"
+                            )
+                        )
+                    )
+
                 field = SchemaField(
                     fieldPath=col.name,
                     type=SchemaFieldDataType(
@@ -1160,15 +1176,7 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
                     nativeDataType=col.data_type,
                     description=col.comment,
                     nullable=col.is_nullable,
-                    globalTags=GlobalTagsClass(
-                        tags=[
-                            TagAssociationClass(
-                                make_tag_urn(Constants.TAG_PARTITION_KEY)
-                            )
-                        ]
-                    )
-                    if col.is_partition_column
-                    else GlobalTagsClass(tags=[]),
+                    globalTags=GlobalTagsClass(tags=tags),
                 )
                 schema_fields.append(field)
             last_id = col.ordinal_position
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
index 2450dbd0e2..f8256f8e6f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -33,6 +33,7 @@ class BigqueryTableType:
 class BigqueryColumn(BaseColumn):
     field_path: str
     is_partition_column: bool
+    cluster_column_position: Optional[int]
 
 
 RANGE_PARTITION_NAME: str = "RANGE"
@@ -285,7 +286,8 @@ select
   CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
   description as comment,
   c.is_hidden as is_hidden,
-  c.is_partitioning_column as is_partitioning_column
+  c.is_partitioning_column as is_partitioning_column,
+  c.clustering_ordinal_position as clustering_ordinal_position,
 from
   `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c
   join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name
@@ -307,6 +309,7 @@ select * from
   description as comment,
   c.is_hidden as is_hidden,
   c.is_partitioning_column as is_partitioning_column,
+  c.clustering_ordinal_position as clustering_ordinal_position,
   -- We count the columns to be able limit it later
   row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num,
   -- Getting the maximum shard for each table
@@ -333,6 +336,7 @@ select
   CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type,
   c.is_hidden as is_hidden,
   c.is_partitioning_column as is_partitioning_column,
+  c.clustering_ordinal_position as clustering_ordinal_position,
   description as comment
 from
   `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c
@@ -583,6 +587,7 @@ class BigQueryDataDictionary:
                         data_type=column.data_type,
                         comment=column.comment,
                         is_partition_column=column.is_partitioning_column == "YES",
+                        cluster_column_position=column.clustering_ordinal_position,
                     )
                 )
 
@@ -621,6 +626,7 @@ class BigQueryDataDictionary:
                         data_type=column.data_type,
                         comment=column.comment,
                         is_partition_column=column.is_partitioning_column == "YES",
+                        cluster_column_position=column.clustering_ordinal_position,
                     )
                 )
             last_seen_table = column.table_name
diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py
index a2aec8df93..44ce5f0a02 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py
@@ -37,6 +37,7 @@ def test_generate_day_partitioned_partition_profiler_query():
         ordinal_position=1,
         data_type="TIMESTAMP",
         is_partition_column=True,
+        cluster_column_position=None,
         comment=None,
         is_nullable=False,
     )
@@ -79,6 +80,7 @@ def test_generate_day_partitioned_partition_profiler_query_with_set_partition_ti
         ordinal_position=1,
         data_type="TIMESTAMP",
         is_partition_column=True,
+        cluster_column_position=None,
         comment=None,
         is_nullable=False,
     )
@@ -120,6 +122,7 @@ def test_generate_hour_partitioned_partition_profiler_query():
         ordinal_position=1,
         data_type="TIMESTAMP",
         is_partition_column=True,
+        cluster_column_position=None,
         comment=None,
         is_nullable=False,
     )