feat(ingest): bigquery - ability to disable partition profiling (#4228)

This commit is contained in:
Tamas Nemeth 2022-03-02 07:29:48 +01:00 committed by GitHub
parent d52638a252
commit 2a5cf3dd07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 8 deletions

View File

@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
Note: Since the bigquery source also supports dataset-level lineage, the auth client will require additional permissions to be able to access the Google audit logs. Refer to the permissions section in the bigquery-usage section below, which also accesses the audit logs.
## Profiling
For profiling you have to set a table schema where Great Expectations (the profiling framework we use) can create temporary
views, by setting the `profiling.bigquery_temp_table_schema` property.
Profiling supports normal, partitioned, and sharded tables, but for performance reasons we only profile the latest partition for partitioned tables and the latest shard for sharded tables.
If the limit/offset parameter is set, or when profiling partitioned or sharded tables, Great Expectations (the profiling framework we use) needs to create temporary
views. By default these views are created in the same schema as the profiled table, but you can have all of these
tables created in a predefined schema instead by setting the `profiling.bigquery_temp_table_schema` property.
Temporary tables are removed after profiling.
```yaml
profiling:
@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
:::note
Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
You can set the partition explicitly with the `partition.partition_datetime` property if you want (the partition will be applied to all partitioned tables).
:::
# BigQuery Usage Stats

View File

@ -793,11 +793,21 @@ class DatahubGEProfiler:
**kwargs,
}
if self.config.bigquery_temp_table_schema is not None:
bigquery_temp_table = (
f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
)
ge_config["bigquery_temp_table"] = bigquery_temp_table
# We have to create temporary tables if offset or limit or custom sql is set on Bigquery
if custom_sql or self.config.limit or self.config.offset:
if self.config.bigquery_temp_table_schema:
bigquery_temp_table = (
f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
)
ge_config["bigquery_temp_table"] = bigquery_temp_table
else:
assert table
table_parts = table.split(".")
if len(table_parts) == 2:
bigquery_temp_table = (
f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
)
ge_config["bigquery_temp_table"] = bigquery_temp_table
if custom_sql is not None:
ge_config["query"] = custom_sql

View File

@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
# Hidden option - used for debugging purposes.
catch_exceptions: bool = True
partition_profiling_enabled: bool = True
bigquery_temp_table_schema: Optional[str] = None
partition_datetime: Optional[datetime.datetime]

View File

@ -1168,6 +1168,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
schema, table, self.config.profiling.partition_datetime
)
if (
partition is not None
and not self.config.profiling.partition_profiling_enabled
):
logger.debug(
f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
)
continue
self.report.report_entity_profiled(dataset_name)
yield GEProfilerRequest(
pretty_name=dataset_name,