Mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-02 19:58:59 +00:00)
feat(ingest): bigquery - ability to disable partition profiling (#4228)
parent d52638a252
commit 2a5cf3dd07
@@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
 Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer the permissions section in bigquery-usage section below which also accesses the audit logs.
 
 ## Profiling
-For profiling you have to set a table schema where Great Expectation (the profiling framework we use) can create temporary
-views by setting `profiling.bigquery_temp_table_schema` property.
+Profiling can profile normal/partitioned and sharded tables as well but due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
+
+If limit/offset parameter is set or partitioning partitioned or sharded table Great Expectation (the profiling framework we use) needs to create temporary
+views. By default these views are created in the schema where the profiled table is but you can control to create all these
+tables into a predefined schema by setting `profiling.bigquery_temp_table_schema` property.
+Temporary tables are removed after profiling.
 
 ```yaml
 profiling:
@@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
 :::note
 
 Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
-
+You can set partition explicitly with `partition.partition_datetime` property if you want. (partition will be applied to all partitioned tables)
 :::
 
 # BigQuery Usage Stats
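The doc change above describes the profiling knobs only in prose. As a reading aid, here is a minimal sketch of a recipe's `profiling` section expressed as a Python dict (the same keys would sit under `profiling:` in a YAML recipe); the dataset name and the `enabled` key are assumptions for illustration, while `bigquery_temp_table_schema` and the new `partition_profiling_enabled` flag come from this commit.

```python
# Sketch only: how the documented options could be combined in a recipe's profiling section.
profiling_section = {
    "enabled": True,  # assumed top-level profiling switch
    # Create Great Expectations temp views in a dedicated dataset instead of
    # next to the profiled table (value is a hypothetical project.dataset):
    "bigquery_temp_table_schema": "my-project.ge_temp",
    # New in this commit: skip profiling of the latest partition/shard entirely.
    "partition_profiling_enabled": False,
}
```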
@@ -793,11 +793,21 @@ class DatahubGEProfiler:
             **kwargs,
         }
 
-        if self.config.bigquery_temp_table_schema is not None:
-            bigquery_temp_table = (
-                f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
-            )
-            ge_config["bigquery_temp_table"] = bigquery_temp_table
+        # We have to create temporary tables if offset or limit or custom sql is set on Bigquery
+        if custom_sql or self.config.limit or self.config.offset:
+            if self.config.bigquery_temp_table_schema:
+                bigquery_temp_table = (
+                    f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
+                )
+                ge_config["bigquery_temp_table"] = bigquery_temp_table
+            else:
+                assert table
+                table_parts = table.split(".")
+                if len(table_parts) == 2:
+                    bigquery_temp_table = (
+                        f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
+                    )
+                    ge_config["bigquery_temp_table"] = bigquery_temp_table
 
         if custom_sql is not None:
             ge_config["query"] = custom_sql
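The new branching above chooses where the Great Expectations temporary table is created. To make that decision easier to follow in isolation, here is a self-contained sketch using the same variable names as the diff; the helper function itself is illustrative and not part of the DataHub codebase.

```python
import uuid
from typing import Optional


def pick_bigquery_temp_table(
    schema: str,
    table: Optional[str],
    custom_sql: Optional[str],
    limit: Optional[int],
    offset: Optional[int],
    temp_table_schema: Optional[str],
) -> Optional[str]:
    """Illustrative mirror of the diff: a temp table is only needed when
    custom SQL, limit, or offset forces GE to materialize a query."""
    if not (custom_sql or limit or offset):
        return None
    if temp_table_schema:
        # Explicit profiling.bigquery_temp_table_schema wins: <project.dataset>.ge-temp-<uuid>
        return f"{temp_table_schema}.ge-temp-{uuid.uuid4()}"
    # Otherwise fall back to the profiled table's own dataset: <project>.<dataset>.ge-temp-<uuid>
    assert table
    table_parts = table.split(".")
    if len(table_parts) == 2:
        return f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
    return None
```

In the diff, `schema` appears to carry the BigQuery project and `table` a `dataset.table` pair, which is why the fallback reuses the first component of `table` as the dataset.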
@@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = True
 
+    partition_profiling_enabled: bool = True
     bigquery_temp_table_schema: Optional[str] = None
     partition_datetime: Optional[datetime.datetime]
 
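The new field defaults to `True`, so existing configs keep profiling the latest partition/shard unless they explicitly opt out. A minimal standalone sketch, assuming a pydantic-style model like DataHub's `ConfigModel` (the sketch class is not the real one):

```python
import datetime
from typing import Optional

from pydantic import BaseModel


class ProfilingConfigSketch(BaseModel):
    # Mirrors the fields shown in the hunk above; illustrative, not the real GEProfilingConfig.
    catch_exceptions: bool = True
    partition_profiling_enabled: bool = True  # new flag added in this commit
    bigquery_temp_table_schema: Optional[str] = None
    partition_datetime: Optional[datetime.datetime] = None


# Disabling partition profiling is just overriding the default:
cfg = ProfilingConfigSketch(partition_profiling_enabled=False)
assert cfg.partition_profiling_enabled is False
```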
@@ -1168,6 +1168,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
                    schema, table, self.config.profiling.partition_datetime
                )
 
+                if (
+                    partition is not None
+                    and not self.config.profiling.partition_profiling_enabled
+                ):
+                    logger.debug(
+                        f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
+                    )
+                    continue
+
                 self.report.report_entity_profiled(dataset_name)
                 yield GEProfilerRequest(
                     pretty_name=dataset_name,
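The guard added above only skips a table when a latest partition/shard was actually found and the new flag is off; tables without partitions are still profiled. A tiny illustrative predicate (not the real source method) capturing that condition:

```python
from typing import Optional


def should_skip_partition_profiling(
    partition: Optional[str], partition_profiling_enabled: bool
) -> bool:
    """Mirror of the condition in the diff: skip only when a partition/shard
    exists for the table and profiling.partition_profiling_enabled is False."""
    return partition is not None and not partition_profiling_enabled


# Unpartitioned tables (partition is None) are profiled regardless of the flag:
assert should_skip_partition_profiling(None, False) is False
assert should_skip_partition_profiling("20220201", False) is True
assert should_skip_partition_profiling("20220201", True) is False
```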