feat(ingest): bigquery - ability to disable partition profiling (#4228)

This commit is contained in:
Tamas Nemeth 2022-03-02 07:29:48 +01:00 committed by GitHub
parent d52638a252
commit 2a5cf3dd07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 8 deletions

View File

@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
Note: Since the bigquery source also supports dataset-level lineage, the auth client will require additional permissions to be able to access the Google audit logs. Refer to the permissions section in the bigquery-usage section below, which also accesses the audit logs.
## Profiling
For profiling you have to set a table schema where Great Expectations (the profiling framework we use) can create temporary
views, by setting the `profiling.bigquery_temp_table_schema` property.
Profiling supports normal, partitioned, and sharded tables, but for performance reasons we only profile the latest partition for partitioned tables and the latest shard for sharded tables.
If the limit/offset parameter is set, or when profiling partitioned or sharded tables, Great Expectations (the profiling framework we use) needs to create temporary
views. By default these views are created in the same schema as the profiled table, but you can have all of these
tables created in a predefined schema instead by setting the `profiling.bigquery_temp_table_schema` property.
Temporary tables are removed after profiling.
```yaml
profiling:
@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
:::note
Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
You can set the partition explicitly with the `partition.partition_datetime` property if you want (the partition will be applied to all partitioned tables).
:::
# BigQuery Usage Stats

View File

@ -793,11 +793,21 @@ class DatahubGEProfiler:
**kwargs,
}
if self.config.bigquery_temp_table_schema is not None:
bigquery_temp_table = (
f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
)
ge_config["bigquery_temp_table"] = bigquery_temp_table
# We have to create temporary tables if offset or limit or custom sql is set on Bigquery
if custom_sql or self.config.limit or self.config.offset:
if self.config.bigquery_temp_table_schema:
bigquery_temp_table = (
f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
)
ge_config["bigquery_temp_table"] = bigquery_temp_table
else:
assert table
table_parts = table.split(".")
if len(table_parts) == 2:
bigquery_temp_table = (
f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
)
ge_config["bigquery_temp_table"] = bigquery_temp_table
if custom_sql is not None:
ge_config["query"] = custom_sql

View File

@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
# Hidden option - used for debugging purposes.
catch_exceptions: bool = True
partition_profiling_enabled: bool = True
bigquery_temp_table_schema: Optional[str] = None
partition_datetime: Optional[datetime.datetime]

View File

@ -1168,6 +1168,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
schema, table, self.config.profiling.partition_datetime
)
if (
partition is not None
and not self.config.profiling.partition_profiling_enabled
):
logger.debug(
f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
)
continue
self.report.report_entity_profiled(dataset_name)
yield GEProfilerRequest(
pretty_name=dataset_name,