mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-01 11:19:05 +00:00
feat(ingest): bigquery - ability to disable partition profiling (#4228)
This commit is contained in:
parent
d52638a252
commit
2a5cf3dd07
@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
|
||||
Note: Since the BigQuery source also supports dataset-level lineage, the auth client will require additional permissions to be able to access the Google audit logs. Refer to the permissions section in the bigquery-usage section below, which also accesses the audit logs.
|
||||
|
||||
## Profiling
|
||||
For profiling, you have to set a table schema where Great Expectations (the profiling framework we use) can create temporary
|
||||
views by setting the `profiling.bigquery_temp_table_schema` property.
|
||||
Profiling supports normal, partitioned, and sharded tables. However, for performance reasons, we only profile the latest partition of partitioned tables and the latest shard of sharded tables.
|
||||
|
||||
If the limit/offset parameter is set, or when profiling a partitioned or sharded table, Great Expectations (the profiling framework we use) needs to create temporary
|
||||
views. By default, these views are created in the same schema as the profiled table, but you can choose to create all these
|
||||
tables in a predefined schema by setting the `profiling.bigquery_temp_table_schema` property.
|
||||
Temporary tables are removed after profiling.
|
||||
|
||||
```yaml
|
||||
profiling:
|
||||
@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
|
||||
:::note
|
||||
|
||||
For performance reasons, we only profile the latest partition of partitioned tables and the latest shard of sharded tables.
|
||||
|
||||
You can set the partition explicitly with the `profiling.partition_datetime` property if you want (the partition will be applied to all partitioned tables).
|
||||
:::
|
||||
|
||||
# BigQuery Usage Stats
|
||||
|
||||
@ -793,11 +793,21 @@ class DatahubGEProfiler:
|
||||
**kwargs,
|
||||
}
|
||||
|
||||
if self.config.bigquery_temp_table_schema is not None:
|
||||
bigquery_temp_table = (
|
||||
f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
|
||||
)
|
||||
ge_config["bigquery_temp_table"] = bigquery_temp_table
|
||||
# We have to create temporary tables if offset or limit or custom sql is set on Bigquery
|
||||
if custom_sql or self.config.limit or self.config.offset:
|
||||
if self.config.bigquery_temp_table_schema:
|
||||
bigquery_temp_table = (
|
||||
f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
|
||||
)
|
||||
ge_config["bigquery_temp_table"] = bigquery_temp_table
|
||||
else:
|
||||
assert table
|
||||
table_parts = table.split(".")
|
||||
if len(table_parts) == 2:
|
||||
bigquery_temp_table = (
|
||||
f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
|
||||
)
|
||||
ge_config["bigquery_temp_table"] = bigquery_temp_table
|
||||
|
||||
if custom_sql is not None:
|
||||
ge_config["query"] = custom_sql
|
||||
|
||||
@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
|
||||
# Hidden option - used for debugging purposes.
|
||||
catch_exceptions: bool = True
|
||||
|
||||
partition_profiling_enabled: bool = True
|
||||
bigquery_temp_table_schema: Optional[str] = None
|
||||
partition_datetime: Optional[datetime.datetime]
|
||||
|
||||
|
||||
@ -1168,6 +1168,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
||||
schema, table, self.config.profiling.partition_datetime
|
||||
)
|
||||
|
||||
if (
|
||||
partition is not None
|
||||
and not self.config.profiling.partition_profiling_enabled
|
||||
):
|
||||
logger.debug(
|
||||
f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
|
||||
)
|
||||
continue
|
||||
|
||||
self.report.report_entity_profiled(dataset_name)
|
||||
yield GEProfilerRequest(
|
||||
pretty_name=dataset_name,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user