Mirror of https://github.com/datahub-project/datahub.git (synced 2025-11-02 19:58:59 +00:00)
feat(ingest): bigquery - ability to disable partition profiling (#4228)
parent d52638a252
commit 2a5cf3dd07
@@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
 Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer the permissions section in bigquery-usage section below which also accesses the audit logs.
 
 ## Profiling
-For profiling you have to set a table schema where Great Expectation (the profiling framework we use) can create temporary
-views by setting `profiling.bigquery_temp_table_schema` property.
+Profiling can profile normal/partitioned and sharded tables as well but due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
+
+If limit/offset parameter is set or partitioning partitioned or sharded table Great Expectation (the profiling framework we use) needs to create temporary
+views. By default these views are created in the schema where the profiled table is but you can control to create all these
+tables into a predefined schema by setting `profiling.bigquery_temp_table_schema` property.
+Temporary tables are removed after profiling.
 
 ```yaml
 profiling:
@@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
 :::note
 
 Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
-
+You can set partition explicitly with `partition.partition_datetime` property if you want. (partition will be applied to all partitioned tables)
 :::
 
 # BigQuery Usage Stats
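The doc change above describes the profiling knobs only in prose. As a reading aid, here is a minimal sketch of a recipe's `profiling` section expressed as a Python dict (the same keys would sit under `profiling:` in a YAML recipe); the dataset name and the `enabled` key are assumptions for illustration, while `bigquery_temp_table_schema` and the new `partition_profiling_enabled` flag come from this commit.

```python
# Sketch only: how the documented options could be combined in a recipe's profiling section.
profiling_section = {
    "enabled": True,  # assumed top-level profiling switch
    # Create Great Expectations temp views in a dedicated dataset instead of
    # next to the profiled table (value is a hypothetical project.dataset):
    "bigquery_temp_table_schema": "my-project.ge_temp",
    # New in this commit: skip profiling of the latest partition/shard entirely.
    "partition_profiling_enabled": False,
}
```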
@@ -793,11 +793,21 @@ class DatahubGEProfiler:
             **kwargs,
         }
 
-        if self.config.bigquery_temp_table_schema is not None:
-            bigquery_temp_table = (
-                f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
-            )
-            ge_config["bigquery_temp_table"] = bigquery_temp_table
+        # We have to create temporary tables if offset or limit or custom sql is set on Bigquery
+        if custom_sql or self.config.limit or self.config.offset:
+            if self.config.bigquery_temp_table_schema:
+                bigquery_temp_table = (
+                    f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
+                )
+                ge_config["bigquery_temp_table"] = bigquery_temp_table
+            else:
+                assert table
+                table_parts = table.split(".")
+                if len(table_parts) == 2:
+                    bigquery_temp_table = (
+                        f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
+                    )
+                    ge_config["bigquery_temp_table"] = bigquery_temp_table
 
         if custom_sql is not None:
             ge_config["query"] = custom_sql
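The new branching above chooses where the Great Expectations temporary table is created. To make that decision easier to follow in isolation, here is a self-contained sketch using the same variable names as the diff; the helper function itself is illustrative and not part of the DataHub codebase.

```python
import uuid
from typing import Optional


def pick_bigquery_temp_table(
    schema: str,
    table: Optional[str],
    custom_sql: Optional[str],
    limit: Optional[int],
    offset: Optional[int],
    temp_table_schema: Optional[str],
) -> Optional[str]:
    """Illustrative mirror of the diff: a temp table is only needed when
    custom SQL, limit, or offset forces GE to materialize a query."""
    if not (custom_sql or limit or offset):
        return None
    if temp_table_schema:
        # Explicit profiling.bigquery_temp_table_schema wins: <project.dataset>.ge-temp-<uuid>
        return f"{temp_table_schema}.ge-temp-{uuid.uuid4()}"
    # Otherwise fall back to the profiled table's own dataset: <project>.<dataset>.ge-temp-<uuid>
    assert table
    table_parts = table.split(".")
    if len(table_parts) == 2:
        return f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
    return None
```

In the diff, `schema` appears to carry the BigQuery project and `table` a `dataset.table` pair, which is why the fallback reuses the first component of `table` as the dataset.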
@@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = True
 
+    partition_profiling_enabled: bool = True
     bigquery_temp_table_schema: Optional[str] = None
     partition_datetime: Optional[datetime.datetime]
 
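The new field defaults to `True`, so existing configs keep profiling the latest partition/shard unless they explicitly opt out. A minimal standalone sketch, assuming a pydantic-style model like DataHub's `ConfigModel` (the sketch class is not the real one):

```python
import datetime
from typing import Optional

from pydantic import BaseModel


class ProfilingConfigSketch(BaseModel):
    # Mirrors the fields shown in the hunk above; illustrative, not the real GEProfilingConfig.
    catch_exceptions: bool = True
    partition_profiling_enabled: bool = True  # new flag added in this commit
    bigquery_temp_table_schema: Optional[str] = None
    partition_datetime: Optional[datetime.datetime] = None


# Disabling partition profiling is just overriding the default:
cfg = ProfilingConfigSketch(partition_profiling_enabled=False)
assert cfg.partition_profiling_enabled is False
```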
@@ -1168,6 +1168,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
                    schema, table, self.config.profiling.partition_datetime
                )
 
+                if (
+                    partition is not None
+                    and not self.config.profiling.partition_profiling_enabled
+                ):
+                    logger.debug(
+                        f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
+                    )
+                    continue
+
                 self.report.report_entity_profiled(dataset_name)
                 yield GEProfilerRequest(
                     pretty_name=dataset_name,
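The guard added above only skips a table when a latest partition/shard was actually found and the new flag is off; tables without partitions are still profiled. A tiny illustrative predicate (not the real source method) capturing that condition:

```python
from typing import Optional


def should_skip_partition_profiling(
    partition: Optional[str], partition_profiling_enabled: bool
) -> bool:
    """Mirror of the condition in the diff: skip only when a partition/shard
    exists for the table and profiling.partition_profiling_enabled is False."""
    return partition is not None and not partition_profiling_enabled


# Unpartitioned tables (partition is None) are profiled regardless of the flag:
assert should_skip_partition_profiling(None, False) is False
assert should_skip_partition_profiling("20220201", False) is True
assert should_skip_partition_profiling("20220201", True) is False
```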