mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-11-04 04:39:10 +00:00 
			
		
		
		
	feat(ingest): bigquery - ability to disable partition profiling (#4228)
This commit is contained in:
		
							parent
							
								
									d52638a252
								
							
						
					
					
						commit
						2a5cf3dd07
					
				@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
 | 
				
			|||||||
Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer to the permissions section in the bigquery-usage section below, which also accesses the audit logs.
 | 
					Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer to the permissions section in the bigquery-usage section below, which also accesses the audit logs.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Profiling
 | 
					## Profiling
 | 
				
			||||||
For profiling you have to set a table schema where Great Expectation (the profiling framework we use) can create temporary
 | 
					Profiling supports normal, partitioned, and sharded tables, but for performance reasons we only profile the latest partition of partitioned tables and the latest shard of sharded tables.
 | 
				
			||||||
views by setting `profiling.bigquery_temp_table_schema` property.
 | 
					
 | 
				
			||||||
 | 
					If the limit/offset parameter is set, or when profiling a partitioned or sharded table, Great Expectations (the profiling framework we use) needs to create temporary
 | 
				
			||||||
 | 
					views. By default these views are created in the schema where the profiled table is, but you can choose to create all these
 | 
				
			||||||
 | 
					tables in a predefined schema by setting the `profiling.bigquery_temp_table_schema` property.
 | 
				
			||||||
 | 
					Temporary tables are removed after profiling.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```yaml
 | 
					```yaml
 | 
				
			||||||
     profiling:
 | 
					     profiling:
 | 
				
			||||||
@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
 | 
				
			|||||||
:::note
 | 
					:::note
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
 | 
					Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
 | 
				
			||||||
 | 
					You can set the partition explicitly with the `profiling.partition_datetime` property if you want (the partition will be applied to all partitioned tables).
 | 
				
			||||||
:::
 | 
					:::
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# BigQuery Usage Stats
 | 
					# BigQuery Usage Stats
 | 
				
			||||||
 | 
				
			|||||||
@ -793,11 +793,21 @@ class DatahubGEProfiler:
 | 
				
			|||||||
            **kwargs,
 | 
					            **kwargs,
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if self.config.bigquery_temp_table_schema is not None:
 | 
					        # We have to create temporary tables if offset or limit or custom sql is set on Bigquery
 | 
				
			||||||
 | 
					        if custom_sql or self.config.limit or self.config.offset:
 | 
				
			||||||
 | 
					            if self.config.bigquery_temp_table_schema:
 | 
				
			||||||
                bigquery_temp_table = (
 | 
					                bigquery_temp_table = (
 | 
				
			||||||
                    f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
 | 
					                    f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
                ge_config["bigquery_temp_table"] = bigquery_temp_table
 | 
					                ge_config["bigquery_temp_table"] = bigquery_temp_table
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                assert table
 | 
				
			||||||
 | 
					                table_parts = table.split(".")
 | 
				
			||||||
 | 
					                if len(table_parts) == 2:
 | 
				
			||||||
 | 
					                    bigquery_temp_table = (
 | 
				
			||||||
 | 
					                        f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
 | 
				
			||||||
 | 
					                    )
 | 
				
			||||||
 | 
					                    ge_config["bigquery_temp_table"] = bigquery_temp_table
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if custom_sql is not None:
 | 
					        if custom_sql is not None:
 | 
				
			||||||
            ge_config["query"] = custom_sql
 | 
					            ge_config["query"] = custom_sql
 | 
				
			||||||
 | 
				
			|||||||
@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
 | 
				
			|||||||
    # Hidden option - used for debugging purposes.
 | 
					    # Hidden option - used for debugging purposes.
 | 
				
			||||||
    catch_exceptions: bool = True
 | 
					    catch_exceptions: bool = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    partition_profiling_enabled: bool = True
 | 
				
			||||||
    bigquery_temp_table_schema: Optional[str] = None
 | 
					    bigquery_temp_table_schema: Optional[str] = None
 | 
				
			||||||
    partition_datetime: Optional[datetime.datetime]
 | 
					    partition_datetime: Optional[datetime.datetime]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -1168,6 +1168,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
 | 
				
			|||||||
                schema, table, self.config.profiling.partition_datetime
 | 
					                schema, table, self.config.profiling.partition_datetime
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            if (
 | 
				
			||||||
 | 
					                partition is not None
 | 
				
			||||||
 | 
					                and not self.config.profiling.partition_profiling_enabled
 | 
				
			||||||
 | 
					            ):
 | 
				
			||||||
 | 
					                logger.debug(
 | 
				
			||||||
 | 
					                    f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.report.report_entity_profiled(dataset_name)
 | 
					            self.report.report_entity_profiled(dataset_name)
 | 
				
			||||||
            yield GEProfilerRequest(
 | 
					            yield GEProfilerRequest(
 | 
				
			||||||
                pretty_name=dataset_name,
 | 
					                pretty_name=dataset_name,
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user