mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-10-31 10:49:00 +00:00 
			
		
		
		
	feat(ingest/snowflake): improve accuracy of computed sample-based pro… (#9600)
This commit is contained in:
		
							parent
							
								
									3f9b90158f
								
							
						
					
					
						commit
						af866eaf95
					
				| @ -680,14 +680,12 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): | ||||
|         assert profile.rowCount is not None | ||||
|         row_count: int  # used for null counts calculation | ||||
|         if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition: | ||||
|             # We can alternatively use `self._get_dataset_rows(profile)` to get | ||||
|             # exact count of rows in sample, as actual rows involved in sample | ||||
|             # may be slightly different (more or less) than configured `sample_size`. | ||||
|             # However not doing so to start with, as that adds another query overhead | ||||
|             # plus approximate metrics should work for sampling based profiling. | ||||
|             row_count = self.config.sample_size | ||||
|         else: | ||||
|             row_count = profile.rowCount | ||||
|             # Querying exact row count of sample using `_get_dataset_rows`. | ||||
|             # We are not using `self.config.sample_size` directly as actual row count | ||||
|             # in sample may be slightly different (more or less) than configured `sample_size`. | ||||
|             self._get_dataset_rows(profile) | ||||
| 
 | ||||
|         row_count = profile.rowCount | ||||
| 
 | ||||
|         for column_spec in columns_profiling_queue: | ||||
|             column = column_spec.column | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Mayuri Nehate
						Mayuri Nehate