mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-11-04 04:39:10 +00:00 
			
		
		
		
	fix(profiling): don't stop if some steps fail (#5095)
This commit is contained in:
		
							parent
							
								
									64c4f518a4
								
							
						
					
					
						commit
						c43ad54a64
					
				@ -305,8 +305,17 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 | 
				
			|||||||
    def _get_column_cardinality(
 | 
					    def _get_column_cardinality(
 | 
				
			||||||
        self, column_spec: _SingleColumnSpec, column: str
 | 
					        self, column_spec: _SingleColumnSpec, column: str
 | 
				
			||||||
    ) -> None:
 | 
					    ) -> None:
 | 
				
			||||||
        nonnull_count = self.dataset.get_column_nonnull_count(column)
 | 
					        try:
 | 
				
			||||||
        column_spec.nonnull_count = nonnull_count
 | 
					            nonnull_count = self.dataset.get_column_nonnull_count(column)
 | 
				
			||||||
 | 
					            column_spec.nonnull_count = nonnull_count
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            logger.debug(
 | 
				
			||||||
 | 
					                f"Caught exception while attempting to get column cardinality for column {column}. {e}"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            self.report.report_warning(
 | 
				
			||||||
 | 
					                "Profiling - Unable to get column cardinality",
 | 
				
			||||||
 | 
					                f"{self.dataset_name}.{column}",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        unique_count = None
 | 
					        unique_count = None
 | 
				
			||||||
        pct_unique = None
 | 
					        pct_unique = None
 | 
				
			||||||
@ -352,21 +361,43 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 | 
				
			|||||||
    def _get_dataset_column_median(
 | 
					    def _get_dataset_column_median(
 | 
				
			||||||
        self, column_profile: DatasetFieldProfileClass, column: str
 | 
					        self, column_profile: DatasetFieldProfileClass, column: str
 | 
				
			||||||
    ) -> None:
 | 
					    ) -> None:
 | 
				
			||||||
        if self.config.include_field_median_value:
 | 
					        if not self.config.include_field_median_value:
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
            column_profile.median = str(self.dataset.get_column_median(column))
 | 
					            column_profile.median = str(self.dataset.get_column_median(column))
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            logger.debug(
 | 
				
			||||||
 | 
					                f"Caught exception while attempting to get column median for column {column}. {e}"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            self.report.report_warning(
 | 
				
			||||||
 | 
					                "Profiling - Unable to get column medians",
 | 
				
			||||||
 | 
					                f"{self.dataset_name}.{column}",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @_run_with_query_combiner
 | 
					    @_run_with_query_combiner
 | 
				
			||||||
    def _get_dataset_column_stdev(
 | 
					    def _get_dataset_column_stdev(
 | 
				
			||||||
        self, column_profile: DatasetFieldProfileClass, column: str
 | 
					        self, column_profile: DatasetFieldProfileClass, column: str
 | 
				
			||||||
    ) -> None:
 | 
					    ) -> None:
 | 
				
			||||||
        if self.config.include_field_stddev_value:
 | 
					        if not self.config.include_field_stddev_value:
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
            column_profile.stdev = str(self.dataset.get_column_stdev(column))
 | 
					            column_profile.stdev = str(self.dataset.get_column_stdev(column))
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            logger.debug(
 | 
				
			||||||
 | 
					                f"Caught exception while attempting to get column stddev for column {column}. {e}"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            self.report.report_warning(
 | 
				
			||||||
 | 
					                "Profiling - Unable to get column stddev",
 | 
				
			||||||
 | 
					                f"{self.dataset_name}.{column}",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @_run_with_query_combiner
 | 
					    @_run_with_query_combiner
 | 
				
			||||||
    def _get_dataset_column_quantiles(
 | 
					    def _get_dataset_column_quantiles(
 | 
				
			||||||
        self, column_profile: DatasetFieldProfileClass, column: str
 | 
					        self, column_profile: DatasetFieldProfileClass, column: str
 | 
				
			||||||
    ) -> None:
 | 
					    ) -> None:
 | 
				
			||||||
        if self.config.include_field_quantiles:
 | 
					        if not self.config.include_field_quantiles:
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
            # FIXME: Eventually we'd like to switch to using the quantile method directly.
 | 
					            # FIXME: Eventually we'd like to switch to using the quantile method directly.
 | 
				
			||||||
            # However, that method seems to be throwing an error in some cases whereas
 | 
					            # However, that method seems to be throwing an error in some cases whereas
 | 
				
			||||||
            # this does not.
 | 
					            # this does not.
 | 
				
			||||||
@ -391,6 +422,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 | 
				
			|||||||
                        res["observed_value"]["values"],
 | 
					                        res["observed_value"]["values"],
 | 
				
			||||||
                    )
 | 
					                    )
 | 
				
			||||||
                ]
 | 
					                ]
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            logger.debug(
 | 
				
			||||||
 | 
					                f"Caught exception while attempting to get column quantiles for column {column}. {e}"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            self.report.report_warning(
 | 
				
			||||||
 | 
					                "Profiling - Unable to get column quantiles",
 | 
				
			||||||
 | 
					                f"{self.dataset_name}.{column}",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @_run_with_query_combiner
 | 
					    @_run_with_query_combiner
 | 
				
			||||||
    def _get_dataset_column_distinct_value_frequencies(
 | 
					    def _get_dataset_column_distinct_value_frequencies(
 | 
				
			||||||
@ -406,7 +445,9 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 | 
				
			|||||||
    def _get_dataset_column_histogram(
 | 
					    def _get_dataset_column_histogram(
 | 
				
			||||||
        self, column_profile: DatasetFieldProfileClass, column: str
 | 
					        self, column_profile: DatasetFieldProfileClass, column: str
 | 
				
			||||||
    ) -> None:
 | 
					    ) -> None:
 | 
				
			||||||
        if self.config.include_field_histogram:
 | 
					        if not self.config.include_field_histogram:
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
            self.dataset.set_config_value("interactive_evaluation", True)
 | 
					            self.dataset.set_config_value("interactive_evaluation", True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            res = self.dataset.expect_column_kl_divergence_to_be_less_than(
 | 
					            res = self.dataset.expect_column_kl_divergence_to_be_less_than(
 | 
				
			||||||
@ -425,6 +466,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 | 
				
			|||||||
                        partition["tail_weights"][1],
 | 
					                        partition["tail_weights"][1],
 | 
				
			||||||
                    ],
 | 
					                    ],
 | 
				
			||||||
                )
 | 
					                )
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            logger.debug(
 | 
				
			||||||
 | 
					                f"Caught exception while attempting to get column histogram for column {column}. {e}"
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            self.report.report_warning(
 | 
				
			||||||
 | 
					                "Profiling - Unable to get column histogram",
 | 
				
			||||||
 | 
					                f"{self.dataset_name}.{column}",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @_run_with_query_combiner
 | 
					    @_run_with_query_combiner
 | 
				
			||||||
    def _get_dataset_column_sample_values(
 | 
					    def _get_dataset_column_sample_values(
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user