From 867fc01fc99cc7d4220f26e1e8c5c7f21fb16edc Mon Sep 17 00:00:00 2001 From: Michael Maltese Date: Wed, 6 Aug 2025 03:30:06 -0400 Subject: [PATCH] feat(ingest/profiler): use approx_count_distinct on Databricks (#14337) --- .../src/datahub/ingestion/source/ge_data_profiler.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 3e22522461..e58fe9fef4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -216,6 +216,14 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in ) ).scalar() ) + elif self.engine.dialect.name.lower() == DATABRICKS: + return convert_to_json_serializable( + self.engine.execute( + sa.select(sa.func.approx_count_distinct(sa.column(column))).select_from( + self._table + ) + ).scalar() + ) return convert_to_json_serializable( self.engine.execute( sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(