diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 4ea721f6fd..6faa29f264 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -360,18 +360,22 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): @_run_with_query_combiner def _get_dataset_rows(self, dataset_profile: DatasetProfileClass) -> None: if self.config.profile_table_row_count_estimate_only: - schema_name = self.dataset_name.split(".")[1] - table_name = self.dataset_name.split(".")[2] - logger.debug( - f"Getting estimated rowcounts for table:{self.dataset_name}, schema:{schema_name}, table:{table_name}" - ) - dialect_name = self.dataset.engine.dialect.name.lower() if dialect_name == "postgresql": + schema_name = self.dataset_name.split(".")[1] + table_name = self.dataset_name.split(".")[2] + logger.debug( + f"Getting estimated rowcounts for table:{self.dataset_name}, schema:{schema_name}, table:{table_name}" + ) get_estimate_script = sa.text( f"SELECT c.reltuples AS estimate FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relname = '{table_name}' AND n.nspname = '{schema_name}'" ) elif dialect_name == "mysql": + schema_name = self.dataset_name.split(".")[0] + table_name = self.dataset_name.split(".")[1] + logger.debug( + f"Getting estimated rowcounts for table:{self.dataset_name}, schema:{schema_name}, table:{table_name}" + ) get_estimate_script = sa.text( f"SELECT table_rows AS estimate FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_name = '{table_name}'" ) diff --git a/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json b/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json index e668525b93..7597013bd8 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json +++ b/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json @@ -400,5 +400,159 @@ "lastObserved": 1586847600000, "runId": "mysql-2020_04_14-07_00_00" } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 0, + "columnCount": 6, + "fieldProfiles": [ + { + "fieldPath": "id", + "uniqueCount": 5, + "uniqueProportion": 1, + "nullCount": 0, + "min": "1", + "max": "5", + "mean": "3.0", + "median": "3", + "stdev": "1.5811388300841898", + "sampleValues": [ + "1", + "2", + "3", + "4", + "5" + ] + }, + { + "fieldPath": "company", + "uniqueCount": 5, + "uniqueProportion": 1, + "nullCount": 0, + "sampleValues": [ + "Company A", + "Company B", + "Company C", + "Company D", + "Company E" + ] + }, + { + "fieldPath": "last_name", + "uniqueCount": 5, + "uniqueProportion": 1, + "nullCount": 0, + "sampleValues": [ + "Axen", + "Bedecs", + "Donnell", + "Gratacos Solsona", + "Lee" + ] + }, + { + "fieldPath": "first_name", + "uniqueCount": 5, + "uniqueProportion": 1, + "nullCount": 0, + "sampleValues": [ + "Anna", + "Antonio", + "Christina", + "Martin", + "Thomas" + ] + }, + { + "fieldPath": "email_address", + "uniqueCount": 0, + "nullCount": 0, + "sampleValues": [] + }, + { + "fieldPath": "priority", + "uniqueCount": 3, + "uniqueProportion": 0.75, + "nullCount": 0, + "min": "3.8", + "max": "4.9", + "mean": "4.175000011920929", + "median": "4.0", + "stdev": "0.49244294899530355", + "sampleValues": [ + "4.0", + "4.9", + "4.0", + "3.8" + ] + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 0, + "columnCount": 3, + "fieldProfiles": [ + { + "fieldPath": "id", + "uniqueCount": 0, + "nullCount": 0, + "min": "None", + "max": "None", + "mean": "None", + "median": "None", + "stdev": "0.0", + "sampleValues": [] + }, + { + "fieldPath": "description", + "uniqueCount": 0, + "nullCount": 0, + "sampleValues": [] + }, + { + "fieldPath": "customer_id", + "uniqueCount": 0, + "nullCount": 0, + "min": "None", + "max": "None", + "mean": "None", + "median": "None", + "stdev": "0.0", + "sampleValues": [] + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } } ] \ No newline at end of file