From d734b2849e514b1c539d195123602b11cefdfc20 Mon Sep 17 00:00:00 2001
From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com>
Date: Wed, 19 Jul 2023 22:36:26 -0400
Subject: [PATCH] feat(ingest/mysql): Add estimate row count for mysql (#8420)

---
Review notes (ignored by `git am`):
 * _get_dataset_rows now derives schema/table per dialect. MySQL dataset names
   are two-part (e.g. "northwind.customers"), so the Postgres indices [1]/[2]
   raised IndexError on the MySQL path.
 * A missing/NULL estimate falls back to the exact row count instead of
   failing on int(None).
 * mysql_table_row_count_estimate_only.json below contains no datasetProfile
   aspect; it likely needs regenerating once profiling succeeds.
 * The `index` blob hash of ge_data_profiler.py was not recomputed.

 .../ingestion/source/ge_data_profiler.py      |  52 +-
 .../ingestion/source/ge_profiling_config.py   |   2 +-
 ..._profile_table_row_count_estimate_only.yml |  14 +
 .../mysql_table_row_count_estimate_only.json  | 404 ++++++++++++++++++
 .../tests/integration/mysql/test_mysql.py     |   4 +
 5 files changed, 462 insertions(+), 14 deletions(-)
 create mode 100644 metadata-ingestion/tests/integration/mysql/mysql_profile_table_row_count_estimate_only.yml
 create mode 100644 metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index ab259b9b16..4ea721f6fd 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -359,21 +359,47 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
 
     @_run_with_query_combiner
     def _get_dataset_rows(self, dataset_profile: DatasetProfileClass) -> None:
-        if (
-            self.config.profile_table_row_count_estimate_only
-            and self.dataset.engine.dialect.name.lower() == "postgresql"
-        ):
-            schema_name = self.dataset_name.split(".")[1]
-            table_name = self.dataset_name.split(".")[2]
+        """Set ``dataset_profile.rowCount`` for the profiled table.
+
+        With ``profile_table_row_count_estimate_only`` enabled, Postgres and MySQL
+        read an approximate count from catalog statistics; any other dialect, or a
+        missing estimate, falls back to the exact ``get_row_count()`` query.
+        """
+        if self.config.profile_table_row_count_estimate_only:
+            dialect_name = self.dataset.engine.dialect.name.lower()
+            name_parts = self.dataset_name.split(".")
+            if dialect_name == "postgresql":
+                # Postgres dataset names are indexed as <database>.<schema>.<table>.
+                schema_name = name_parts[1]
+                table_name = name_parts[2]
+                get_estimate_script = sa.text(
+                    f"SELECT c.reltuples AS estimate FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relname = '{table_name}' AND n.nspname = '{schema_name}'"
+                )
+            elif dialect_name == "mysql":
+                # MySQL dataset names are two-part (<database>.<table>, e.g.
+                # "northwind.customers"), so indexing [2] would raise IndexError.
+                schema_name = name_parts[0]
+                table_name = name_parts[1]
+                get_estimate_script = sa.text(
+                    f"SELECT table_rows AS estimate FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_name = '{table_name}'"
+                )
+            else:
+                logger.debug(
+                    f"Dialect {dialect_name} not supported for feature "
+                    f"profile_table_row_count_estimate_only. Proceeding with full row count."
+                )
+                dataset_profile.rowCount = self.dataset.get_row_count()
+                return
+
             logger.debug(
                 f"Getting estimated rowcounts for table:{self.dataset_name}, schema:{schema_name}, table:{table_name}"
             )
-            dataset_profile.rowCount = int(
-                self.dataset.engine.execute(
-                    sa.text(
-                        f"SELECT c.reltuples AS estimate FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relname = '{table_name}' AND n.nspname = '{schema_name}'"
-                    )
-                ).scalar()
-            )
+            estimate = self.dataset.engine.execute(get_estimate_script).scalar()
+            if estimate is None:
+                # No estimate came back (no matching catalog row, or a NULL value):
+                # run the exact count rather than failing on int(None).
+                dataset_profile.rowCount = self.dataset.get_row_count()
+            else:
+                dataset_profile.rowCount = int(estimate)
         else:
             dataset_profile.rowCount = self.dataset.get_row_count()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
index 3f61a99bfd..8c5f1646c1 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -118,7 +118,7 @@ class GEProfilingConfig(ConfigModel):
     profile_table_row_count_estimate_only: bool = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
-        "less accurate. Only supported for Postgres. ",
+        "less accurate. Only supported for Postgres and MySQL. 
", ) # The default of (5 * cpu_count) is adopted from the default max_workers diff --git a/metadata-ingestion/tests/integration/mysql/mysql_profile_table_row_count_estimate_only.yml b/metadata-ingestion/tests/integration/mysql/mysql_profile_table_row_count_estimate_only.yml new file mode 100644 index 0000000000..d6a9ed85a8 --- /dev/null +++ b/metadata-ingestion/tests/integration/mysql/mysql_profile_table_row_count_estimate_only.yml @@ -0,0 +1,14 @@ +source: + type: mysql + config: + username: root + password: example + host_port: localhost:53307 + database: northwind + profiling: + enabled: True + profile_table_row_count_estimate_only: true +sink: + type: file + config: + filename: "./mysql_mces.json" diff --git a/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json b/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json new file mode 100644 index 0000000000..e668525b93 --- /dev/null +++ b/metadata-ingestion/tests/integration/mysql/mysql_table_row_count_estimate_only.json @@ -0,0 +1,404 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mysql", + "env": "PROD", + "database": "northwind" + }, + "name": "northwind" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + 
"platform": "urn:li:dataPlatform:mysql" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "customers", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "northwind.customers", + "platform": "urn:li:dataPlatform:mysql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + 
"fieldPath": "id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "company", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "first_name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email_address", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "priority", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "FLOAT()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + 
"json": { + "path": [ + { + "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", + "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "orders", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "northwind.orders", + "platform": "urn:li:dataPlatform:mysql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "description", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + } + ], + "foreignKeys": [ + { + "name": "fk_order_customer", + "foreignFields": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD),id)" + ], + "sourceFields": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD),customer_id)" + ], + "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f", + "urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-2020_04_14-07_00_00" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/mysql/test_mysql.py b/metadata-ingestion/tests/integration/mysql/test_mysql.py index bf5806bc57..8c8626a2d2 100644 --- a/metadata-ingestion/tests/integration/mysql/test_mysql.py +++ b/metadata-ingestion/tests/integration/mysql/test_mysql.py @@ -48,6 +48,10 @@ def mysql_runner(docker_compose_runner, pytestconfig, test_resources_dir): ("mysql_to_file_with_db.yml", "mysql_mces_with_db_golden.json"), ("mysql_to_file_no_db.yml", 
"mysql_mces_no_db_golden.json"), ("mysql_profile_table_level_only.yml", "mysql_table_level_only.json"), + ( + "mysql_profile_table_row_count_estimate_only.yml", + "mysql_table_row_count_estimate_only.json", + ), ], ) @freeze_time(FROZEN_TIME)