feat(ingest/mysql): Add estimate row count for mysql (#8420)

This commit is contained in:
Ellie O'Neil 2023-07-19 22:36:26 -04:00 committed by GitHub
parent 93fde6bf93
commit d734b2849e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 443 additions and 10 deletions

View File

@ -359,21 +359,32 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
@_run_with_query_combiner
def _get_dataset_rows(self, dataset_profile: DatasetProfileClass) -> None:
    """Populate ``dataset_profile.rowCount`` for the dataset being profiled.

    When ``self.config.profile_table_row_count_estimate_only`` is set and the
    engine dialect is PostgreSQL or MySQL, the count is read from the
    database's catalog statistics instead of counting rows. In every other
    case (flag off, unsupported dialect, dataset name that cannot be split
    into schema and table, or no usable estimate returned) the method falls
    back to ``self.dataset.get_row_count()``.

    Args:
        dataset_profile: Profile object whose ``rowCount`` is set in place.
    """
    if not self.config.profile_table_row_count_estimate_only:
        dataset_profile.rowCount = self.dataset.get_row_count()
        return

    dialect_name = self.dataset.engine.dialect.name.lower()
    name_parts = self.dataset_name.split(".")

    if dialect_name == "postgresql":
        # Same positions as before: second and third dot-separated parts.
        # NOTE(review): presumably the name is "<database>.<schema>.<table>"
        # for Postgres; confirm against the caller that builds dataset_name.
        schema_and_table = name_parts[1:3]
        estimate_sql = (
            "SELECT c.reltuples AS estimate FROM pg_class c "
            "JOIN pg_namespace n ON n.oid = c.relnamespace "
            "WHERE c.relname = :table_name AND n.nspname = :schema_name"
        )
    elif dialect_name == "mysql":
        # Use the last two parts so that a two-part name such as
        # "northwind.customers" works; indexing [1] and [2] raises IndexError
        # on it. For a three-part name this picks the same parts as before.
        # NOTE(review): assumes MySQL dataset names end in "<schema>.<table>";
        # confirm against the caller.
        schema_and_table = name_parts[-2:]
        estimate_sql = (
            "SELECT table_rows AS estimate FROM information_schema.tables "
            "WHERE table_schema = :schema_name AND table_name = :table_name"
        )
    else:
        logger.debug(
            f"Dialect {dialect_name} not supported for feature "
            f"profile_table_row_count_estimate_only. Proceeding with full row count."
        )
        dataset_profile.rowCount = self.dataset.get_row_count()
        return

    if len(schema_and_table) != 2:
        logger.debug(
            f"Cannot derive schema and table from dataset name {self.dataset_name}. "
            f"Proceeding with full row count."
        )
        dataset_profile.rowCount = self.dataset.get_row_count()
        return

    schema_name, table_name = schema_and_table
    logger.debug(
        f"Getting estimated rowcounts for table:{self.dataset_name}, schema:{schema_name}, table:{table_name}"
    )

    # Bind the identifiers as parameters instead of formatting them into the
    # SQL text, so names containing quotes cannot break or alter the query.
    get_estimate_script = sa.text(estimate_sql).bindparams(
        schema_name=schema_name, table_name=table_name
    )
    estimate = self.dataset.engine.execute(get_estimate_script).scalar()

    # scalar() yields None when no row matches or the statistic is NULL, and
    # int(None) would raise. A negative value is not a usable count either
    # (PostgreSQL documents -1 as "unknown" on recent versions, to be
    # confirmed for the deployed version), so fall back in both cases.
    if estimate is None or estimate < 0:
        logger.debug(
            f"No usable row count estimate for {self.dataset_name}. "
            f"Proceeding with full row count."
        )
        dataset_profile.rowCount = self.dataset.get_row_count()
        return

    dataset_profile.rowCount = int(estimate)

View File

@ -118,7 +118,7 @@ class GEProfilingConfig(ConfigModel):
# Opt-in switch: take the table row count from an approximate query rather
# than an exact count. Defaults to off.
profile_table_row_count_estimate_only: bool = Field(
    default=False,
    description="Use an approximate query for row count. This will be much faster but slightly "
    "less accurate. Only supported for Postgres and MySQL. ",
)
# The default of (5 * cpu_count) is adopted from the default max_workers

View File

@ -0,0 +1,14 @@
# Ingestion recipe: profile a MySQL database using estimated table row counts
# and write the resulting metadata events to a local JSON file.
source:
  type: mysql
  config:
    username: root
    password: example
    host_port: localhost:53307
    database: northwind
    profiling:
      enabled: True
      # Use the approximate row-count query instead of an exact count.
      profile_table_row_count_estimate_only: true
sink:
  type: file
  config:
    filename: "./mysql_mces.json"

View File

@ -0,0 +1,404 @@
[
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "mysql",
"env": "PROD",
"database": "northwind"
},
"name": "northwind"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:mysql"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Database"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {},
"name": "customers",
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "northwind.customers",
"platform": "urn:li:dataPlatform:mysql",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "id",
"nullable": false,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"isPartOfKey": true
},
{
"fieldPath": "company",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "last_name",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "first_name",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "email_address",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "priority",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "FLOAT()",
"recursive": false,
"isPartOfKey": false
}
]
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Table"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {},
"name": "orders",
"tags": []
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "northwind.orders",
"platform": "urn:li:dataPlatform:mysql",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
"tableSchema": ""
}
},
"fields": [
{
"fieldPath": "id",
"nullable": false,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"isPartOfKey": true
},
{
"fieldPath": "description",
"nullable": true,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "VARCHAR(length=50)",
"recursive": false,
"isPartOfKey": false
},
{
"fieldPath": "customer_id",
"nullable": false,
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "INTEGER()",
"recursive": false,
"isPartOfKey": false
}
],
"foreignKeys": [
{
"name": "fk_order_customer",
"foreignFields": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD),id)"
],
"sourceFields": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD),customer_id)"
],
"foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)"
}
]
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Table"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "mysql-2020_04_14-07_00_00"
}
}
]

View File

@ -48,6 +48,10 @@ def mysql_runner(docker_compose_runner, pytestconfig, test_resources_dir):
("mysql_to_file_with_db.yml", "mysql_mces_with_db_golden.json"),
("mysql_to_file_no_db.yml", "mysql_mces_no_db_golden.json"),
("mysql_profile_table_level_only.yml", "mysql_table_level_only.json"),
(
"mysql_profile_table_row_count_estimate_only.yml",
"mysql_table_row_count_estimate_only.json",
),
],
)
@freeze_time(FROZEN_TIME)