mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-05 15:13:21 +00:00
feat(ingest/mysql): Add estimate row count for mysql (#8420)
This commit is contained in:
parent
93fde6bf93
commit
d734b2849e
@ -359,21 +359,32 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
||||
|
||||
@_run_with_query_combiner
|
||||
def _get_dataset_rows(self, dataset_profile: DatasetProfileClass) -> None:
|
||||
if (
|
||||
self.config.profile_table_row_count_estimate_only
|
||||
and self.dataset.engine.dialect.name.lower() == "postgresql"
|
||||
):
|
||||
if self.config.profile_table_row_count_estimate_only:
|
||||
schema_name = self.dataset_name.split(".")[1]
|
||||
table_name = self.dataset_name.split(".")[2]
|
||||
logger.debug(
|
||||
f"Getting estimated rowcounts for table:{self.dataset_name}, schema:{schema_name}, table:{table_name}"
|
||||
)
|
||||
|
||||
dialect_name = self.dataset.engine.dialect.name.lower()
|
||||
if dialect_name == "postgresql":
|
||||
get_estimate_script = sa.text(
|
||||
f"SELECT c.reltuples AS estimate FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relname = '{table_name}' AND n.nspname = '{schema_name}'"
|
||||
)
|
||||
elif dialect_name == "mysql":
|
||||
get_estimate_script = sa.text(
|
||||
f"SELECT table_rows AS estimate FROM information_schema.tables WHERE table_schema = '{schema_name}' AND table_name = '{table_name}'"
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
f"Dialect {dialect_name} not supported for feature "
|
||||
f"profile_table_row_count_estimate_only. Proceeding with full row count."
|
||||
)
|
||||
dataset_profile.rowCount = self.dataset.get_row_count()
|
||||
return
|
||||
|
||||
dataset_profile.rowCount = int(
|
||||
self.dataset.engine.execute(
|
||||
sa.text(
|
||||
f"SELECT c.reltuples AS estimate FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relname = '{table_name}' AND n.nspname = '{schema_name}'"
|
||||
)
|
||||
).scalar()
|
||||
self.dataset.engine.execute(get_estimate_script).scalar()
|
||||
)
|
||||
else:
|
||||
dataset_profile.rowCount = self.dataset.get_row_count()
|
||||
|
||||
@ -118,7 +118,7 @@ class GEProfilingConfig(ConfigModel):
|
||||
profile_table_row_count_estimate_only: bool = Field(
|
||||
default=False,
|
||||
description="Use an approximate query for row count. This will be much faster but slightly "
|
||||
"less accurate. Only supported for Postgres. ",
|
||||
"less accurate. Only supported for Postgres and MySQL. ",
|
||||
)
|
||||
|
||||
# The default of (5 * cpu_count) is adopted from the default max_workers
|
||||
|
||||
@ -0,0 +1,14 @@
|
||||
source:
|
||||
type: mysql
|
||||
config:
|
||||
username: root
|
||||
password: example
|
||||
host_port: localhost:53307
|
||||
database: northwind
|
||||
profiling:
|
||||
enabled: True
|
||||
profile_table_row_count_estimate_only: true
|
||||
sink:
|
||||
type: file
|
||||
config:
|
||||
filename: "./mysql_mces.json"
|
||||
@ -0,0 +1,404 @@
|
||||
[
|
||||
{
|
||||
"entityType": "container",
|
||||
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "containerProperties",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"customProperties": {
|
||||
"platform": "mysql",
|
||||
"env": "PROD",
|
||||
"database": "northwind"
|
||||
},
|
||||
"name": "northwind"
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "container",
|
||||
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "status",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"removed": false
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "container",
|
||||
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "dataPlatformInstance",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"platform": "urn:li:dataPlatform:mysql"
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "container",
|
||||
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "subTypes",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"typeNames": [
|
||||
"Database"
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "container",
|
||||
"entityUrn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "browsePathsV2",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"path": []
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "container",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"container": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"proposedSnapshot": {
|
||||
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
|
||||
"aspects": [
|
||||
{
|
||||
"com.linkedin.pegasus2avro.common.Status": {
|
||||
"removed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {},
|
||||
"name": "customers",
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||
"schemaName": "northwind.customers",
|
||||
"platform": "urn:li:dataPlatform:mysql",
|
||||
"version": 0,
|
||||
"created": {
|
||||
"time": 0,
|
||||
"actor": "urn:li:corpuser:unknown"
|
||||
},
|
||||
"lastModified": {
|
||||
"time": 0,
|
||||
"actor": "urn:li:corpuser:unknown"
|
||||
},
|
||||
"hash": "",
|
||||
"platformSchema": {
|
||||
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||
"tableSchema": ""
|
||||
}
|
||||
},
|
||||
"fields": [
|
||||
{
|
||||
"fieldPath": "id",
|
||||
"nullable": false,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "INTEGER()",
|
||||
"recursive": false,
|
||||
"isPartOfKey": true
|
||||
},
|
||||
{
|
||||
"fieldPath": "company",
|
||||
"nullable": true,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "VARCHAR(length=50)",
|
||||
"recursive": false,
|
||||
"isPartOfKey": false
|
||||
},
|
||||
{
|
||||
"fieldPath": "last_name",
|
||||
"nullable": true,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "VARCHAR(length=50)",
|
||||
"recursive": false,
|
||||
"isPartOfKey": false
|
||||
},
|
||||
{
|
||||
"fieldPath": "first_name",
|
||||
"nullable": true,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "VARCHAR(length=50)",
|
||||
"recursive": false,
|
||||
"isPartOfKey": false
|
||||
},
|
||||
{
|
||||
"fieldPath": "email_address",
|
||||
"nullable": true,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "VARCHAR(length=50)",
|
||||
"recursive": false,
|
||||
"isPartOfKey": false
|
||||
},
|
||||
{
|
||||
"fieldPath": "priority",
|
||||
"nullable": true,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "FLOAT()",
|
||||
"recursive": false,
|
||||
"isPartOfKey": false
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "subTypes",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"typeNames": [
|
||||
"Table"
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "browsePathsV2",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"path": [
|
||||
{
|
||||
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
|
||||
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "container",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"container": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"proposedSnapshot": {
|
||||
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
|
||||
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
|
||||
"aspects": [
|
||||
{
|
||||
"com.linkedin.pegasus2avro.common.Status": {
|
||||
"removed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {},
|
||||
"name": "orders",
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
|
||||
"schemaName": "northwind.orders",
|
||||
"platform": "urn:li:dataPlatform:mysql",
|
||||
"version": 0,
|
||||
"created": {
|
||||
"time": 0,
|
||||
"actor": "urn:li:corpuser:unknown"
|
||||
},
|
||||
"lastModified": {
|
||||
"time": 0,
|
||||
"actor": "urn:li:corpuser:unknown"
|
||||
},
|
||||
"hash": "",
|
||||
"platformSchema": {
|
||||
"com.linkedin.pegasus2avro.schema.MySqlDDL": {
|
||||
"tableSchema": ""
|
||||
}
|
||||
},
|
||||
"fields": [
|
||||
{
|
||||
"fieldPath": "id",
|
||||
"nullable": false,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "INTEGER()",
|
||||
"recursive": false,
|
||||
"isPartOfKey": true
|
||||
},
|
||||
{
|
||||
"fieldPath": "description",
|
||||
"nullable": true,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.StringType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "VARCHAR(length=50)",
|
||||
"recursive": false,
|
||||
"isPartOfKey": false
|
||||
},
|
||||
{
|
||||
"fieldPath": "customer_id",
|
||||
"nullable": false,
|
||||
"type": {
|
||||
"type": {
|
||||
"com.linkedin.pegasus2avro.schema.NumberType": {}
|
||||
}
|
||||
},
|
||||
"nativeDataType": "INTEGER()",
|
||||
"recursive": false,
|
||||
"isPartOfKey": false
|
||||
}
|
||||
],
|
||||
"foreignKeys": [
|
||||
{
|
||||
"name": "fk_order_customer",
|
||||
"foreignFields": [
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD),id)"
|
||||
],
|
||||
"sourceFields": [
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD),customer_id)"
|
||||
],
|
||||
"foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "subTypes",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"typeNames": [
|
||||
"Table"
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "browsePathsV2",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"path": [
|
||||
{
|
||||
"id": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f",
|
||||
"urn": "urn:li:container:dc2ae101b66746b9c2b6df8ee89ca88f"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1586847600000,
|
||||
"runId": "mysql-2020_04_14-07_00_00"
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -48,6 +48,10 @@ def mysql_runner(docker_compose_runner, pytestconfig, test_resources_dir):
|
||||
("mysql_to_file_with_db.yml", "mysql_mces_with_db_golden.json"),
|
||||
("mysql_to_file_no_db.yml", "mysql_mces_no_db_golden.json"),
|
||||
("mysql_profile_table_level_only.yml", "mysql_table_level_only.json"),
|
||||
(
|
||||
"mysql_profile_table_row_count_estimate_only.yml",
|
||||
"mysql_table_row_count_estimate_only.json",
|
||||
),
|
||||
],
|
||||
)
|
||||
@freeze_time(FROZEN_TIME)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user