mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-08-01 21:57:55 +00:00
Refactor converter, unique count, add registry (#12939)
This commit is contained in:
parent
74e8f0b32c
commit
1e8ea26bbd
@ -14,14 +14,12 @@ Unique Count Metric definition
|
|||||||
"""
|
"""
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from sqlalchemy import NVARCHAR, TEXT, column, func, literal_column
|
from sqlalchemy import column, func
|
||||||
from sqlalchemy.orm import DeclarativeMeta, Session
|
from sqlalchemy.orm import DeclarativeMeta, Session
|
||||||
|
|
||||||
from metadata.profiler.metrics.core import QueryMetric
|
from metadata.profiler.metrics.core import QueryMetric
|
||||||
from metadata.profiler.orm.converter.mssql.converter import cast_dict
|
from metadata.profiler.orm.functions.unique_count import _unique_count_query_mapper
|
||||||
from metadata.profiler.orm.functions.count import CountFn
|
|
||||||
from metadata.profiler.orm.registry import NOT_COMPUTE
|
from metadata.profiler.orm.registry import NOT_COMPUTE
|
||||||
from metadata.profiler.orm.types.custom_image import CustomImage
|
|
||||||
from metadata.utils.logger import profiler_logger
|
from metadata.utils.logger import profiler_logger
|
||||||
|
|
||||||
logger = profiler_logger()
|
logger = profiler_logger()
|
||||||
@ -58,28 +56,10 @@ class UniqueCount(QueryMetric):
|
|||||||
|
|
||||||
# Run all queries on top of the sampled data
|
# Run all queries on top of the sampled data
|
||||||
col = column(self.col.name, self.col.type)
|
col = column(self.col.name, self.col.type)
|
||||||
|
unique_count_query = _unique_count_query_mapper[session.bind.dialect.name](
|
||||||
is_mssql = (
|
col, session, sample
|
||||||
hasattr(session.bind, "dialect") and session.bind.dialect.name == "mssql"
|
|
||||||
)
|
)
|
||||||
is_mssql_deprecated_datatype = isinstance(
|
only_once_cte = unique_count_query.cte("only_once")
|
||||||
self.col.type, (CustomImage, TEXT, NVARCHAR)
|
|
||||||
)
|
|
||||||
|
|
||||||
count_fn = CountFn(col) if is_mssql and is_mssql_deprecated_datatype else col
|
|
||||||
group_by_col = (
|
|
||||||
func.convert(literal_column(cast_dict.get(type(self.col.type))), col)
|
|
||||||
if is_mssql and is_mssql_deprecated_datatype
|
|
||||||
else col
|
|
||||||
)
|
|
||||||
|
|
||||||
only_once = (
|
|
||||||
session.query(func.count(count_fn))
|
|
||||||
.select_from(sample)
|
|
||||||
.group_by(group_by_col)
|
|
||||||
.having(func.count(count_fn) == 1)
|
|
||||||
)
|
|
||||||
only_once_cte = only_once.cte("only_once")
|
|
||||||
return session.query(func.count().label(self.name())).select_from(only_once_cte)
|
return session.query(func.count().label(self.name())).select_from(only_once_cte)
|
||||||
|
|
||||||
def df_fn(self, dfs=None):
|
def df_fn(self, dfs=None):
|
||||||
|
@ -15,13 +15,15 @@ to an SQLAlchemy ORM class.
|
|||||||
"""
|
"""
|
||||||
from typing import Optional, cast
|
from typing import Optional, cast
|
||||||
|
|
||||||
|
import sqlalchemy
|
||||||
from sqlalchemy import MetaData
|
from sqlalchemy import MetaData
|
||||||
from sqlalchemy.orm import DeclarativeMeta, declarative_base
|
from sqlalchemy.orm import DeclarativeMeta, declarative_base
|
||||||
|
|
||||||
from metadata.generated.schema.entity.data.database import Database, databaseService
|
from metadata.generated.schema.entity.data.database import Database, databaseService
|
||||||
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
|
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
|
||||||
from metadata.generated.schema.entity.data.table import Table
|
from metadata.generated.schema.entity.data.table import Column, Table
|
||||||
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||||
|
from metadata.profiler.orm.converter.converter_registry import converter_registry
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|
||||||
@ -62,6 +64,32 @@ def check_if_should_quote_column_name(table_service_type) -> Optional[bool]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def build_orm_col(idx: int, col: Column, table_service_type) -> sqlalchemy.Column:
|
||||||
|
"""
|
||||||
|
Cook the ORM column from our metadata instance
|
||||||
|
information.
|
||||||
|
|
||||||
|
The first parsed column will be used arbitrarily
|
||||||
|
as the PK, as SQLAlchemy forces us to specify
|
||||||
|
at least one PK.
|
||||||
|
|
||||||
|
As this is only used for INSERT/UPDATE/DELETE,
|
||||||
|
there is no impact for our read-only purposes.
|
||||||
|
"""
|
||||||
|
return sqlalchemy.Column(
|
||||||
|
name=str(col.name.__root__),
|
||||||
|
type_=converter_registry[table_service_type]().map_types(
|
||||||
|
col, table_service_type
|
||||||
|
),
|
||||||
|
primary_key=not bool(idx), # The first col seen is used as PK
|
||||||
|
quote=check_if_should_quote_column_name(table_service_type)
|
||||||
|
or check_snowflake_case_sensitive(table_service_type, col.name.__root__),
|
||||||
|
key=str(
|
||||||
|
col.name.__root__
|
||||||
|
).lower(), # Add lowercase column name as key for snowflake case sensitive columns
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def ometa_to_sqa_orm(
|
def ometa_to_sqa_orm(
|
||||||
table: Table, metadata: OpenMetadata, sqa_metadata_obj: Optional[MetaData] = None
|
table: Table, metadata: OpenMetadata, sqa_metadata_obj: Optional[MetaData] = None
|
||||||
) -> DeclarativeMeta:
|
) -> DeclarativeMeta:
|
||||||
@ -74,9 +102,6 @@ def ometa_to_sqa_orm(
|
|||||||
`type` and passing SQLAlchemy `Base` class
|
`type` and passing SQLAlchemy `Base` class
|
||||||
as the bases tuple for inheritance.
|
as the bases tuple for inheritance.
|
||||||
"""
|
"""
|
||||||
# pylint: disable=import-outside-toplevel,cyclic-import
|
|
||||||
from metadata.profiler.orm.converter.dispatch_converter import build_orm_col
|
|
||||||
|
|
||||||
table.serviceType = cast(
|
table.serviceType = cast(
|
||||||
databaseService.DatabaseServiceType, table.serviceType
|
databaseService.DatabaseServiceType, table.serviceType
|
||||||
) # satisfy mypy
|
) # satisfy mypy
|
||||||
|
@ -0,0 +1,26 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Dispatch logic to map an Converter base based on dialect
|
||||||
|
"""
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from metadata.generated.schema.entity.services.databaseService import (
|
||||||
|
DatabaseServiceType,
|
||||||
|
)
|
||||||
|
from metadata.profiler.orm.converter.bigquery.converter import BigqueryMapTypes
|
||||||
|
from metadata.profiler.orm.converter.common import CommonMapTypes
|
||||||
|
from metadata.profiler.orm.converter.snowflake.converter import SnowflakeMapTypes
|
||||||
|
|
||||||
|
converter_registry = defaultdict(lambda: CommonMapTypes)
|
||||||
|
converter_registry[DatabaseServiceType.BigQuery] = BigqueryMapTypes
|
||||||
|
converter_registry[DatabaseServiceType.Snowflake] = SnowflakeMapTypes
|
@ -1,60 +0,0 @@
|
|||||||
# Copyright 2021 Collate
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Dispatch logic to map an Converter base based on dialect
|
|
||||||
"""
|
|
||||||
import sqlalchemy
|
|
||||||
|
|
||||||
from metadata.generated.schema.entity.data.table import Column
|
|
||||||
from metadata.generated.schema.entity.services.databaseService import (
|
|
||||||
DatabaseServiceType,
|
|
||||||
)
|
|
||||||
from metadata.profiler.orm.converter.bigquery.converter import BigqueryMapTypes
|
|
||||||
from metadata.profiler.orm.converter.snowflake.converter import SnowflakeMapTypes
|
|
||||||
|
|
||||||
converter_registry = {
|
|
||||||
DatabaseServiceType.BigQuery: BigqueryMapTypes,
|
|
||||||
DatabaseServiceType.Snowflake: SnowflakeMapTypes,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def build_orm_col(idx: int, col: Column, table_service_type) -> sqlalchemy.Column:
|
|
||||||
"""
|
|
||||||
Cook the ORM column from our metadata instance
|
|
||||||
information.
|
|
||||||
|
|
||||||
The first parsed column will be used arbitrarily
|
|
||||||
as the PK, as SQLAlchemy forces us to specify
|
|
||||||
at least one PK.
|
|
||||||
|
|
||||||
As this is only used for INSERT/UPDATE/DELETE,
|
|
||||||
there is no impact for our read-only purposes.
|
|
||||||
"""
|
|
||||||
# pylint: disable=import-outside-toplevel
|
|
||||||
from metadata.profiler.orm.converter.base import (
|
|
||||||
check_if_should_quote_column_name,
|
|
||||||
check_snowflake_case_sensitive,
|
|
||||||
)
|
|
||||||
from metadata.profiler.orm.converter.common import CommonMapTypes
|
|
||||||
|
|
||||||
return sqlalchemy.Column(
|
|
||||||
name=str(col.name.__root__),
|
|
||||||
type_=converter_registry.get(table_service_type, CommonMapTypes)().map_types(
|
|
||||||
col, table_service_type
|
|
||||||
),
|
|
||||||
primary_key=not bool(idx), # The first col seen is used as PK
|
|
||||||
quote=check_if_should_quote_column_name(table_service_type)
|
|
||||||
or check_snowflake_case_sensitive(table_service_type, col.name.__root__),
|
|
||||||
key=str(
|
|
||||||
col.name.__root__
|
|
||||||
).lower(), # Add lowercase column name as key for snowflake case sensitive columns
|
|
||||||
)
|
|
@ -1,14 +0,0 @@
|
|||||||
# Copyright 2021 Collate
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Registry logic to map an Converter base based on dialect
|
|
||||||
"""
|
|
@ -10,11 +10,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Define Random Number function
|
Define Sum function
|
||||||
|
|
||||||
Returns a column with random values
|
|
||||||
between 0 and 100 to help us draw sample
|
|
||||||
data.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from sqlalchemy.ext.compiler import compiles
|
from sqlalchemy.ext.compiler import compiles
|
||||||
|
@ -0,0 +1,57 @@
|
|||||||
|
# Copyright 2021 Collate
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Unique Count Metric functions
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from sqlalchemy import NVARCHAR, TEXT, func, literal_column
|
||||||
|
|
||||||
|
from metadata.profiler.orm.converter.mssql.converter import cast_dict
|
||||||
|
from metadata.profiler.orm.functions.count import CountFn
|
||||||
|
from metadata.profiler.orm.registry import Dialects
|
||||||
|
from metadata.profiler.orm.types.custom_image import CustomImage
|
||||||
|
|
||||||
|
|
||||||
|
def _unique_count_query(col, session, sample):
|
||||||
|
return (
|
||||||
|
session.query(func.count(col))
|
||||||
|
.select_from(sample)
|
||||||
|
.group_by(col)
|
||||||
|
.having(func.count(col) == 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _unique_count_query_mssql(col, session, sample):
|
||||||
|
# The ntext, text, and image data types will be removed in a future version of SQL Server.
|
||||||
|
# Avoid using these data types in new development work, and plan to modify applications that currently use them.
|
||||||
|
# Use nvarchar(max), varchar(max), and varbinary(max) instead.
|
||||||
|
# ref:https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver16
|
||||||
|
is_mssql_deprecated_datatype = isinstance(col.type, (CustomImage, TEXT, NVARCHAR))
|
||||||
|
|
||||||
|
if is_mssql_deprecated_datatype:
|
||||||
|
count_fn = CountFn(col)
|
||||||
|
group_by_col = func.convert(literal_column(cast_dict.get(type(col.type))), col)
|
||||||
|
else:
|
||||||
|
count_fn = col
|
||||||
|
group_by_col = col
|
||||||
|
return (
|
||||||
|
session.query(func.count(count_fn))
|
||||||
|
.select_from(sample)
|
||||||
|
.group_by(group_by_col)
|
||||||
|
.having(func.count(count_fn) == 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_unique_count_query_mapper = defaultdict(lambda: _unique_count_query)
|
||||||
|
_unique_count_query_mapper[Dialects.MSSQL] = _unique_count_query_mssql
|
Loading…
x
Reference in New Issue
Block a user