mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-07-31 05:04:39 +00:00
Refactor converter, unique count, add registry (#12939)
This commit is contained in:
parent
74e8f0b32c
commit
1e8ea26bbd
@ -14,14 +14,12 @@ Unique Count Metric definition
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import NVARCHAR, TEXT, column, func, literal_column
|
||||
from sqlalchemy import column, func
|
||||
from sqlalchemy.orm import DeclarativeMeta, Session
|
||||
|
||||
from metadata.profiler.metrics.core import QueryMetric
|
||||
from metadata.profiler.orm.converter.mssql.converter import cast_dict
|
||||
from metadata.profiler.orm.functions.count import CountFn
|
||||
from metadata.profiler.orm.functions.unique_count import _unique_count_query_mapper
|
||||
from metadata.profiler.orm.registry import NOT_COMPUTE
|
||||
from metadata.profiler.orm.types.custom_image import CustomImage
|
||||
from metadata.utils.logger import profiler_logger
|
||||
|
||||
logger = profiler_logger()
|
||||
@ -58,28 +56,10 @@ class UniqueCount(QueryMetric):
|
||||
|
||||
# Run all queries on top of the sampled data
|
||||
col = column(self.col.name, self.col.type)
|
||||
|
||||
is_mssql = (
|
||||
hasattr(session.bind, "dialect") and session.bind.dialect.name == "mssql"
|
||||
unique_count_query = _unique_count_query_mapper[session.bind.dialect.name](
|
||||
col, session, sample
|
||||
)
|
||||
is_mssql_deprecated_datatype = isinstance(
|
||||
self.col.type, (CustomImage, TEXT, NVARCHAR)
|
||||
)
|
||||
|
||||
count_fn = CountFn(col) if is_mssql and is_mssql_deprecated_datatype else col
|
||||
group_by_col = (
|
||||
func.convert(literal_column(cast_dict.get(type(self.col.type))), col)
|
||||
if is_mssql and is_mssql_deprecated_datatype
|
||||
else col
|
||||
)
|
||||
|
||||
only_once = (
|
||||
session.query(func.count(count_fn))
|
||||
.select_from(sample)
|
||||
.group_by(group_by_col)
|
||||
.having(func.count(count_fn) == 1)
|
||||
)
|
||||
only_once_cte = only_once.cte("only_once")
|
||||
only_once_cte = unique_count_query.cte("only_once")
|
||||
return session.query(func.count().label(self.name())).select_from(only_once_cte)
|
||||
|
||||
def df_fn(self, dfs=None):
|
||||
|
@ -15,13 +15,15 @@ to an SQLAlchemy ORM class.
|
||||
"""
|
||||
from typing import Optional, cast
|
||||
|
||||
import sqlalchemy
|
||||
from sqlalchemy import MetaData
|
||||
from sqlalchemy.orm import DeclarativeMeta, declarative_base
|
||||
|
||||
from metadata.generated.schema.entity.data.database import Database, databaseService
|
||||
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
|
||||
from metadata.generated.schema.entity.data.table import Table
|
||||
from metadata.generated.schema.entity.data.table import Column, Table
|
||||
from metadata.ingestion.ometa.ometa_api import OpenMetadata
|
||||
from metadata.profiler.orm.converter.converter_registry import converter_registry
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
@ -62,6 +64,32 @@ def check_if_should_quote_column_name(table_service_type) -> Optional[bool]:
|
||||
return None
|
||||
|
||||
|
||||
def build_orm_col(idx: int, col: Column, table_service_type) -> sqlalchemy.Column:
|
||||
"""
|
||||
Cook the ORM column from our metadata instance
|
||||
information.
|
||||
|
||||
The first parsed column will be used arbitrarily
|
||||
as the PK, as SQLAlchemy forces us to specify
|
||||
at least one PK.
|
||||
|
||||
As this is only used for INSERT/UPDATE/DELETE,
|
||||
there is no impact for our read-only purposes.
|
||||
"""
|
||||
return sqlalchemy.Column(
|
||||
name=str(col.name.__root__),
|
||||
type_=converter_registry[table_service_type]().map_types(
|
||||
col, table_service_type
|
||||
),
|
||||
primary_key=not bool(idx), # The first col seen is used as PK
|
||||
quote=check_if_should_quote_column_name(table_service_type)
|
||||
or check_snowflake_case_sensitive(table_service_type, col.name.__root__),
|
||||
key=str(
|
||||
col.name.__root__
|
||||
).lower(), # Add lowercase column name as key for snowflake case sensitive columns
|
||||
)
|
||||
|
||||
|
||||
def ometa_to_sqa_orm(
|
||||
table: Table, metadata: OpenMetadata, sqa_metadata_obj: Optional[MetaData] = None
|
||||
) -> DeclarativeMeta:
|
||||
@ -74,9 +102,6 @@ def ometa_to_sqa_orm(
|
||||
`type` and passing SQLAlchemy `Base` class
|
||||
as the bases tuple for inheritance.
|
||||
"""
|
||||
# pylint: disable=import-outside-toplevel,cyclic-import
|
||||
from metadata.profiler.orm.converter.dispatch_converter import build_orm_col
|
||||
|
||||
table.serviceType = cast(
|
||||
databaseService.DatabaseServiceType, table.serviceType
|
||||
) # satisfy mypy
|
||||
|
@ -0,0 +1,26 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Dispatch logic to map an Converter base based on dialect
|
||||
"""
|
||||
from collections import defaultdict
|
||||
|
||||
from metadata.generated.schema.entity.services.databaseService import (
|
||||
DatabaseServiceType,
|
||||
)
|
||||
from metadata.profiler.orm.converter.bigquery.converter import BigqueryMapTypes
|
||||
from metadata.profiler.orm.converter.common import CommonMapTypes
|
||||
from metadata.profiler.orm.converter.snowflake.converter import SnowflakeMapTypes
|
||||
|
||||
converter_registry = defaultdict(lambda: CommonMapTypes)
|
||||
converter_registry[DatabaseServiceType.BigQuery] = BigqueryMapTypes
|
||||
converter_registry[DatabaseServiceType.Snowflake] = SnowflakeMapTypes
|
@ -1,60 +0,0 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Dispatch logic to map an Converter base based on dialect
|
||||
"""
|
||||
import sqlalchemy
|
||||
|
||||
from metadata.generated.schema.entity.data.table import Column
|
||||
from metadata.generated.schema.entity.services.databaseService import (
|
||||
DatabaseServiceType,
|
||||
)
|
||||
from metadata.profiler.orm.converter.bigquery.converter import BigqueryMapTypes
|
||||
from metadata.profiler.orm.converter.snowflake.converter import SnowflakeMapTypes
|
||||
|
||||
converter_registry = {
|
||||
DatabaseServiceType.BigQuery: BigqueryMapTypes,
|
||||
DatabaseServiceType.Snowflake: SnowflakeMapTypes,
|
||||
}
|
||||
|
||||
|
||||
def build_orm_col(idx: int, col: Column, table_service_type) -> sqlalchemy.Column:
|
||||
"""
|
||||
Cook the ORM column from our metadata instance
|
||||
information.
|
||||
|
||||
The first parsed column will be used arbitrarily
|
||||
as the PK, as SQLAlchemy forces us to specify
|
||||
at least one PK.
|
||||
|
||||
As this is only used for INSERT/UPDATE/DELETE,
|
||||
there is no impact for our read-only purposes.
|
||||
"""
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from metadata.profiler.orm.converter.base import (
|
||||
check_if_should_quote_column_name,
|
||||
check_snowflake_case_sensitive,
|
||||
)
|
||||
from metadata.profiler.orm.converter.common import CommonMapTypes
|
||||
|
||||
return sqlalchemy.Column(
|
||||
name=str(col.name.__root__),
|
||||
type_=converter_registry.get(table_service_type, CommonMapTypes)().map_types(
|
||||
col, table_service_type
|
||||
),
|
||||
primary_key=not bool(idx), # The first col seen is used as PK
|
||||
quote=check_if_should_quote_column_name(table_service_type)
|
||||
or check_snowflake_case_sensitive(table_service_type, col.name.__root__),
|
||||
key=str(
|
||||
col.name.__root__
|
||||
).lower(), # Add lowercase column name as key for snowflake case sensitive columns
|
||||
)
|
@ -1,14 +0,0 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Registry logic to map an Converter base based on dialect
|
||||
"""
|
@ -10,11 +10,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Define Random Number function
|
||||
|
||||
Returns a column with random values
|
||||
between 0 and 100 to help us draw sample
|
||||
data.
|
||||
Define Sum function
|
||||
"""
|
||||
|
||||
from sqlalchemy.ext.compiler import compiles
|
||||
|
@ -0,0 +1,57 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Unique Count Metric functions
|
||||
"""
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from sqlalchemy import NVARCHAR, TEXT, func, literal_column
|
||||
|
||||
from metadata.profiler.orm.converter.mssql.converter import cast_dict
|
||||
from metadata.profiler.orm.functions.count import CountFn
|
||||
from metadata.profiler.orm.registry import Dialects
|
||||
from metadata.profiler.orm.types.custom_image import CustomImage
|
||||
|
||||
|
||||
def _unique_count_query(col, session, sample):
|
||||
return (
|
||||
session.query(func.count(col))
|
||||
.select_from(sample)
|
||||
.group_by(col)
|
||||
.having(func.count(col) == 1)
|
||||
)
|
||||
|
||||
|
||||
def _unique_count_query_mssql(col, session, sample):
|
||||
# The ntext, text, and image data types will be removed in a future version of SQL Server.
|
||||
# Avoid using these data types in new development work, and plan to modify applications that currently use them.
|
||||
# Use nvarchar(max), varchar(max), and varbinary(max) instead.
|
||||
# ref:https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver16
|
||||
is_mssql_deprecated_datatype = isinstance(col.type, (CustomImage, TEXT, NVARCHAR))
|
||||
|
||||
if is_mssql_deprecated_datatype:
|
||||
count_fn = CountFn(col)
|
||||
group_by_col = func.convert(literal_column(cast_dict.get(type(col.type))), col)
|
||||
else:
|
||||
count_fn = col
|
||||
group_by_col = col
|
||||
return (
|
||||
session.query(func.count(count_fn))
|
||||
.select_from(sample)
|
||||
.group_by(group_by_col)
|
||||
.having(func.count(count_fn) == 1)
|
||||
)
|
||||
|
||||
|
||||
_unique_count_query_mapper = defaultdict(lambda: _unique_count_query)
|
||||
_unique_count_query_mapper[Dialects.MSSQL] = _unique_count_query_mssql
|
Loading…
x
Reference in New Issue
Block a user