mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-11-02 11:39:12 +00:00
Add Clickhouse profiler fix (#12531)
This commit is contained in:
parent
cd347299d7
commit
246bf15476
@ -48,6 +48,9 @@ Map = create_sqlalchemy_type("Map")
|
||||
Array = create_sqlalchemy_type("Array")
|
||||
Enum = create_sqlalchemy_type("Enum")
|
||||
Tuple = create_sqlalchemy_type("Tuple")
|
||||
BIGINT = create_sqlalchemy_type("BIGINT")
|
||||
SMALLINT = create_sqlalchemy_type("SMALLINT")
|
||||
INTEGER = create_sqlalchemy_type("INTEGER")
|
||||
|
||||
ischema_names.update(
|
||||
{
|
||||
@ -58,18 +61,20 @@ ischema_names.update(
|
||||
"Enum": Enum,
|
||||
"Date32": Date,
|
||||
"SimpleAggregateFunction": create_sqlalchemy_type("SimpleAggregateFunction"),
|
||||
"Int256": create_sqlalchemy_type("BIGINT"),
|
||||
"Int128": create_sqlalchemy_type("BIGINT"),
|
||||
"Int64": create_sqlalchemy_type("BIGINT"),
|
||||
"Int32": create_sqlalchemy_type("INTEGER"),
|
||||
"Int16": create_sqlalchemy_type("SMALLINT"),
|
||||
"Int8": create_sqlalchemy_type("SMALLINT"),
|
||||
"UInt256": create_sqlalchemy_type("BIGINT"),
|
||||
"UInt128": create_sqlalchemy_type("BIGINT"),
|
||||
"UInt64": create_sqlalchemy_type("BIGINT"),
|
||||
"UInt32": create_sqlalchemy_type("INTEGER"),
|
||||
"UInt16": create_sqlalchemy_type("SMALLINT"),
|
||||
"UInt8": create_sqlalchemy_type("SMALLINT"),
|
||||
"Int256": BIGINT,
|
||||
"Int128": BIGINT,
|
||||
"Int64": BIGINT,
|
||||
"Int32": INTEGER,
|
||||
"Int16": SMALLINT,
|
||||
"Int8": SMALLINT,
|
||||
"UInt256": BIGINT,
|
||||
"UInt128": BIGINT,
|
||||
"UInt64": BIGINT,
|
||||
"UInt32": INTEGER,
|
||||
"UInt16": SMALLINT,
|
||||
"UInt8": SMALLINT,
|
||||
"IPv4": create_sqlalchemy_type("IPv4"),
|
||||
"IPv6": create_sqlalchemy_type("IPv6"),
|
||||
}
|
||||
)
|
||||
|
||||
@ -109,9 +114,6 @@ def _get_column_type(
|
||||
if spec.startswith("DateTime"):
|
||||
return self.ischema_names["DateTime"]
|
||||
|
||||
if spec.startswith("IP"):
|
||||
return self.ischema_names["String"]
|
||||
|
||||
if spec.lower().startswith("decimal"):
|
||||
coltype = self.ischema_names["Decimal"]
|
||||
return coltype(*self._parse_decimal_params(spec))
|
||||
|
||||
@ -228,6 +228,8 @@ class ColumnTypeParser:
|
||||
"LOWCARDINALITY": "LOWCARDINALITY",
|
||||
"DATETIME64": "DATETIME",
|
||||
"SimpleAggregateFunction()": "AGGREGATEFUNCTION",
|
||||
"IPV4": "IPV4",
|
||||
"IPV6": "IPV6",
|
||||
# Databricks
|
||||
"VOID": "NULL",
|
||||
# mysql
|
||||
@ -293,7 +295,7 @@ class ColumnTypeParser:
|
||||
for func in [
|
||||
ColumnTypeParser.get_column_type_mapping,
|
||||
ColumnTypeParser.get_source_type_mapping,
|
||||
ColumnTypeParser.get_source_type_containes_brackets,
|
||||
ColumnTypeParser.get_source_type_contains_brackets,
|
||||
]:
|
||||
column_type_result = func(column_type)
|
||||
if column_type_result:
|
||||
@ -309,7 +311,7 @@ class ColumnTypeParser:
|
||||
return ColumnTypeParser._SOURCE_TYPE_TO_OM_TYPE.get(str(column_type), None)
|
||||
|
||||
@staticmethod
|
||||
def get_source_type_containes_brackets(column_type: Any) -> str:
|
||||
def get_source_type_contains_brackets(column_type: Any) -> str:
|
||||
return ColumnTypeParser._SOURCE_TYPE_TO_OM_TYPE.get(
|
||||
str(column_type).split("(", maxsplit=1)[0].split("<")[0].upper(), None
|
||||
)
|
||||
|
||||
@ -254,6 +254,9 @@ class SqlColumnHandlerMixin:
|
||||
precision = ColumnTypeParser.check_col_precision(
|
||||
col_type, column["type"]
|
||||
)
|
||||
# Clickhouse nullable if true, data type as Null
|
||||
if column.get("nullable"):
|
||||
col_type = DataType.NULL.name
|
||||
if col_type is None:
|
||||
col_type = DataType.UNKNOWN.name
|
||||
data_type_display = col_type.lower()
|
||||
|
||||
@ -167,7 +167,7 @@ class Histogram(HybridMetric):
|
||||
if is_concatenable(self.col.type):
|
||||
col = LenFn(column(self.col.name, self.col.type))
|
||||
else:
|
||||
col = column(self.col.name) # type: ignore
|
||||
col = column(self.col.name, self.col.type) # type: ignore
|
||||
|
||||
case_stmts = []
|
||||
for bin_num in range(num_bins):
|
||||
|
||||
@ -56,7 +56,11 @@ class CountInSet(StaticMetric):
|
||||
|
||||
try:
|
||||
set_values = set(self.values)
|
||||
return SumFn(case([(column(self.col.name).in_(set_values), 1)], else_=0))
|
||||
return SumFn(
|
||||
case(
|
||||
[(column(self.col.name, self.col.type).in_(set_values), 1)], else_=0
|
||||
)
|
||||
)
|
||||
|
||||
except Exception as exc: # pylint: disable=broad-except
|
||||
logger.debug(traceback.format_exc())
|
||||
|
||||
@ -47,4 +47,9 @@ class ILikeCount(StaticMetric):
|
||||
raise AttributeError(
|
||||
"ILike Count requires an expression to be set: add_props(expression=...)(Metrics.ILIKE_COUNT)"
|
||||
)
|
||||
return SumFn(case([(column(self.col.name).ilike(self.expression), 1)], else_=0))
|
||||
return SumFn(
|
||||
case(
|
||||
[(column(self.col.name, self.col.type).ilike(self.expression), 1)],
|
||||
else_=0,
|
||||
)
|
||||
)
|
||||
|
||||
@ -47,4 +47,9 @@ class LikeCount(StaticMetric):
|
||||
raise AttributeError(
|
||||
"Like Count requires an expression to be set: add_props(expression=...)(Metrics.LIKE_COUNT)"
|
||||
)
|
||||
return SumFn(case([(column(self.col.name).like(self.expression), 1)], else_=0))
|
||||
return SumFn(
|
||||
case(
|
||||
[(column(self.col.name, self.col.type).like(self.expression), 1)],
|
||||
else_=0,
|
||||
)
|
||||
)
|
||||
|
||||
@ -57,7 +57,7 @@ class Max(StaticMetric):
|
||||
return MaxFn(LenFn(column(self.col.name, self.col.type)))
|
||||
if (not is_quantifiable(self.col.type)) and (not is_date_time(self.col.type)):
|
||||
return None
|
||||
return MaxFn(column(self.col.name))
|
||||
return MaxFn(column(self.col.name, self.col.type))
|
||||
|
||||
def df_fn(self, dfs=None):
|
||||
"""pandas function"""
|
||||
|
||||
@ -74,7 +74,7 @@ class Mean(StaticMetric):
|
||||
def fn(self):
|
||||
"""sqlalchemy function"""
|
||||
if is_quantifiable(self.col.type):
|
||||
return func.avg(column(self.col.name))
|
||||
return func.avg(column(self.col.name, self.col.type))
|
||||
|
||||
if is_concatenable(self.col.type):
|
||||
return func.avg(LenFn(column(self.col.name, self.col.type)))
|
||||
|
||||
@ -57,7 +57,7 @@ class Min(StaticMetric):
|
||||
|
||||
if (not is_quantifiable(self.col.type)) and (not is_date_time(self.col.type)):
|
||||
return None
|
||||
return MinFn(column(self.col.name))
|
||||
return MinFn(column(self.col.name, self.col.type))
|
||||
|
||||
def df_fn(self, dfs=None):
|
||||
"""pandas function"""
|
||||
|
||||
@ -48,5 +48,8 @@ class NotLikeCount(StaticMetric):
|
||||
"Not Like Count requires an expression to be set: add_props(expression=...)(Metrics.NOT_LIKE_COUNT)"
|
||||
)
|
||||
return SumFn(
|
||||
case([(column(self.col.name).not_like(self.expression), 0)], else_=1)
|
||||
case(
|
||||
[(column(self.col.name, self.col.type).not_like(self.expression), 0)],
|
||||
else_=1,
|
||||
)
|
||||
)
|
||||
|
||||
@ -54,7 +54,16 @@ class NotRegexCount(StaticMetric):
|
||||
)
|
||||
return SumFn(
|
||||
case(
|
||||
[(not_(column(self.col.name).regexp_match(self.expression)), 0)],
|
||||
[
|
||||
(
|
||||
not_(
|
||||
column(self.col.name, self.col.type).regexp_match(
|
||||
self.expression
|
||||
)
|
||||
),
|
||||
0,
|
||||
)
|
||||
],
|
||||
else_=1,
|
||||
)
|
||||
)
|
||||
|
||||
@ -47,7 +47,9 @@ class NullCount(StaticMetric):
|
||||
@_label
|
||||
def fn(self):
|
||||
"""sqlalchemy function"""
|
||||
return SumFn(case([(column(self.col.name).is_(None), 1)], else_=0))
|
||||
return SumFn(
|
||||
case([(column(self.col.name, self.col.type).is_(None), 1)], else_=0)
|
||||
)
|
||||
|
||||
def df_fn(self, dfs=None):
|
||||
"""pandas function"""
|
||||
|
||||
@ -53,7 +53,17 @@ class RegexCount(StaticMetric):
|
||||
"Regex Count requires an expression to be set: add_props(expression=...)(Metrics.REGEX_COUNT)"
|
||||
)
|
||||
return SumFn(
|
||||
case([(column(self.col.name).regexp_match(self.expression), 1)], else_=0)
|
||||
case(
|
||||
[
|
||||
(
|
||||
column(self.col.name, self.col.type).regexp_match(
|
||||
self.expression
|
||||
),
|
||||
1,
|
||||
)
|
||||
],
|
||||
else_=0,
|
||||
)
|
||||
)
|
||||
|
||||
def df_fn(self, dfs):
|
||||
|
||||
@ -83,7 +83,7 @@ class StdDev(StaticMetric):
|
||||
def fn(self):
|
||||
"""sqlalchemy function"""
|
||||
if is_quantifiable(self.col.type):
|
||||
return StdDevFn(column(self.col.name))
|
||||
return StdDevFn(column(self.col.name, self.col.type))
|
||||
|
||||
if is_concatenable(self.col.type):
|
||||
return StdDevFn(LenFn(column(self.col.name, self.col.type)))
|
||||
|
||||
@ -39,7 +39,7 @@ class Sum(StaticMetric):
|
||||
def fn(self):
|
||||
"""sqlalchemy function"""
|
||||
if is_quantifiable(self.col.type):
|
||||
return SumFn(column(self.col.name))
|
||||
return SumFn(column(self.col.name, self.col.type))
|
||||
|
||||
if is_concatenable(self.col.type):
|
||||
return SumFn(LenFn(column(self.col.name, self.col.type)))
|
||||
|
||||
@ -54,7 +54,7 @@ class FirstQuartile(StaticMetric):
|
||||
if is_quantifiable(self.col.type):
|
||||
# col fullname is only needed for MySQL and SQLite
|
||||
return MedianFn(
|
||||
column(self.col.name),
|
||||
column(self.col.name, self.col.type),
|
||||
self.col.table.fullname if self.col.table is not None else None,
|
||||
0.25,
|
||||
)
|
||||
|
||||
@ -54,7 +54,7 @@ class Median(StaticMetric):
|
||||
if is_quantifiable(self.col.type):
|
||||
# col fullname is only needed for MySQL and SQLite
|
||||
return MedianFn(
|
||||
column(self.col.name),
|
||||
column(self.col.name, self.col.type),
|
||||
self.col.table.fullname if self.col.table is not None else None,
|
||||
0.5,
|
||||
)
|
||||
|
||||
@ -54,7 +54,7 @@ class ThirdQuartile(StaticMetric):
|
||||
if is_quantifiable(self.col.type):
|
||||
# col fullname is only needed for MySQL and SQLite
|
||||
return MedianFn(
|
||||
column(self.col.name),
|
||||
column(self.col.name, self.col.type),
|
||||
self.col.table.fullname if self.col.table is not None else None,
|
||||
0.75,
|
||||
)
|
||||
|
||||
@ -65,6 +65,8 @@ class CommonMapTypes:
|
||||
DataType.BYTEA: CustomTypes.BYTEA.value,
|
||||
DataType.NTEXT: sqlalchemy.NVARCHAR,
|
||||
DataType.IMAGE: CustomTypes.IMAGE.value,
|
||||
DataType.IPV4: CustomTypes.IP.value,
|
||||
DataType.IPV6: CustomTypes.IP.value,
|
||||
}
|
||||
|
||||
def map_types(self, col: Column, table_service_type):
|
||||
|
||||
@ -61,6 +61,8 @@ def _(element, compiler, **kw):
|
||||
@compiles(LenFn, Dialects.ClickHouse)
|
||||
def _(element, compiler, **kw):
|
||||
"""Handles lenght function for ClickHouse"""
|
||||
if isinstance(element.clauses.clauses[0].type, sqltypes.Enum):
|
||||
return "length(cast(%s, 'String'))" % compiler.process(element.clauses, **kw)
|
||||
return "length(%s)" % compiler.process(element.clauses, **kw)
|
||||
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@ Define Median function
|
||||
# pylint: disable=consider-using-f-string,duplicate-code
|
||||
from sqlalchemy.ext.compiler import compiles
|
||||
from sqlalchemy.sql.functions import FunctionElement
|
||||
from sqlalchemy.sql.sqltypes import DECIMAL
|
||||
|
||||
from metadata.profiler.metrics.core import CACHE
|
||||
from metadata.profiler.orm.registry import Dialects
|
||||
@ -48,7 +49,11 @@ def _(elements, compiler, **kwargs):
|
||||
col, _, percentile = [
|
||||
compiler.process(element, **kwargs) for element in elements.clauses
|
||||
]
|
||||
return "if(isNaN(quantile(%s)(%s)),null,quantile(%s)(%s))" % ((percentile, col) * 2)
|
||||
quantile_str = f"quantile({percentile})({col})"
|
||||
null_check = (
|
||||
"isNull" if isinstance(elements.clauses.clauses[0].type, DECIMAL) else "isNaN"
|
||||
)
|
||||
return f"if({null_check}({quantile_str}), null, {quantile_str})"
|
||||
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
@ -22,6 +22,7 @@ from metadata.ingestion.source import sqa_types
|
||||
from metadata.profiler.orm.types.bytea_to_string import ByteaToHex
|
||||
from metadata.profiler.orm.types.custom_array import CustomArray
|
||||
from metadata.profiler.orm.types.custom_image import CustomImage
|
||||
from metadata.profiler.orm.types.custom_ip import CustomIP
|
||||
from metadata.profiler.orm.types.custom_timestamp import CustomTimestamp
|
||||
from metadata.profiler.orm.types.hex_byte_string import HexByteString
|
||||
from metadata.profiler.orm.types.uuid import UUIDString
|
||||
@ -36,6 +37,7 @@ class CustomTypes(TypeRegistry):
|
||||
ARRAY = CustomArray
|
||||
TIMESTAMP = CustomTimestamp
|
||||
IMAGE = CustomImage
|
||||
IP = CustomIP
|
||||
|
||||
|
||||
class Dialects(Enum):
|
||||
|
||||
30
ingestion/src/metadata/profiler/orm/types/custom_ip.py
Normal file
30
ingestion/src/metadata/profiler/orm/types/custom_ip.py
Normal file
@ -0,0 +1,30 @@
|
||||
# Copyright 2021 Collate
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# pylint: disable=abstract-method
|
||||
|
||||
"""
|
||||
Expand sqlalchemy types to map them to OpenMetadata DataType
|
||||
"""
|
||||
from sqlalchemy.sql.sqltypes import String, TypeDecorator
|
||||
|
||||
from metadata.utils.logger import profiler_logger
|
||||
|
||||
logger = profiler_logger()
|
||||
|
||||
|
||||
class CustomIP(TypeDecorator):
|
||||
"""
|
||||
Convert RowVersion
|
||||
"""
|
||||
|
||||
impl = String
|
||||
cache_ok = True
|
||||
@ -148,7 +148,9 @@
|
||||
"SPATIAL",
|
||||
"TABLE",
|
||||
"NTEXT",
|
||||
"IMAGE"
|
||||
"IMAGE",
|
||||
"IPV4",
|
||||
"IPV6"
|
||||
]
|
||||
},
|
||||
"constraint": {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user