2022-07-06 10:12:29 +02:00
|
|
|
# Copyright 2021 Collate
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
"""
|
|
|
|
Define Median function
|
|
|
|
"""
|
|
|
|
# Keep SQA docs style defining custom constructs
|
|
|
|
# pylint: disable=consider-using-f-string,duplicate-code
|
|
|
|
from sqlalchemy.ext.compiler import compiles
|
|
|
|
from sqlalchemy.sql.functions import FunctionElement
|
|
|
|
|
2023-03-01 08:20:38 +01:00
|
|
|
from metadata.profiler.metrics.core import CACHE
|
|
|
|
from metadata.profiler.orm.registry import Dialects
|
2022-07-06 10:12:29 +02:00
|
|
|
from metadata.utils.logger import profiler_logger
|
|
|
|
|
|
|
|
logger = profiler_logger()
|
|
|
|
|
|
|
|
|
|
|
|
class MedianFn(FunctionElement):
    """SQL construct whose rendering is chosen per dialect by the compilers below.

    The compilers registered for this element read its clauses by position:
    clause 0 is the column, clause 1 carries a value the SQLite compiler uses
    as the table name, and clause 2 carries the percentile.
    """

    # SQLAlchemy statement-caching flag; its value comes from the
    # profiler-wide CACHE constant imported at the top of the module.
    inherit_cache = CACHE
|
|
|
|
|
|
|
|
|
|
|
|
@compiles(MedianFn)
def _(elements, compiler, **kwargs):  # pylint: disable=unused-argument
    """Default percentile rendering, used when no dialect-specific compiler matches.

    Args:
        elements: the MedianFn being compiled; clause 0 is the column and
            clause 2 is a literal whose ``value`` is the percentile.
        compiler: SQLAlchemy compiler used to render the column expression.
        **kwargs: compilation flags handed over by SQLAlchemy (not used here).

    Returns:
        str: ``percentile_cont(<percentile>) WITHIN GROUP (ORDER BY <col> ASC)``.
    """
    col = compiler.process(elements.clauses.clauses[0])
    percentile = elements.clauses.clauses[2].value
    # "%.1f" rendered 0.25 as "0.2" and 0.75 as "0.8", silently computing the
    # wrong percentile. Rendering the float itself keeps the requested value
    # while 0.5 still comes out as "0.5".
    return "percentile_cont(%s) WITHIN GROUP (ORDER BY %s ASC)" % (
        float(percentile),
        col,
    )
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
|
|
|
@compiles(MedianFn, Dialects.BigQuery)
def _(elements, compiler, **kwargs):
    """Percentile rendering for BigQuery.

    All three clauses (column, table, percentile) are compiled with the
    flags SQLAlchemy passed in; the compiled table clause is not used in
    the output.

    Returns:
        str: ``percentile_cont(<col> , <percentile>) OVER()``.
    """
    rendered = [compiler.process(clause, **kwargs) for clause in elements.clauses]
    column_sql, _table_sql, fraction_sql = rendered
    return "percentile_cont(%s , %s) OVER()" % (column_sql, fraction_sql)
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
|
|
|
@compiles(MedianFn, Dialects.ClickHouse)
def _(elements, compiler, **kwargs):
    """Percentile rendering for ClickHouse.

    Every clause of the element is compiled; only the column and the
    percentile end up in the generated expression.

    Returns:
        str: ``quantile(<percentile>)(<col>)``.
    """
    rendered = [compiler.process(clause, **kwargs) for clause in elements.clauses]
    column_sql, _table_sql, level_sql = rendered
    return "quantile(%s)(%s)" % (level_sql, column_sql)
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
2022-10-11 15:57:25 +02:00
|
|
|
# pylint: disable=unused-argument
|
2022-12-13 13:03:22 +01:00
|
|
|
@compiles(MedianFn, Dialects.Athena)
@compiles(MedianFn, Dialects.Trino)
@compiles(MedianFn, Dialects.Presto)
def _(elements, compiler, **kwargs):
    """Percentile rendering shared by Athena, Trino and Presto.

    Args:
        elements: the MedianFn being compiled; clause 0 is the column (only
            its ``name`` is used) and clause 2 is a literal whose ``value``
            is the percentile.
        compiler: SQLAlchemy compiler (not used, the column name is taken as is).
        **kwargs: compilation flags handed over by SQLAlchemy (not used here).

    Returns:
        str: ``approx_percentile("<col>", <percentile>)``.
    """
    col = elements.clauses.clauses[0].name
    percentile = elements.clauses.clauses[2].value
    # "%.1f" rendered 0.25 as "0.2" and 0.75 as "0.8", silently computing the
    # wrong percentile. Rendering the float itself keeps the requested value
    # while 0.5 still comes out as "0.5".
    return 'approx_percentile("%s", %s)' % (col, float(percentile))
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
2022-07-13 14:43:56 +02:00
|
|
|
@compiles(MedianFn, Dialects.MSSQL)
def _(elements, compiler, **kwargs):
    """Median computation for MSSQL

    Args:
        elements: the MedianFn being compiled; clause 0 is the column (only
            its ``name`` is used) and clause 2 is a literal whose ``value``
            is the percentile.
        compiler: SQLAlchemy compiler (not used, the column name is taken as is).
        **kwargs: compilation flags handed over by SQLAlchemy (not used here).

    Returns:
        str: ``percentile_cont(<percentile>) WITHIN GROUP (ORDER BY <col> ASC) OVER()``.
    """
    col = elements.clauses.clauses[0].name
    percentile = elements.clauses.clauses[2].value
    # "%.1f" rendered 0.25 as "0.2" and 0.75 as "0.8", silently computing the
    # wrong percentile. Rendering the float itself keeps the requested value
    # while 0.5 still comes out as "0.5".
    return "percentile_cont(%s) WITHIN GROUP (ORDER BY %s ASC) OVER()" % (
        float(percentile),
        col,
    )
|
2022-07-13 14:43:56 +02:00
|
|
|
|
|
|
|
|
2022-09-24 00:48:09 +02:00
|
|
|
@compiles(MedianFn, Dialects.Hive)
@compiles(MedianFn, Dialects.Impala)
def _(elements, compiler, **kwargs):
    """Percentile rendering shared by Hive and Impala.

    Every clause of the element is compiled; the column is cast to BIGINT
    inside the generated expression and the compiled table clause is unused.

    Returns:
        str: ``percentile(cast(<col> as BIGINT), <percentile>)``.
    """
    rendered = [compiler.process(clause, **kwargs) for clause in elements.clauses]
    column_sql, _table_sql, fraction_sql = rendered
    return "percentile(cast(%s as BIGINT), %s)" % (column_sql, fraction_sql)
|
2022-09-24 00:48:09 +02:00
|
|
|
|
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
@compiles(MedianFn, Dialects.MySQL)
def _(elements, compiler, **kwargs):  # pylint: disable=unused-argument
    """Placeholder for MySQL, where the median is currently not supported.

    A SQL ``NULL`` is rendered instead of a percentile expression.
    Tracked in https://github.com/open-metadata/OpenMetadata/issues/6340
    """
    unsupported_placeholder = "NULL"
    return unsupported_placeholder
|
|
|
|
|
|
|
|
|
2022-07-06 10:12:29 +02:00
|
|
|
@compiles(MedianFn, Dialects.SQLite)
def _(elements, compiler, **kwargs):  # pylint: disable=unused-argument
    """Percentile rendering for SQLite, built as a scalar subquery.

    The generated subquery orders the non-null values of the column and
    picks a single row with LIMIT 1 at an OFFSET computed from
    ``COUNT(*) * percentile - 1`` over the same non-null rows.

    Clause 0 is compiled into the column expression; the ``value`` of
    clause 1 is interpolated as the table name and the ``value`` of
    clause 2 as the percentile.
    """
    clauses = elements.clauses.clauses
    substitutions = {
        "col": compiler.process(clauses[0]),
        "table": clauses[1].value,
        "percentile": clauses[2].value,
    }
    query_template = """
    (SELECT
        {col}
    FROM {table}
    WHERE {col} IS NOT NULL
    ORDER BY {col}
    LIMIT 1
    OFFSET (
            SELECT ROUND(COUNT(*) * {percentile} -1)
            FROM {table}
            WHERE {col} IS NOT NULL
        )
    )
    """
    return query_template.format(**substitutions)
|