2022-07-06 10:12:29 +02:00
|
|
|
# Copyright 2021 Collate
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
"""
|
|
|
|
Define Median function
|
|
|
|
"""
|
|
|
|
# Keep SQA docs style defining custom constructs
|
|
|
|
# pylint: disable=consider-using-f-string,duplicate-code
|
|
|
|
from sqlalchemy.ext.compiler import compiles
|
|
|
|
from sqlalchemy.sql.functions import FunctionElement
|
|
|
|
|
2023-03-01 08:20:38 +01:00
|
|
|
from metadata.profiler.metrics.core import CACHE
|
|
|
|
from metadata.profiler.orm.registry import Dialects
|
2022-07-06 10:12:29 +02:00
|
|
|
from metadata.utils.logger import profiler_logger
|
|
|
|
|
|
|
|
logger = profiler_logger()
|
|
|
|
|
|
|
|
|
|
|
|
class MedianFn(FunctionElement):
    """Custom SQL construct compiled per dialect into a median/percentile expression.

    The ``@compiles`` handlers in this module read the construct's clauses
    positionally: clause 0 is the column, clause 1 the table name (only used
    by some dialects) and clause 2 the percentile.
    """

    # Caching behaviour of the compiled construct is driven by the shared
    # profiler-level CACHE setting.
    inherit_cache = CACHE
|
|
|
|
|
|
|
|
|
|
|
|
@compiles(MedianFn)
def _(elements, compiler, **kwargs):  # pylint: disable=unused-argument
    """Default rendering: ordered-set ``percentile_cont`` aggregate.

    Args:
        elements: the ``MedianFn`` construct; clause 0 is the column and
            clause 2 carries the percentile as a literal value.
        compiler: SQL compiler used to render the column expression.

    Returns:
        str: ``percentile_cont(<p>) WITHIN GROUP (ORDER BY <col> ASC)`` with
        the percentile formatted to two decimals.
    """
    clauses = elements.clauses.clauses
    column_sql = compiler.process(clauses[0])
    fraction = clauses[2].value
    return "percentile_cont(%.2f) WITHIN GROUP (ORDER BY %s ASC)" % (
        fraction,
        column_sql,
    )
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
|
|
|
@compiles(MedianFn, Dialects.BigQuery)
def _(elements, compiler, **kwargs):
    """Median computation for BigQuery.

    Every clause is rendered through the compiler (forwarding ``kwargs``);
    the second one (table) is not used in the output.

    Returns:
        str: ``percentile_cont(<col> , <percentile>) OVER()``.
    """
    rendered = [compiler.process(clause, **kwargs) for clause in elements.clauses]
    # Exactly three clauses are expected: column, table, percentile.
    column_sql, _table_sql, percentile_sql = rendered
    return "percentile_cont(%s , %s) OVER()" % (column_sql, percentile_sql)
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
|
|
|
@compiles(MedianFn, Dialects.ClickHouse)
def _(elements, compiler, **kwargs):
    """Median computation for ClickHouse.

    Every clause is rendered through the compiler (forwarding ``kwargs``);
    the second one (table) is not used in the output.

    Returns:
        str: ``quantile(<percentile>)(<col>)``.
    """
    rendered = [compiler.process(clause, **kwargs) for clause in elements.clauses]
    # Exactly three clauses are expected: column, table, percentile.
    column_sql, _table_sql, percentile_sql = rendered
    return "quantile(%s)(%s)" % (percentile_sql, column_sql)
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
2022-10-11 15:57:25 +02:00
|
|
|
# pylint: disable=unused-argument
|
2022-12-13 13:03:22 +01:00
|
|
|
@compiles(MedianFn, Dialects.Athena)
@compiles(MedianFn, Dialects.Trino)
@compiles(MedianFn, Dialects.Presto)
def _(elements, compiler, **kwargs):
    """Median computation shared by Athena, Trino and Presto.

    Args:
        elements: the ``MedianFn`` construct; clause 0 is the column and
            clause 2 carries the percentile as a literal value.
        compiler: SQL compiler used to render the column expression.

    Returns:
        str: ``approx_percentile("<col>", <p>)`` with the percentile
        formatted to two decimals.
    """
    clauses = elements.clauses.clauses
    column_sql = compiler.process(clauses[0])
    fraction = clauses[2].value
    # NOTE(review): the compiled column is wrapped in double quotes here; if
    # the compiler already quotes or table-qualifies the name this may produce
    # an unexpected identifier - confirm against the target engines.
    return 'approx_percentile("%s", %.2f)' % (column_sql, fraction)
|
2022-07-06 10:12:29 +02:00
|
|
|
|
|
|
|
|
2022-07-13 14:43:56 +02:00
|
|
|
@compiles(MedianFn, Dialects.MSSQL)
def _(elements, compiler, **kwargs):
    """Median computation for MSSQL.

    Args:
        elements: the ``MedianFn`` construct; clause 0 is the column and
            clause 2 carries the percentile as a literal value.
        compiler: SQL compiler used to render the column expression.

    Returns:
        str: ``percentile_cont(<p>) WITHIN GROUP (ORDER BY <col> ASC) OVER()``
        with the percentile formatted to two decimals.
    """
    clauses = elements.clauses.clauses
    column_sql = compiler.process(clauses[0])
    fraction = clauses[2].value
    template = "percentile_cont(%.2f) WITHIN GROUP (ORDER BY %s ASC) OVER()"
    return template % (fraction, column_sql)
|
2022-07-13 14:43:56 +02:00
|
|
|
|
|
|
|
|
2022-09-24 00:48:09 +02:00
|
|
|
@compiles(MedianFn, Dialects.Hive)
def _(elements, compiler, **kwargs):
    """Median computation for Hive.

    Every clause is rendered through the compiler (forwarding ``kwargs``);
    the second one (table) is not used in the output. The column is cast to
    BIGINT before being handed to ``percentile``.

    Returns:
        str: ``percentile(cast(<col> as BIGINT), <percentile>)``.
    """
    rendered = [compiler.process(clause, **kwargs) for clause in elements.clauses]
    # Exactly three clauses are expected: column, table, percentile.
    column_sql, _table_sql, percentile_sql = rendered
    return "percentile(cast(%s as BIGINT), %s)" % (column_sql, percentile_sql)
|
2022-09-24 00:48:09 +02:00
|
|
|
|
|
|
|
|
2023-04-06 11:07:42 -05:00
|
|
|
@compiles(MedianFn, Dialects.Impala)
def _(elements, compiler, **kwargs):
    """Median computation for Impala.

    Impala exposes the median through the ``appx_median`` function. OM also
    uses this construct to compute the first and third quartiles, which
    Impala cannot express with a single function call. The generated ``if``
    therefore yields ``null`` whenever the requested percentile is not .5.

    For reference, obtaining the quartiles in Impala requires a full SQL
    statement such as:

        with ntiles as
        (
            select filesize, ntile(4) over (order by filesize) as quarter
            from hdfs_files
        )
        , quarters as
        (
            select 1 as grp, max(filesize) as quartile_value, quarter
            from ntiles
            group by quarter
        )
        select max(case when quarter = 1 then quartile_value end) as first_q
        , max(case when quarter = 2 then quartile_value end) as second_q
        , max(case when quarter = 3 then quartile_value end) as third_q
        , max(case when quarter = 4 then quartile_value end) as fourth_q
        from quarters
        group by grp
        ;

    Returns:
        str: ``if(<percentile> = .5, appx_median(<col>), null)``.
    """
    rendered = [compiler.process(clause, **kwargs) for clause in elements.clauses]
    # Exactly three clauses are expected: column, table, percentile.
    column_sql, _table_sql, percentile_sql = rendered
    return "if(%s = .5, appx_median(%s), null)" % (percentile_sql, column_sql)
|
2023-04-06 11:07:42 -05:00
|
|
|
|
|
|
|
|
2022-07-29 10:41:53 +02:00
|
|
|
@compiles(MedianFn, Dialects.MySQL)
def _(elements, compiler, **kwargs):  # pylint: disable=unused-argument
    """Median computation for MySQL.

    Builds a scalar subquery that numbers the rows of the table, stores the
    total row count in the ``@counter`` user variable and picks the row whose
    number equals ``ROUND(<percentile> * @counter)``.

    Args:
        elements: the ``MedianFn`` construct; clause 0 is the column, clause 1
            carries the table name and clause 2 the percentile, both as
            literal values.
        compiler: SQL compiler used to render the column expression.

    Returns:
        str: the parenthesised subquery text.
    """
    clauses = elements.clauses.clauses
    column_sql = compiler.process(clauses[0])
    table_name = clauses[1].value
    fraction = clauses[2].value

    # NOTE(review): ROW_NUMBER() uses an empty OVER () while the ordering is
    # applied by the subquery's ORDER BY - confirm row numbers follow the
    # sorted order on the targeted MySQL versions.
    query = """
    (SELECT
        {col}
    FROM (
        SELECT
            {col},
            ROW_NUMBER() OVER () AS row_num
        FROM
            {table},
            (SELECT @counter := COUNT(*) FROM {table}) t_count
        ORDER BY {col}
        ) temp
    WHERE temp.row_num = ROUND({percentile} * @counter)
    )
    """
    return query.format(col=column_sql, table=table_name, percentile=fraction)
|
2022-07-29 10:41:53 +02:00
|
|
|
|
|
|
|
|
2022-07-06 10:12:29 +02:00
|
|
|
@compiles(MedianFn, Dialects.SQLite)
def _(elements, compiler, **kwargs):  # pylint: disable=unused-argument
    """Median computation for SQLite.

    Builds a scalar subquery that sorts the non-null values of the column and
    returns the single row found at offset
    ``ROUND(COUNT(*) * <percentile> - 1)``, where the count also ignores nulls.

    Args:
        elements: the ``MedianFn`` construct; clause 0 is the column, clause 1
            carries the table name and clause 2 the percentile, both as
            literal values.
        compiler: SQL compiler used to render the column expression.

    Returns:
        str: the parenthesised subquery text.
    """
    clauses = elements.clauses.clauses
    column_sql = compiler.process(clauses[0])
    table_name = clauses[1].value
    fraction = clauses[2].value

    query = """
    (SELECT
        {col}
    FROM {table}
    WHERE {col} IS NOT NULL
    ORDER BY {col}
    LIMIT 1
    OFFSET (
            SELECT ROUND(COUNT(*) * {percentile} -1)
            FROM {table}
            WHERE {col} IS NOT NULL
        )
    )
    """
    return query.format(col=column_sql, table=table_name, percentile=fraction)
|