2022-03-30 08:54:27 +02:00
|
|
|
# Copyright 2021 Collate
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
"""
|
|
|
|
Helper module to handle data sampling
|
|
|
|
for the profiler
|
|
|
|
"""
|
2022-06-14 21:37:44 +02:00
|
|
|
from typing import Dict, Optional, Union
|
2022-03-30 08:54:27 +02:00
|
|
|
|
2022-06-28 19:27:55 +02:00
|
|
|
from sqlalchemy import column, inspect, text
|
2022-06-08 16:10:40 +02:00
|
|
|
from sqlalchemy.orm import DeclarativeMeta, Query, Session, aliased
|
2022-03-30 08:54:27 +02:00
|
|
|
from sqlalchemy.orm.util import AliasedClass
|
|
|
|
|
2022-06-08 16:10:40 +02:00
|
|
|
from metadata.generated.schema.entity.data.table import TableData
|
|
|
|
from metadata.orm_profiler.orm.functions.modulo import ModuloFn
|
2022-03-30 08:54:27 +02:00
|
|
|
from metadata.orm_profiler.orm.functions.random_num import RandomNumFn
|
2022-09-20 08:55:39 +02:00
|
|
|
from metadata.orm_profiler.orm.registry import Dialects
|
2022-06-14 21:37:44 +02:00
|
|
|
from metadata.orm_profiler.profiler.handle_partition import partition_filter_handler
|
2022-03-30 08:54:27 +02:00
|
|
|
|
2022-06-08 16:10:40 +02:00
|
|
|
# Label given to the synthetic random-number column that Sampler.get_sample_query
# adds to the sampling CTE; it is filtered back out when building sample data.
RANDOM_LABEL = "random"
|
|
|
|
|
2022-03-30 08:54:27 +02:00
|
|
|
|
|
|
|
class Sampler:
    """
    Generates a sample of the data to not
    run the query in the whole table.

    Three sampling strategies are supported, in order of precedence:

    1. A user supplied SQL query (``profile_sample_query``).
    2. A random sample driven by ``profile_sample``.
    3. A partition filter (``partition_details``) when no ``profile_sample``
       is set.
    """

    def __init__(
        self,
        session: Optional[Session],
        table: DeclarativeMeta,
        profile_sample: Optional[float] = None,
        partition_details: Optional[Dict] = None,
        profile_sample_query: Optional[str] = None,
    ):
        """
        :param session: SQLAlchemy session used to build and run the queries
        :param table: ORM class of the table to sample
        :param profile_sample: threshold compared against the random column
            when sampling; falsy values disable random sampling
        :param partition_details: dict read for the keys ``partition_field``,
            ``partition_values``, ``partition_start`` and ``partition_end``
        :param profile_sample_query: raw SQL used instead of random sampling
        """
        self.profile_sample = profile_sample
        self.session = session
        self.table = table
        self._partition_details = partition_details
        self._profile_sample_query = profile_sample_query
        # Maximum number of rows returned as sample data
        self.sample_limit = 100
        self._sample_rows = None

    @partition_filter_handler(build_sample=True)
    def get_sample_query(self) -> Query:
        """
        Build a CTE named ``<tablename>_rnd`` selecting the table plus an
        extra column labelled ``RANDOM_LABEL`` computed as
        ``ModuloFn(RandomNumFn(), 100)``.

        For the Snowflake dialect only, the statement is suffixed with
        ``SAMPLE BERNOULLI (<profile_sample or 100>)``.

        NOTE(review): the value callers receive also goes through
        ``partition_filter_handler``, whose behaviour is not visible here.
        """
        return (
            self.session.query(
                self.table, (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL)
            )
            .suffix_with(
                f"SAMPLE BERNOULLI ({self.profile_sample or 100})",
                dialect=Dialects.Snowflake,
            )
            .cte(f"{self.table.__tablename__}_rnd")
        )

    def random_sample(self) -> Union[DeclarativeMeta, AliasedClass, Query]:
        """
        Either return a sampled CTE of table, or
        the full table if no sampling is required.

        :return: a ``Query`` built from the user query when one is set; the
            partition-filtered alias or the plain table when no
            ``profile_sample`` is set; otherwise an alias of the table over
            the ``<tablename>_sample`` CTE
        """
        if self._profile_sample_query:
            return self._fetch_sample_data_with_query_object()

        if not self.profile_sample:
            if self._partition_details:
                return self._random_sample_for_partitioned_tables()

            return self.table

        # Add new RandomNumFn column
        rnd = self.get_sample_query()

        # Look the column up through RANDOM_LABEL so that it cannot drift from
        # the label applied in get_sample_query.
        # NOTE(review): presumably the random column ranges over 0..99, in
        # which case `<=` keeps slightly more than profile_sample percent of
        # the rows - confirm against ModuloFn / RandomNumFn.
        random_column = getattr(rnd.c, RANDOM_LABEL)

        # Prepare sampled CTE
        sampled = (
            self.session.query(rnd)
            .where(random_column <= self.profile_sample)
            .cte(f"{self.table.__tablename__}_sample")
        )

        # Assign as an alias
        return aliased(self.table, sampled)

    def fetch_sqa_sample_data(self) -> TableData:
        """
        Use the sampler to retrieve sample data rows as per limit given by user

        :return: TableData to be added to the Table Entity
        """
        if self._profile_sample_query:
            return self._fetch_sample_data_from_user_query()

        # Add new RandomNumFn column
        rnd = self.get_sample_query()
        # The random helper column is not part of the table: leave it out
        sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]

        sqa_sample = (
            self.session.query(*sqa_columns)
            .select_from(rnd)
            .limit(self.sample_limit)
            .all()
        )

        return TableData(
            columns=[col.name for col in sqa_columns],
            rows=[list(row) for row in sqa_sample],
        )

    def _fetch_sample_data_from_user_query(self) -> TableData:
        """
        Returns a table data object using results from query execution

        The user query is wrapped in ``text()`` (as done when building the
        query object for profiling) and at most ``self.sample_limit`` rows
        are fetched.
        """
        result = self.session.execute(text(str(self._profile_sample_query)))
        try:
            columns = [col.name for col in result.cursor.description]
        except AttributeError:
            # Description entries (or the cursor) expose no `name` attribute:
            # fall back to the keys reported by the result object.
            columns = list(result.keys())
        return TableData(
            columns=columns,
            rows=[list(row) for row in result.fetchmany(self.sample_limit)],
        )

    def _fetch_sample_data_with_query_object(self) -> Query:
        """Returns sql alchemy object to use when running profiling"""
        return self.session.query(self.table).from_statement(
            text(f"{self._profile_sample_query}")
        )

    def _random_sample_for_partitioned_tables(self) -> AliasedClass:
        """
        Return the table aliased over a subquery restricted to the partition.

        When ``partition_values`` is set the partition field must be in those
        values; otherwise it must lie between ``partition_start`` and
        ``partition_end`` (both formatted as ``%Y-%m-%d``), bounds included.
        """
        partition_field = self._partition_details["partition_field"]
        partition_values = self._partition_details.get("partition_values")

        if partition_values:
            filters = [column(partition_field).in_(partition_values)]
        else:
            filters = [
                column(partition_field)
                >= self._partition_details["partition_start"].strftime("%Y-%m-%d"),
                column(partition_field)
                <= self._partition_details["partition_end"].strftime("%Y-%m-%d"),
            ]

        sample = self.session.query(self.table).filter(*filters).subquery()
        return aliased(self.table, sample)
|