Mirror of https://github.com/open-metadata/OpenMetadata.git (last synced 2025-10-06 14:26:28 +00:00)
MINOR: row sampling error (#21488)
* fix: row sampling error
* fix: return sample query

(cherry picked from commit 859f24aba7c2b0bcbaf85b149b2a02d60fa4e201)
This commit is contained in:
parent 38724bf2fe
commit 68b0eb34b7
@@ -15,7 +15,7 @@ for the profiler
|
|||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from sqlalchemy import Column, Table, text
|
from sqlalchemy import Column, Table, text
|
||||||
from sqlalchemy.orm import Query
|
from sqlalchemy.sql.selectable import CTE
|
||||||
|
|
||||||
from metadata.generated.schema.entity.data.table import TableData, TableType
|
from metadata.generated.schema.entity.data.table import TableData, TableType
|
||||||
from metadata.sampler.sqlalchemy.sampler import ProfileSampleType, SQASampler
|
from metadata.sampler.sqlalchemy.sampler import ProfileSampleType, SQASampler
|
||||||
@@ -49,13 +49,12 @@ class AzureSQLSampler(SQASampler):
|
|||||||
|
|
||||||
return selectable
|
return selectable
|
||||||
|
|
||||||
def get_sample_query(self, *, column=None) -> Query:
|
def get_sample_query(self, *, column=None) -> CTE:
|
||||||
"""get query for sample data"""
|
"""Override the base method as ROWS or PERCENT sampling handled through the tablesample clause"""
|
||||||
rnd = self._base_sample_query(column).cte(
|
rnd = self._base_sample_query(column).cte(
|
||||||
f"{self.get_sampler_table_name()}_rnd"
|
f"{self.get_sampler_table_name()}_rnd"
|
||||||
)
|
)
|
||||||
with self.get_client() as client:
|
query = self.get_client().query(rnd)
|
||||||
query = client.query(rnd)
|
|
||||||
return query.cte(f"{self.get_sampler_table_name()}_sample")
|
return query.cte(f"{self.get_sampler_table_name()}_sample")
|
||||||
|
|
||||||
def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData:
|
def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData:
|
||||||
|
@@ -44,10 +44,9 @@ class MssqlSampler(SQASampler):
|
|||||||
return selectable
|
return selectable
|
||||||
|
|
||||||
def get_sample_query(self, *, column=None) -> CTE:
|
def get_sample_query(self, *, column=None) -> CTE:
|
||||||
"""get query for sample data"""
|
"""Override the base method as ROWS or PERCENT sampling handled through the tablesample clause"""
|
||||||
rnd = self._base_sample_query(column).cte(
|
rnd = self._base_sample_query(column).cte(
|
||||||
f"{self.get_sampler_table_name()}_rnd"
|
f"{self.get_sampler_table_name()}_rnd"
|
||||||
)
|
)
|
||||||
with self.get_client() as client:
|
query = self.get_client().query(rnd)
|
||||||
query = client.query(rnd)
|
|
||||||
return query.cte(f"{self.get_sampler_table_name()}_sample")
|
return query.cte(f"{self.get_sampler_table_name()}_sample")
|
||||||
|
@@ -136,6 +136,8 @@ class SQASampler(SamplerInterface, SQAInterfaceMixin):
|
|||||||
).cte(f"{self.get_sampler_table_name()}_sample")
|
).cte(f"{self.get_sampler_table_name()}_sample")
|
||||||
|
|
||||||
table_query = client.query(self.raw_dataset)
|
table_query = client.query(self.raw_dataset)
|
||||||
|
if self.partition_details:
|
||||||
|
table_query = self.get_partitioned_query(table_query)
|
||||||
session_query = self._base_sample_query(
|
session_query = self._base_sample_query(
|
||||||
column,
|
column,
|
||||||
(ModuloFn(RandomNumFn(), table_query.count())).label(RANDOM_LABEL)
|
(ModuloFn(RandomNumFn(), table_query.count())).label(RANDOM_LABEL)
|
||||||
|
@@ -87,10 +87,9 @@ class SnowflakeSampler(SQASampler):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def get_sample_query(self, *, column=None) -> CTE:
|
def get_sample_query(self, *, column=None) -> CTE:
|
||||||
"""get query for sample data"""
|
"""Override the base method as ROWS or PERCENT sampling handled through the tablesample clause"""
|
||||||
rnd = self._base_sample_query(column).cte(
|
rnd = self._base_sample_query(column).cte(
|
||||||
f"{self.get_sampler_table_name()}_rnd"
|
f"{self.get_sampler_table_name()}_rnd"
|
||||||
)
|
)
|
||||||
with self.get_client() as client:
|
query = self.get_client().query(rnd)
|
||||||
query = client.query(rnd)
|
|
||||||
return query.cte(f"{self.get_sampler_table_name()}_sample")
|
return query.cte(f"{self.get_sampler_table_name()}_sample")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user