MINOR: Remove default 100 when profileSample is None (#19672)

* fix: remove default 100 percent

* fix: use get_dataset

* fix: orm_profiler tests
This commit is contained in:
Teddy 2025-02-05 19:14:31 +01:00 committed by GitHub
parent 76935f5c2e
commit 28bd01c471
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 7 additions and 11 deletions

View File

@@ -76,9 +76,6 @@ class SamplerInterface(ABC):
self._columns: Optional[List[SQALikeColumn]] = None self._columns: Optional[List[SQALikeColumn]] = None
self.sample_config = sample_config self.sample_config = sample_config
if not self.sample_config.profileSample:
self.sample_config.profileSample = 100
self.entity = entity self.entity = entity
self.include_columns = include_columns self.include_columns = include_columns
self.exclude_columns = exclude_columns self.exclude_columns = exclude_columns

View File

@@ -162,23 +162,23 @@ class SQASampler(SamplerInterface, SQAInterfaceMixin):
return self._fetch_sample_data_from_user_query() return self._fetch_sample_data_from_user_query()
# Add new RandomNumFn column # Add new RandomNumFn column
rnd = self.get_sample_query() ds = self.get_dataset()
if not columns: if not columns:
sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL] sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]
else: else:
# we can't directly use columns as it is bound to self.raw_dataset and not the rnd table. # we can't directly use columns as it is bound to self.raw_dataset and not the rnd table.
# If we use it, it will result in a cross join between self.raw_dataset and rnd table # If we use it, it will result in a cross join between self.raw_dataset and rnd table
names = [col.name for col in columns] names = [col.name for col in columns]
sqa_columns = [ sqa_columns = [
col col
for col in inspect(rnd).c for col in inspect(ds).c
if col.name != RANDOM_LABEL and col.name in names if col.name != RANDOM_LABEL and col.name in names
] ]
try: try:
sqa_sample = ( sqa_sample = (
self.client.query(*sqa_columns) self.client.query(*sqa_columns)
.select_from(rnd) .select_from(ds)
.limit(self.sample_limit) .limit(self.sample_limit)
.all() .all()
) )

View File

@@ -549,8 +549,7 @@ def test_workflow_values_partition(ingest, metadata, service_name):
profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile
assert profile.rowCount == 4.0 assert profile.rowCount == 4.0
# If we don't have any sample, default to 100 assert profile.profileSample == None
assert profile.profileSample == 100.0
workflow_config["processor"] = { workflow_config["processor"] = {
"type": "orm-profiler", "type": "orm-profiler",

View File

@@ -65,7 +65,7 @@ class ProfilerTestParameters:
ColumnProfile( ColumnProfile(
name="three", name="three",
timestamp=Timestamp(0), timestamp=Timestamp(0),
valuesCount=1, valuesCount=2,
nullCount=1, nullCount=1,
) )
], ],
@@ -101,7 +101,7 @@ class ProfilerTestParameters:
ColumnProfile( ColumnProfile(
name="gender", name="gender",
timestamp=Timestamp(0), timestamp=Timestamp(0),
valuesCount=932, valuesCount=1000,
nullCount=0, nullCount=0,
) )
], ],