MINOR: Remove default 100 when profileSample is None (#19672)

* fix: remove default of 100 percent

* fix: use get_dataset

* fix: orm_profiler tests
This commit is contained in:
Teddy 2025-02-05 19:14:31 +01:00 committed by GitHub
parent 76935f5c2e
commit 28bd01c471
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 7 additions and 11 deletions

View File

@ -76,9 +76,6 @@ class SamplerInterface(ABC):
self._columns: Optional[List[SQALikeColumn]] = None
self.sample_config = sample_config
if not self.sample_config.profileSample:
self.sample_config.profileSample = 100
self.entity = entity
self.include_columns = include_columns
self.exclude_columns = exclude_columns

View File

@ -162,23 +162,23 @@ class SQASampler(SamplerInterface, SQAInterfaceMixin):
return self._fetch_sample_data_from_user_query()
# Add new RandomNumFn column
rnd = self.get_sample_query()
ds = self.get_dataset()
if not columns:
sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]
sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]
else:
# we can't directly use columns as it is bound to self.raw_dataset and not the rnd table.
# If we use it, it will result in a cross join between self.raw_dataset and rnd table
names = [col.name for col in columns]
sqa_columns = [
col
for col in inspect(rnd).c
for col in inspect(ds).c
if col.name != RANDOM_LABEL and col.name in names
]
try:
sqa_sample = (
self.client.query(*sqa_columns)
.select_from(rnd)
.select_from(ds)
.limit(self.sample_limit)
.all()
)

View File

@ -549,8 +549,7 @@ def test_workflow_values_partition(ingest, metadata, service_name):
profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile
assert profile.rowCount == 4.0
# If no sample is configured, profileSample is left unset (it no longer defaults to 100)
assert profile.profileSample == 100.0
assert profile.profileSample == None
workflow_config["processor"] = {
"type": "orm-profiler",

View File

@ -65,7 +65,7 @@ class ProfilerTestParameters:
ColumnProfile(
name="three",
timestamp=Timestamp(0),
valuesCount=1,
valuesCount=2,
nullCount=1,
)
],
@ -101,7 +101,7 @@ class ProfilerTestParameters:
ColumnProfile(
name="gender",
timestamp=Timestamp(0),
valuesCount=932,
valuesCount=1000,
nullCount=0,
)
],