From 28bd01c471844d00183a97d67443e02e54e2be2b Mon Sep 17 00:00:00 2001 From: Teddy Date: Wed, 5 Feb 2025 19:14:31 +0100 Subject: [PATCH] MINOR: Remove default 100 when `profileSample` is None (#19672) * fix: remove default 100% percent * fix: use get_dataset * fix: orm_profiler tests --- ingestion/src/metadata/sampler/sampler_interface.py | 3 --- ingestion/src/metadata/sampler/sqlalchemy/sampler.py | 8 ++++---- .../integration/orm_profiler/test_orm_profiler_e2e.py | 3 +-- ingestion/tests/integration/trino/test_profiler.py | 4 ++-- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py index 970793cb2fe..fe363816d01 100644 --- a/ingestion/src/metadata/sampler/sampler_interface.py +++ b/ingestion/src/metadata/sampler/sampler_interface.py @@ -76,9 +76,6 @@ class SamplerInterface(ABC): self._columns: Optional[List[SQALikeColumn]] = None self.sample_config = sample_config - if not self.sample_config.profileSample: - self.sample_config.profileSample = 100 - self.entity = entity self.include_columns = include_columns self.exclude_columns = exclude_columns diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py index cd87b79368b..8c15f5015d3 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py @@ -162,23 +162,23 @@ class SQASampler(SamplerInterface, SQAInterfaceMixin): return self._fetch_sample_data_from_user_query() # Add new RandomNumFn column - rnd = self.get_sample_query() + ds = self.get_dataset() if not columns: - sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL] + sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL] else: # we can't directly use columns as it is bound to self.raw_dataset and not the rnd table. # If we use it, it will result in a cross join between self.raw_dataset and rnd table names = [col.name for col in columns] sqa_columns = [ col - for col in inspect(rnd).c + for col in inspect(ds).c if col.name != RANDOM_LABEL and col.name in names ] try: sqa_sample = ( self.client.query(*sqa_columns) - .select_from(rnd) + .select_from(ds) .limit(self.sample_limit) .all() ) diff --git a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py index 2d1d976ec72..d6ca5e8a7e0 100644 --- a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py +++ b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py @@ -549,8 +549,7 @@ def test_workflow_values_partition(ingest, metadata, service_name): profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile assert profile.rowCount == 4.0 - # If we don't have any sample, default to 100 - assert profile.profileSample == 100.0 + assert profile.profileSample == None workflow_config["processor"] = { "type": "orm-profiler", diff --git a/ingestion/tests/integration/trino/test_profiler.py b/ingestion/tests/integration/trino/test_profiler.py index 6e12ca7b7b0..e8092c1ded7 100644 --- a/ingestion/tests/integration/trino/test_profiler.py +++ b/ingestion/tests/integration/trino/test_profiler.py @@ -65,7 +65,7 @@ class ProfilerTestParameters: ColumnProfile( name="three", timestamp=Timestamp(0), - valuesCount=1, + valuesCount=2, nullCount=1, ) ], @@ -101,7 +101,7 @@ class ProfilerTestParameters: ColumnProfile( name="gender", timestamp=Timestamp(0), - valuesCount=932, + valuesCount=1000, nullCount=0, ) ],