From 28bd01c471844d00183a97d67443e02e54e2be2b Mon Sep 17 00:00:00 2001
From: Teddy <teddy.crepineau@gmail.com>
Date: Wed, 5 Feb 2025 19:14:31 +0100
Subject: [PATCH] MINOR: Remove default 100 when `profileSample` is None
 (#19672)

* fix: remove default 100 percent

* fix: use get_dataset

* fix: orm_profiler tests
---
 ingestion/src/metadata/sampler/sampler_interface.py       | 3 ---
 ingestion/src/metadata/sampler/sqlalchemy/sampler.py      | 8 ++++----
 .../integration/orm_profiler/test_orm_profiler_e2e.py     | 3 +--
 ingestion/tests/integration/trino/test_profiler.py        | 4 ++--
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py
index 970793cb2fe..fe363816d01 100644
--- a/ingestion/src/metadata/sampler/sampler_interface.py
+++ b/ingestion/src/metadata/sampler/sampler_interface.py
@@ -76,9 +76,6 @@ class SamplerInterface(ABC):
         self._columns: Optional[List[SQALikeColumn]] = None
         self.sample_config = sample_config
 
-        if not self.sample_config.profileSample:
-            self.sample_config.profileSample = 100
-
         self.entity = entity
         self.include_columns = include_columns
         self.exclude_columns = exclude_columns
diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
index cd87b79368b..8c15f5015d3 100644
--- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
+++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
@@ -162,23 +162,23 @@ class SQASampler(SamplerInterface, SQAInterfaceMixin):
             return self._fetch_sample_data_from_user_query()
 
         # Add new RandomNumFn column
-        rnd = self.get_sample_query()
+        ds = self.get_dataset()
         if not columns:
-            sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]
+            sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]
         else:
             # we can't directly use columns as it is bound to self.raw_dataset and not the rnd table.
             # If we use it, it will result in a cross join between self.raw_dataset and rnd table
             names = [col.name for col in columns]
             sqa_columns = [
                 col
-                for col in inspect(rnd).c
+                for col in inspect(ds).c
                 if col.name != RANDOM_LABEL and col.name in names
             ]
 
         try:
             sqa_sample = (
                 self.client.query(*sqa_columns)
-                .select_from(rnd)
+                .select_from(ds)
                 .limit(self.sample_limit)
                 .all()
             )
diff --git a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
index 2d1d976ec72..d6ca5e8a7e0 100644
--- a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
+++ b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
@@ -549,8 +549,7 @@ def test_workflow_values_partition(ingest, metadata, service_name):
     profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile
 
     assert profile.rowCount == 4.0
-    # If we don't have any sample, default to 100
-    assert profile.profileSample == 100.0
+    assert profile.profileSample == None
 
     workflow_config["processor"] = {
         "type": "orm-profiler",
diff --git a/ingestion/tests/integration/trino/test_profiler.py b/ingestion/tests/integration/trino/test_profiler.py
index 6e12ca7b7b0..e8092c1ded7 100644
--- a/ingestion/tests/integration/trino/test_profiler.py
+++ b/ingestion/tests/integration/trino/test_profiler.py
@@ -65,7 +65,7 @@ class ProfilerTestParameters:
                 ColumnProfile(
                     name="three",
                     timestamp=Timestamp(0),
-                    valuesCount=1,
+                    valuesCount=2,
                     nullCount=1,
                 )
             ],
@@ -101,7 +101,7 @@ class ProfilerTestParameters:
                 ColumnProfile(
                     name="gender",
                     timestamp=Timestamp(0),
-                    valuesCount=932,
+                    valuesCount=1000,
                     nullCount=0,
                 )
             ],