MINOR: Remove default 100 when profileSample is None (#19672)

* fix: remove default 100 percent
* fix: use get_dataset
* fix: orm_profiler tests
2025-12-04 11:33:07 +00:00 · 2025-02-05 19:14:31 +01:00 · 2025-02-05 19:14:31 +01:00 · 28bd01c471
commit 28bd01c471
parent 76935f5c2e
4 changed files with 7 additions and 11 deletions
--- a/ingestion/src/metadata/sampler/sampler_interface.py
+++ b/ingestion/src/metadata/sampler/sampler_interface.py
@@ -76,9 +76,6 @@ class SamplerInterface(ABC):
        self._columns: Optional[List[SQALikeColumn]] = None
        self.sample_config = sample_config
-        if not self.sample_config.profileSample:
-            self.sample_config.profileSample = 100
        self.entity = entity
        self.include_columns = include_columns
        self.exclude_columns = exclude_columns
--- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
+++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
@@ -162,23 +162,23 @@ class SQASampler(SamplerInterface, SQAInterfaceMixin):
            return self._fetch_sample_data_from_user_query()
        # Add new RandomNumFn column
-        rnd = self.get_sample_query()
+        ds = self.get_dataset()
        if not columns:
-            sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]
+            sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]
        else:
            # we can't directly use columns as it is bound to self.raw_dataset and not the rnd table.
            # If we use it, it will result in a cross join between self.raw_dataset and rnd table
            names = [col.name for col in columns]
            sqa_columns = [
                col
-                for col in inspect(rnd).c
+                for col in inspect(ds).c
                if col.name != RANDOM_LABEL and col.name in names
            ]
        try:
            sqa_sample = (
                self.client.query(*sqa_columns)
-                .select_from(rnd)
+                .select_from(ds)
                .limit(self.sample_limit)
                .all()
            )
--- a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
+++ b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
@@ -549,8 +549,7 @@ def test_workflow_values_partition(ingest, metadata, service_name):
    profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile
    assert profile.rowCount == 4.0
-    # If we don't have any sample, default to 100
+    assert profile.profileSample == None
-    assert profile.profileSample == 100.0
    workflow_config["processor"] = {
        "type": "orm-profiler",
--- a/ingestion/tests/integration/trino/test_profiler.py
+++ b/ingestion/tests/integration/trino/test_profiler.py
@@ -65,7 +65,7 @@ class ProfilerTestParameters:
                ColumnProfile(
                    name="three",
                    timestamp=Timestamp(0),
-                    valuesCount=1,
+                    valuesCount=2,
                    nullCount=1,
                )
            ],
@@ -101,7 +101,7 @@ class ProfilerTestParameters:
                ColumnProfile(
                    name="gender",
                    timestamp=Timestamp(0),
-                    valuesCount=932,
+                    valuesCount=1000,
                    nullCount=0,
                )
            ],