From 31d2595e4f4060b3ec90cf85036458f3e6b76a2e Mon Sep 17 00:00:00 2001
From: Teddy <teddy.crepineau@gmail.com>
Date: Fri, 13 Oct 2023 11:27:28 +0200
Subject: [PATCH] fix: pass rnd table bound columns to sample query (#13561)

---
 .../processor/sampler/sqlalchemy/sampler.py   |  9 ++-
 .../orm_profiler/test_orm_profiler_e2e.py     | 75 +++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
index 8c86ce06ecc..ac544e9878e 100644
--- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
+++ b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
@@ -134,7 +134,14 @@ class SQASampler(SamplerInterface):
         if not columns:
             sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]
         else:
-            sqa_columns = list(columns)  # copy columns
+            # We can't use `columns` directly: they are bound to self.table, not the rnd table.
+            # Using them would result in a cross join between self.table and the rnd table.
+            names = [col.name for col in columns]
+            sqa_columns = [
+                col
+                for col in inspect(rnd).c
+                if col.name != RANDOM_LABEL and col.name in names
+            ]
 
         sqa_sample = (
             self.client.query(*sqa_columns)
diff --git a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
index 53558f1532f..0c80fdd5305 100644
--- a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
+++ b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
@@ -690,3 +690,78 @@ class ProfilerWorkflowTest(TestCase):
         assert sorted([c.__root__ for c in sample_data.sampleData.columns]) == sorted(
             ["id", "age"]
         )
+
+    def test_sample_data_ingestion(self):
+        """Run the profiler on the users table and check the ingested sample rows."""
+        workflow_config = deepcopy(ingestion_config)
+        workflow_config["source"]["sourceConfig"]["config"].update(
+            {
+                "type": "Profiler",
+                "tableFilterPattern": {"includes": ["users"]},
+            }
+        )
+        workflow_config["processor"] = {
+            "type": "orm-profiler",
+            "config": {
+                "profiler": {
+                    "name": "my_profiler",
+                    "timeout_seconds": 60,
+                    "metrics": ["row_count", "min", "max", "COUNT", "null_count"],
+                },
+                "tableConfig": [
+                    {
+                        "fullyQualifiedName": "test_sqlite.main.main.users",
+                    }
+                ],
+            },
+        }
+
+        profiler_workflow = ProfilerWorkflow.create(workflow_config)
+        profiler_workflow.execute()
+        status = profiler_workflow.result_status()
+        profiler_workflow.stop()
+
+        assert status == 0
+
+        table = self.metadata.get_by_name(
+            entity=Table,
+            fqn="test_sqlite.main.main.users",
+        )
+
+        # Rows we expect back as sample data (without the trailing timestamp)
+        expected_sample_data = [
+            [
+                1,
+                "John",
+                "John Doe",
+                "johnny b goode",
+                30,
+            ],
+            [
+                2,
+                "Jane",
+                "Jone Doe",
+                None,
+                31,
+            ],
+            [
+                3,
+                "Joh",
+                "Joh Doe",
+                None,
+                37,
+            ],
+            [
+                4,
+                "Jae",
+                "Jae Doe",
+                None,
+                38,
+            ],
+        ]
+        sample_data = self.metadata.get_sample_data(table).sampleData.rows
+        sample_data = [data[:-1] for data in sample_data]  # remove timestamp as dynamic
+        self.assertListEqual(
+            sorted(sample_data),
+            sorted(expected_sample_data),
+        )