From 31d2595e4f4060b3ec90cf85036458f3e6b76a2e Mon Sep 17 00:00:00 2001
From: Teddy
Date: Fri, 13 Oct 2023 11:27:28 +0200
Subject: [PATCH] fix: pass rnd table bound columns to sample query (#13561)

---
NOTE(review): line breaks and indentation were restored from a
whitespace-collapsed copy of this patch; indentation is inferred from the
Python syntax. Confirm against commit 31d2595e before applying.

 .../processor/sampler/sqlalchemy/sampler.py   |  9 ++-
 .../orm_profiler/test_orm_profiler_e2e.py     | 75 +++++++++++++++++++
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
index 8c86ce06ecc..ac544e9878e 100644
--- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
+++ b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
@@ -134,7 +134,14 @@ class SQASampler(SamplerInterface):
         if not columns:
             sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]
         else:
-            sqa_columns = list(columns)  # copy columns
+            # `columns` is bound to self.table, not to the rnd table: querying it here
+            # would cross join self.table with rnd, so pick the rnd columns by name.
+            names = [col.name for col in columns]
+            sqa_columns = [
+                col
+                for col in inspect(rnd).c
+                if col.name != RANDOM_LABEL and col.name in names
+            ]
 
         sqa_sample = (
             self.client.query(*sqa_columns)
diff --git a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
index 53558f1532f..0c80fdd5305 100644
--- a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
+++ b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py
@@ -690,3 +690,78 @@ class ProfilerWorkflowTest(TestCase):
         assert sorted([c.__root__ for c in sample_data.sampleData.columns]) == sorted(
             ["id", "age"]
         )
+
+    def test_sample_data_ingestion(self):
+        """Test that the sample data rows ingested for `users` are the expected rows"""
+        workflow_config = deepcopy(ingestion_config)
+        workflow_config["source"]["sourceConfig"]["config"].update(
+            {
+                "type": "Profiler",
+                "tableFilterPattern": {"includes": ["users"]},
+            }
+        )
+        workflow_config["processor"] = {
+            "type": "orm-profiler",
+            "config": {
+                "profiler": {
+                    "name": "my_profiler",
+                    "timeout_seconds": 60,
+                    "metrics": ["row_count", "min", "max", "COUNT", "null_count"],
+                },
+                "tableConfig": [
+                    {
+                        "fullyQualifiedName": "test_sqlite.main.main.users",
+                    }
+                ],
+            },
+        }
+
+        profiler_workflow = ProfilerWorkflow.create(workflow_config)
+        profiler_workflow.execute()
+        status = profiler_workflow.result_status()
+        profiler_workflow.stop()
+
+        assert status == 0
+
+        table = self.metadata.get_by_name(
+            entity=Table,
+            fqn="test_sqlite.main.main.users",
+        )
+
+        # Rows expected back as sample data, without the trailing timestamp value
+        expected_sample_data = [
+            [
+                1,
+                "John",
+                "John Doe",
+                "johnny b goode",
+                30,
+            ],
+            [
+                2,
+                "Jane",
+                "Jone Doe",
+                None,
+                31,
+            ],
+            [
+                3,
+                "Joh",
+                "Joh Doe",
+                None,
+                37,
+            ],
+            [
+                4,
+                "Jae",
+                "Jae Doe",
+                None,
+                38,
+            ],
+        ]
+        sample_data = self.metadata.get_sample_data(table).sampleData.rows
+        sample_data = [data[:-1] for data in sample_data]  # drop the dynamic timestamp
+        self.assertListEqual(
+            sorted(sample_data),
+            sorted(expected_sample_data),
+        )