From bad772db397d5af7ebfb53173d25bd5613f1f5b8 Mon Sep 17 00:00:00 2001 From: IceS2 Date: Fri, 25 Jul 2025 15:17:38 +0200 Subject: [PATCH] FIX #22099: enable 'Column values to be in set' test case for boolean columns (#22491) * fix(dq): enable ''Column values to be in set'' test case for boolean columns Add BOOLEAN to supportedDataTypes array in columnValuesToBeInSet.json to allow boolean column validation with predefined allowed values. This enables users to enforce strict true/false validation on boolean columns directly at the column level, resolving issue #22099. Co-authored-by: IceS2 * Add tests to the new feature * Add migrations and columnValuesToBeNotInSet --------- Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> Co-authored-by: IceS2 --- .../native/1.9.0/mysql/schemaChanges.sql | 4 ++++ .../native/1.9.0/postgres/schemaChanges.sql | 5 ++++ ingestion/tests/unit/test_suite/conftest.py | 21 +++++++++++++++++ .../test_suite/test_validations_databases.py | 10 ++++++-- .../test_suite/test_validations_datalake.py | 23 +++++++++++++++++-- .../data/tests/columnValuesToBeInSet.json | 2 +- .../data/tests/columnValuesToBeNotInSet.json | 2 +- 7 files changed, 61 insertions(+), 6 deletions(-) diff --git a/bootstrap/sql/migrations/native/1.9.0/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.9.0/mysql/schemaChanges.sql index 3673259380e..6f084e4ff54 100644 --- a/bootstrap/sql/migrations/native/1.9.0/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.9.0/mysql/schemaChanges.sql @@ -137,6 +137,10 @@ CREATE TABLE IF NOT EXISTS entity_deletion_lock ( INDEX idx_deletion_lock_time (lockedAt) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +-- Update columnValuesToBeInSet and columnValuesToBeNotInSet test definitions to include BOOLEAN in supportedDataTypes +UPDATE test_definition + SET json = JSON_SET(json, '$.supportedDataTypes', JSON_ARRAY('NUMBER', 'INT', 'FLOAT', 'DOUBLE', 'DECIMAL', 'TINYINT', 
'SMALLINT', 'BIGINT', 'BYTEINT', 'BYTES', 'STRING', 'MEDIUMTEXT', 'TEXT', 'CHAR', 'VARCHAR', 'BOOLEAN')) +WHERE name in ('columnValuesToBeInSet', 'columnValuesToBeNotInSet'); -- 1. Add generated classificationHash column to support fast lookup and grouping by classification fqnHash ALTER TABLE tag diff --git a/bootstrap/sql/migrations/native/1.9.0/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.9.0/postgres/schemaChanges.sql index 950a7af992c..b28e8fe39c4 100644 --- a/bootstrap/sql/migrations/native/1.9.0/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.9.0/postgres/schemaChanges.sql @@ -171,6 +171,11 @@ CREATE TABLE IF NOT EXISTS entity_deletion_lock ( CREATE INDEX IF NOT EXISTS idx_deletion_lock_fqn ON entity_deletion_lock(entityFqn); CREATE INDEX IF NOT EXISTS idx_deletion_lock_time ON entity_deletion_lock(lockedAt); +-- Update columnValuesToBeInSet and columnValuesToBeNotInSet test definitions to include BOOLEAN in supportedDataTypes +UPDATE test_definition + SET json = jsonb_set(json, '{supportedDataTypes}', '["NUMBER", "INT", "FLOAT", "DOUBLE", "DECIMAL", "TINYINT", "SMALLINT", "BIGINT", "BYTEINT", "BYTES", "STRING", "MEDIUMTEXT", "TEXT", "CHAR", "VARCHAR", "BOOLEAN"]'::jsonb) +WHERE name in ('columnValuesToBeInSet', 'columnValuesToBeNotInSet'); + -- 1. 
Add classificationHash column to support fast lookup and grouping by classification fqnHash ALTER TABLE tag ADD COLUMN classificationHash TEXT diff --git a/ingestion/tests/unit/test_suite/conftest.py b/ingestion/tests/unit/test_suite/conftest.py index e7e99708fd8..c2f12b57e2c 100644 --- a/ingestion/tests/unit/test_suite/conftest.py +++ b/ingestion/tests/unit/test_suite/conftest.py @@ -45,6 +45,8 @@ ENTITY_LINK_NAME = "<#E::table::service.db.users::columns::name>" ENTITY_LINK_USER = "<#E::table::service.db.users>" ENTITY_LINK_INSERTED_DATE = "<#E::table::service.db.users::columns::inserted_date>" ENTITY_LINK_EXPECTED_LOCATION = "<#E::table::service.db.users::columns::postal_code>" +ENTITY_LINK_IS_ACTIVE = "<#E::table::service.db.users::columns::is_active>" + TABLE = Table( id=uuid4(), @@ -61,6 +63,7 @@ TABLE = Table( Column(name="postal_code", dataType=DataType.INT), # type: ignore Column(name="lat", dataType=DataType.DECIMAL), # type: ignore Column(name="lon", dataType=DataType.DECIMAL), # type: ignore + Column(name="is_active", dataType=DataType.BOOLEAN), # type: ignore ], database=EntityReference(id=uuid4(), name="db", type="database"), # type: ignore ) # type: ignore @@ -78,6 +81,7 @@ class User(Base): postal_code = sqa.Column(sqa.INT) lat = sqa.Column(sqa.DECIMAL) lon = sqa.Column(sqa.DECIMAL) + is_active = sqa.Column(sqa.BOOLEAN) @pytest.fixture @@ -122,6 +126,7 @@ def create_sqlite_table(): postal_code=60001, lat=49.6852237, lon=1.7743058, + is_active=True, ), User( name="Jane", @@ -133,6 +138,7 @@ def create_sqlite_table(): postal_code=19005, lat=45.2589385, lon=1.4731471, + is_active=False, ), User( name="John", @@ -144,6 +150,7 @@ def create_sqlite_table(): postal_code=11008, lat=42.9974445, lon=2.2518325, + is_active=None, ), ] session.add_all(data) @@ -746,3 +753,17 @@ def test_case_column_values_to_be_at_expected_location(): ], computePassedFailedRowCount=True, ) # type: ignore + + +@pytest.fixture +def test_case_column_value_in_set_boolean(): + return 
TestCase( + name=TEST_CASE_NAME, + entityLink=ENTITY_LINK_IS_ACTIVE, + testSuite=EntityReference(id=uuid4(), type="TestSuite"), # type: ignore + testDefinition=EntityReference(id=uuid4(), type="TestDefinition"), # type: ignore + parameterValues=[ + TestCaseParameterValue(name="allowedValues", value="[True, False]"), + ], + computePassedFailedRowCount=True, + ) diff --git a/ingestion/tests/unit/test_suite/test_validations_databases.py b/ingestion/tests/unit/test_suite/test_validations_databases.py index 01855a468b1..c5d8b82c63c 100644 --- a/ingestion/tests/unit/test_suite/test_validations_databases.py +++ b/ingestion/tests/unit/test_suite/test_validations_databases.py @@ -305,7 +305,7 @@ EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d") "TABLE", ( TestCaseResult, - "10", + "11", None, TestCaseStatus.Success, None, @@ -318,7 +318,7 @@ EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d") "test_case_table_column_count_to_equal", "tableColumnCountToEqual", "TABLE", - (TestCaseResult, "10", None, TestCaseStatus.Failed, None, None, None, None), + (TestCaseResult, "11", None, TestCaseStatus.Failed, None, None, None, None), ), ( "test_case_table_column_name_to_exist", @@ -431,6 +431,12 @@ EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d") None, ), ), + ( + "test_case_column_value_in_set_boolean", + "columnValuesToBeInSet", + "COLUMN", + (TestCaseResult, "20", None, TestCaseStatus.Success, 20.0, 0.0, 66.67, 0.0), + ), ], ) def test_suite_validation_database( diff --git a/ingestion/tests/unit/test_suite/test_validations_datalake.py b/ingestion/tests/unit/test_suite/test_validations_datalake.py index 2a097aa9716..edb10872843 100644 --- a/ingestion/tests/unit/test_suite/test_validations_datalake.py +++ b/ingestion/tests/unit/test_suite/test_validations_datalake.py @@ -36,6 +36,7 @@ DL_DATA = ( 60001, 49.6852237, 1.7743058, + True, ], [ "2", @@ -48,6 +49,7 @@ DL_DATA = ( 19005, 45.2589385, 1.4731471, + False, ], [ "3", @@ -60,6 +62,7 @@ DL_DATA = ( 
11008, 42.9974445, 2.2518325, + None, ], ) @@ -77,6 +80,7 @@ DATALAKE_DATA_FRAME = lambda times_increase_sample_data: DataFrame( "postal_code", "lat", "lon", + "is_active", ], ) @@ -403,7 +407,7 @@ DATALAKE_DATA_FRAME = lambda times_increase_sample_data: DataFrame( "TABLE", ( TestCaseResult, - "10", + "11", None, TestCaseStatus.Success, None, @@ -416,7 +420,7 @@ DATALAKE_DATA_FRAME = lambda times_increase_sample_data: DataFrame( "test_case_table_column_count_to_equal", "tableColumnCountToEqual", "TABLE", - (TestCaseResult, "10", None, TestCaseStatus.Failed, None, None, None, None), + (TestCaseResult, "11", None, TestCaseStatus.Failed, None, None, None, None), ), ( "test_case_table_column_name_to_exist", @@ -517,6 +521,21 @@ DATALAKE_DATA_FRAME = lambda times_increase_sample_data: DataFrame( None, ), ), + ( + "test_case_column_value_in_set_boolean", + "columnValuesToBeInSet", + "COLUMN", + ( + TestCaseResult, + "4000", + None, + TestCaseStatus.Success, + 4000.0, + 0.0, + 66.67, + 0.0, + ), + ), ], ) def test_suite_validation_datalake( diff --git a/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeInSet.json b/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeInSet.json index 869ef35a22a..eac603b70ea 100644 --- a/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeInSet.json +++ b/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeInSet.json @@ -5,7 +5,7 @@ "description": "This schema defines the test ColumnValuesToBeInSet. 
Test the column values are in the set.", "entityType": "COLUMN", "testPlatforms": ["OpenMetadata"], - "supportedDataTypes": ["NUMBER", "INT", "FLOAT", "DOUBLE", "DECIMAL", "TINYINT", "SMALLINT", "BIGINT", "BYTEINT", "BYTES", "STRING", "MEDIUMTEXT", "TEXT", "CHAR", "VARCHAR"], + "supportedDataTypes": ["NUMBER", "INT", "FLOAT", "DOUBLE", "DECIMAL", "TINYINT", "SMALLINT", "BIGINT", "BYTEINT", "BYTES", "STRING", "MEDIUMTEXT", "TEXT", "CHAR", "VARCHAR", "BOOLEAN"], "parameterDefinition": [ { "name": "allowedValues", diff --git a/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeNotInSet.json b/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeNotInSet.json index bc9d2e24b09..582d90257f0 100644 --- a/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeNotInSet.json +++ b/openmetadata-service/src/main/resources/json/data/tests/columnValuesToBeNotInSet.json @@ -5,7 +5,7 @@ "description": "This schema defines the test ColumnValuesToBeNotInSet. Test the column values to not be in the set. ", "entityType": "COLUMN", "testPlatforms": ["OpenMetadata"], - "supportedDataTypes": ["NUMBER", "INT", "FLOAT", "DOUBLE", "DECIMAL", "TINYINT", "SMALLINT", "BIGINT", "BYTEINT", "BYTES", "STRING", "MEDIUMTEXT", "TEXT", "CHAR", "VARCHAR"], + "supportedDataTypes": ["NUMBER", "INT", "FLOAT", "DOUBLE", "DECIMAL", "TINYINT", "SMALLINT", "BIGINT", "BYTEINT", "BYTES", "STRING", "MEDIUMTEXT", "TEXT", "CHAR", "VARCHAR", "BOOLEAN"], "parameterDefinition": [ { "name": "forbiddenValues",