diff --git a/catalog-rest-service/src/main/resources/json/schema/tests/column/columnValuesToBeNotInSet.json b/catalog-rest-service/src/main/resources/json/schema/tests/column/columnValuesToBeNotInSet.json index cde0839d45d..51354a3e73e 100644 --- a/catalog-rest-service/src/main/resources/json/schema/tests/column/columnValuesToBeNotInSet.json +++ b/catalog-rest-service/src/main/resources/json/schema/tests/column/columnValuesToBeNotInSet.json @@ -6,11 +6,14 @@ "type": "object", "javaType": "org.openmetadata.catalog.tests.column.ColumnValuesToBeNotInSet", "properties": { - "values": { + "forbiddenValues": { "description": "An Array of values.", - "type": "array" + "type": "array", + "items": { + "type": ["string", "number"] + } } }, - "required": ["values"], + "required": ["forbiddenValues"], "additionalProperties": false } diff --git a/ingestion-core/src/metadata/_version.py b/ingestion-core/src/metadata/_version.py index 724501b9d40..4f07f0d2bcb 100644 --- a/ingestion-core/src/metadata/_version.py +++ b/ingestion-core/src/metadata/_version.py @@ -7,5 +7,5 @@ Provides metadata version information. from incremental import Version -__version__ = Version("metadata", 0, 9, 0, dev=18) +__version__ = Version("metadata", 0, 9, 0, dev=19) __all__ = ["__version__"] diff --git a/ingestion/examples/sample_data/datasets/tableTests.json b/ingestion/examples/sample_data/datasets/tableTests.json index ff58f61c173..4678867af8f 100644 --- a/ingestion/examples/sample_data/datasets/tableTests.json +++ b/ingestion/examples/sample_data/datasets/tableTests.json @@ -46,6 +46,38 @@ "testCaseStatus": "Success", "result": "Found 5.0 columns vs. the expected 5" } + }, + { + "description": "Rows should always be 100 because of something", + "testCase": { + "config": { + "minValue": 100, + "maxValue": 200 + }, + "tableTestType": "tableRowCountToBeBetween" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646220190, + "testCaseStatus": "Success", + "result": "Found 120.0 rows vs. the expected range [100, 200]" + } + }, + { + "description": "Rows should always be 100 because of something", + "testCase": { + "config": { + "minValue": 100, + "maxValue": 200 + }, + "tableTestType": "tableRowCountToBeBetween" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646221199, + "testCaseStatus": "Success", + "result": "Found 120.0 rows vs. the expected range [100, 200]" + } } ], "columnTests": [ @@ -81,6 +113,20 @@ "result": "Found min=1.0 vs. the expected min=0" } }, + { + "columnName": "email", + "description": "emails should be unique", + "testCase": { + "config": {}, + "columnTestType": "columnValuesToBeUnique" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646220190, + "testCaseStatus": "Success", + "result": "Found uniqueCount=100.0 vs. valuesCount=100.0" + } + }, { "columnName": "email", "description": "emails should be unique", @@ -94,6 +140,170 @@ "testCaseStatus": "Success", "result": "Found uniqueCount=100.0 vs. valuesCount=100.0" } + }, + { + "columnName": "user_id", + "description": "user_id should be not null", + "testCase": { + "config": {}, + "columnTestType": "columnValuesToBeNotNull" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646220190, + "testCaseStatus": "Success", + "result": "Found nullCount=0" + } + }, + { + "columnName": "user_id", + "description": "user_id should be not null", + "testCase": { + "config": {}, + "columnTestType": "columnValuesToBeNotNull" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646221199, + "testCaseStatus": "Success", + "result": "Found nullCount=0" + } + }, + { + "columnName": "last_name", + "description": "last_name should match a regex", + "testCase": { + "config": { + "regex": "%something%" + }, + "columnTestType": "columnValuesToMatchRegex" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646220190, + "testCaseStatus": "Failed", + "result": "Found likeCount=0. Nothing matches %something%" + } + }, + { + "columnName": "last_name", + "description": "last_name should match a regex", + "testCase": { + "config": { + "regex": "%something%" + }, + "columnTestType": "columnValuesToMatchRegex" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646221199, + "testCaseStatus": "Failed", + "result": "Found likeCount=0. Nothing matches %something%" + } + }, + { + "columnName": "first_name", + "description": "Some description...", + "testCase": { + "config": { + "missingCountValue": 10 + }, + "columnTestType": "columnValuesMissingCountToBeEqual" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646220190, + "testCaseStatus": "Failed", + "result": "Found nullCount=0.0 vs. the expected nullCount=10" + } + }, + { + "columnName": "first_name", + "description": "Some description...", + "testCase": { + "config": { + "missingCountValue": 10 + }, + "columnTestType": "columnValuesMissingCountToBeEqual" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646221199, + "testCaseStatus": "Failed", + "result": "Found nullCount=0.0 vs. the expected nullCount=10" + } + }, + { + "columnName": "email", + "description": "email should have a fixed length", + "testCase": { + "config": { + "minValue": 6, + "maxValue": 30 + }, + "columnTestType": "columnValuesToBeBetween" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646221199, + "testCaseStatus": "Success", + "result": "Found min=1.0 vs. the expected min=0" + } + }, + { + "columnName": "email", + "description": "email should have a fixed length", + "testCase": { + "config": { + "minValue": 6, + "maxValue": 30 + }, + "columnTestType": "columnValuesToBeBetween" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646220190, + "testCaseStatus": "Success", + "result": "Found min=1.0 vs. the expected min=0" + } + }, + { + "columnName": "last_name", + "description": "We have reserved last names", + "testCase": { + "config": { + "forbiddenValues": [ + "forbidden", + "random" + ] + }, + "columnTestType": "columnValuesToBeNotInSet" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646220190, + "testCaseStatus": "Success", + "result": "Found countInSet=0" + } + }, + { + "columnName": "last_name", + "description": "We have reserved last names", + "testCase": { + "config": { + "forbiddenValues": [ + "forbidden", + "random" + ] + }, + "columnTestType": "columnValuesToBeNotInSet" + }, + "executionFrequency": "Daily", + "result": { + "executionTime": 1646221199, + "testCaseStatus": "Success", + "result": "Found countInSet=0" + } } ] } diff --git a/ingestion/src/metadata/orm_profiler/profiles/core.py b/ingestion/src/metadata/orm_profiler/profiles/core.py index 2981b507f23..80fd22da120 100644 --- a/ingestion/src/metadata/orm_profiler/profiles/core.py +++ b/ingestion/src/metadata/orm_profiler/profiles/core.py @@ -335,14 +335,21 @@ class Profiler(Generic[MetricType]): We need to transform it to TableProfile """ try: + + # There are columns that we might have skipped from + # computing metrics, if the type is not supported. + # Let's filter those out. + computed_profiles = [ + ColumnProfile(**self.column_results.get(col.name)) + for col in self.columns + if self.column_results.get(col.name) + ] + profile = TableProfile( profileDate=self.profile_date.strftime("%Y-%m-%d"), - columnCount=self._table_results.get("columnCount"), # TODO IMPLEMENT + columnCount=self._table_results.get("columnCount"), rowCount=self._table_results.get(RowCount.name()), - columnProfile=[ - ColumnProfile(**self._column_results.get(col.name)) - for col in self.columns - ], + columnProfile=computed_profiles, ) return profile diff --git a/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py b/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py index 9792c80c8a2..c02e9f35f19 100644 --- a/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py +++ b/ingestion/src/metadata/orm_profiler/validations/column/column_values_not_in_set.py @@ -49,7 +49,7 @@ def column_values_not_in_set( :return: TestCaseResult with status and results """ - set_count = add_props(values=test_case.values)(Metrics.COUNT_IN_SET.value) + set_count = add_props(values=test_case.forbiddenValues)(Metrics.COUNT_IN_SET.value) try: col = next( diff --git a/ingestion/tests/unit/profiler/test_session_validations.py b/ingestion/tests/unit/profiler/test_session_validations.py index e2567325b01..b69785d8e9b 100644 --- a/ingestion/tests/unit/profiler/test_session_validations.py +++ b/ingestion/tests/unit/profiler/test_session_validations.py @@ -81,7 +81,7 @@ class MetricsTest(TestCase): column_profile = ColumnProfile(name="name") # column name res_ok = validate( - ColumnValuesToBeNotInSet(values=["random", "forbidden"]), + ColumnValuesToBeNotInSet(forbiddenValues=["random", "forbidden"]), col_profile=column_profile, execution_date=EXECUTION_DATE, session=self.session, @@ -95,7 +95,7 @@ class MetricsTest(TestCase): ) res_ko = validate( - ColumnValuesToBeNotInSet(values=["John", "forbidden"]), + ColumnValuesToBeNotInSet(forbiddenValues=["John", "forbidden"]), col_profile=column_profile, execution_date=EXECUTION_DATE, session=self.session, @@ -109,7 +109,7 @@ class MetricsTest(TestCase): ) res_aborted = validate( - ColumnValuesToBeNotInSet(values=["John", "forbidden"]), + ColumnValuesToBeNotInSet(forbiddenValues=["John", "forbidden"]), col_profile=ColumnProfile(name="random"), execution_date=EXECUTION_DATE, session=self.session,