Fix #3112 - col profile safety & sample data (#3142)

Fix #3112 - col profile safety & sample data (#3142)
This commit is contained in:
Pere Miquel Brull 2022-03-04 13:14:11 +01:00 committed by GitHub
parent e3001e55ba
commit bd7b91b448
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 233 additions and 13 deletions

View File

@ -6,11 +6,14 @@
"type": "object",
"javaType": "org.openmetadata.catalog.tests.column.ColumnValuesToBeNotInSet",
"properties": {
"values": {
"forbiddenValues": {
"description": "An Array of values.",
"type": "array"
"type": "array",
"items": {
"type": ["string", "number"]
}
}
},
"required": ["values"],
"required": ["forbiddenValues"],
"additionalProperties": false
}

View File

@ -7,5 +7,5 @@ Provides metadata version information.
from incremental import Version
__version__ = Version("metadata", 0, 9, 0, dev=18)
__version__ = Version("metadata", 0, 9, 0, dev=19)
__all__ = ["__version__"]

View File

@ -46,6 +46,38 @@
"testCaseStatus": "Success",
"result": "Found 5.0 columns vs. the expected 5"
}
},
{
"description": "Rows should always be between 100 and 200 because of something",
"testCase": {
"config": {
"minValue": 100,
"maxValue": 200
},
"tableTestType": "tableRowCountToBeBetween"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646220190,
"testCaseStatus": "Success",
"result": "Found 120.0 rows vs. the expected range [100, 200]"
}
},
{
"description": "Rows should always be between 100 and 200 because of something",
"testCase": {
"config": {
"minValue": 100,
"maxValue": 200
},
"tableTestType": "tableRowCountToBeBetween"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646221199,
"testCaseStatus": "Success",
"result": "Found 120.0 rows vs. the expected range [100, 200]"
}
}
],
"columnTests": [
@ -81,6 +113,20 @@
"result": "Found min=1.0 vs. the expected min=0"
}
},
{
"columnName": "email",
"description": "emails should be unique",
"testCase": {
"config": {},
"columnTestType": "columnValuesToBeUnique"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646220190,
"testCaseStatus": "Success",
"result": "Found uniqueCount=100.0 vs. valuesCount=100.0"
}
},
{
"columnName": "email",
"description": "emails should be unique",
@ -94,6 +140,170 @@
"testCaseStatus": "Success",
"result": "Found uniqueCount=100.0 vs. valuesCount=100.0"
}
},
{
"columnName": "user_id",
"description": "user_id should be not null",
"testCase": {
"config": {},
"columnTestType": "columnValuesToBeNotNull"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646220190,
"testCaseStatus": "Success",
"result": "Found nullCount=0"
}
},
{
"columnName": "user_id",
"description": "user_id should be not null",
"testCase": {
"config": {},
"columnTestType": "columnValuesToBeNotNull"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646221199,
"testCaseStatus": "Success",
"result": "Found nullCount=0"
}
},
{
"columnName": "last_name",
"description": "last_name should match a regex",
"testCase": {
"config": {
"regex": "%something%"
},
"columnTestType": "columnValuesToMatchRegex"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646220190,
"testCaseStatus": "Failed",
"result": "Found likeCount=0. Nothing matches %something%"
}
},
{
"columnName": "last_name",
"description": "last_name should match a regex",
"testCase": {
"config": {
"regex": "%something%"
},
"columnTestType": "columnValuesToMatchRegex"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646221199,
"testCaseStatus": "Failed",
"result": "Found likeCount=0. Nothing matches %something%"
}
},
{
"columnName": "first_name",
"description": "Some description...",
"testCase": {
"config": {
"missingCountValue": 10
},
"columnTestType": "columnValuesMissingCountToBeEqual"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646220190,
"testCaseStatus": "Failed",
"result": "Found nullCount=0.0 vs. the expected nullCount=10"
}
},
{
"columnName": "first_name",
"description": "Some description...",
"testCase": {
"config": {
"missingCountValue": 10
},
"columnTestType": "columnValuesMissingCountToBeEqual"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646221199,
"testCaseStatus": "Failed",
"result": "Found nullCount=0.0 vs. the expected nullCount=10"
}
},
{
"columnName": "email",
"description": "email should have a fixed length",
"testCase": {
"config": {
"minValue": 6,
"maxValue": 30
},
"columnTestType": "columnValuesToBeBetween"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646221199,
"testCaseStatus": "Success",
"result": "Found min=1.0 vs. the expected min=0"
}
},
{
"columnName": "email",
"description": "email should have a fixed length",
"testCase": {
"config": {
"minValue": 6,
"maxValue": 30
},
"columnTestType": "columnValuesToBeBetween"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646220190,
"testCaseStatus": "Success",
"result": "Found min=1.0 vs. the expected min=0"
}
},
{
"columnName": "last_name",
"description": "We have reserved last names",
"testCase": {
"config": {
"forbiddenValues": [
"forbidden",
"random"
]
},
"columnTestType": "columnValuesToBeNotInSet"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646220190,
"testCaseStatus": "Success",
"result": "Found countInSet=0"
}
},
{
"columnName": "last_name",
"description": "We have reserved last names",
"testCase": {
"config": {
"forbiddenValues": [
"forbidden",
"random"
]
},
"columnTestType": "columnValuesToBeNotInSet"
},
"executionFrequency": "Daily",
"result": {
"executionTime": 1646221199,
"testCaseStatus": "Success",
"result": "Found countInSet=0"
}
}
]
}

View File

@ -335,14 +335,21 @@ class Profiler(Generic[MetricType]):
We need to transform it to TableProfile
"""
try:
# Some columns may have been skipped when computing metrics
# because their type is not supported, so they have no results.
# Filter those out before building the profile.
computed_profiles = [
ColumnProfile(**self.column_results.get(col.name))
for col in self.columns
if self.column_results.get(col.name)
]
profile = TableProfile(
profileDate=self.profile_date.strftime("%Y-%m-%d"),
columnCount=self._table_results.get("columnCount"), # TODO IMPLEMENT
columnCount=self._table_results.get("columnCount"),
rowCount=self._table_results.get(RowCount.name()),
columnProfile=[
ColumnProfile(**self._column_results.get(col.name))
for col in self.columns
],
columnProfile=computed_profiles,
)
return profile

View File

@ -49,7 +49,7 @@ def column_values_not_in_set(
:return: TestCaseResult with status and results
"""
set_count = add_props(values=test_case.values)(Metrics.COUNT_IN_SET.value)
set_count = add_props(values=test_case.forbiddenValues)(Metrics.COUNT_IN_SET.value)
try:
col = next(

View File

@ -81,7 +81,7 @@ class MetricsTest(TestCase):
column_profile = ColumnProfile(name="name") # column name
res_ok = validate(
ColumnValuesToBeNotInSet(values=["random", "forbidden"]),
ColumnValuesToBeNotInSet(forbiddenValues=["random", "forbidden"]),
col_profile=column_profile,
execution_date=EXECUTION_DATE,
session=self.session,
@ -95,7 +95,7 @@ class MetricsTest(TestCase):
)
res_ko = validate(
ColumnValuesToBeNotInSet(values=["John", "forbidden"]),
ColumnValuesToBeNotInSet(forbiddenValues=["John", "forbidden"]),
col_profile=column_profile,
execution_date=EXECUTION_DATE,
session=self.session,
@ -109,7 +109,7 @@ class MetricsTest(TestCase):
)
res_aborted = validate(
ColumnValuesToBeNotInSet(values=["John", "forbidden"]),
ColumnValuesToBeNotInSet(forbiddenValues=["John", "forbidden"]),
col_profile=ColumnProfile(name="random"),
execution_date=EXECUTION_DATE,
session=self.session,