Issue #14812: Add support for empty string as missing count (#16017)

This commit is contained in:
Ayush Shah 2024-04-25 09:45:26 +05:30 committed by GitHub
parent f90f1301d9
commit a15da7ec98
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 88 additions and 7 deletions

View File

@ -275,6 +275,7 @@ dev = {
# For publishing # For publishing
"twine", "twine",
"build", "build",
*plugins["sample-data"],
} }

View File

@ -46,7 +46,10 @@ class BaseColumnValuesMissingCountValidator(BaseTestValidator):
""" """
try: try:
column: Union[SQALikeColumn, Column] = self._get_column_name() column: Union[SQALikeColumn, Column] = self._get_column_name()
null_res = self._run_results(Metrics.NULL_COUNT, column) null_res = self._run_results(
Metrics.NULL_MISSING_COUNT,
column,
)
except (ValueError, RuntimeError) as exc: except (ValueError, RuntimeError) as exc:
msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())

View File

@ -42,6 +42,7 @@ from metadata.profiler.metrics.static.min_length import MinLength
from metadata.profiler.metrics.static.not_like_count import NotLikeCount from metadata.profiler.metrics.static.not_like_count import NotLikeCount
from metadata.profiler.metrics.static.not_regexp_match_count import NotRegexCount from metadata.profiler.metrics.static.not_regexp_match_count import NotRegexCount
from metadata.profiler.metrics.static.null_count import NullCount from metadata.profiler.metrics.static.null_count import NullCount
from metadata.profiler.metrics.static.null_missing_count import NullMissingCount
from metadata.profiler.metrics.static.regexp_match_count import RegexCount from metadata.profiler.metrics.static.regexp_match_count import RegexCount
from metadata.profiler.metrics.static.row_count import RowCount from metadata.profiler.metrics.static.row_count import RowCount
from metadata.profiler.metrics.static.stddev import StdDev from metadata.profiler.metrics.static.stddev import StdDev
@ -103,3 +104,6 @@ class Metrics(MetricRegistry):
# Hybrid Metrics # Hybrid Metrics
HISTOGRAM = Histogram HISTOGRAM = Histogram
# Missing Count
NULL_MISSING_COUNT = NullMissingCount

View File

@ -0,0 +1,73 @@
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Null + Empty String (Missing) Count Metric definition
"""
# pylint: disable=duplicate-code
from sqlalchemy import case, column
from metadata.generated.schema.configuration.profilerConfiguration import MetricType
from metadata.profiler.metrics.core import StaticMetric, _label
from metadata.profiler.orm.functions.sum import SumFn
class NullMissingCount(StaticMetric):
    """
    NULL + Empty COUNT Metric

    Given a column, return the number of "missing" values, i.e. rows where
    the value is NULL or the empty string.

    We are building a CASE WHEN structure:
    ```
    SUM(
        CASE
            WHEN <col> IS NULL THEN 1
            WHEN <col> = '' THEN 1
            ELSE 0
        END
    )
    ```
    """

    @classmethod
    def name(cls):
        """
        Returns the name of the metric.
        """
        # NOTE(review): this reuses the `nullCount` metric type, so the label
        # is the same one a plain null-count metric would use — confirm that
        # sharing the name is intended and cannot clash in a single query.
        return MetricType.nullCount.value

    @property
    def metric_type(self):
        """
        Returns the type of the metric.
        """
        return int

    @_label
    def fn(self):
        """
        Returns the SQLAlchemy function for calculating the metric.
        """
        # Build the column expression once and reuse it in both branches.
        col = column(self.col.name, self.col.type)
        # NOTE(review): the empty-string comparison is emitted for every
        # column type; verify that strict dialects accept `<col> = ''` on
        # non-text columns.
        return SumFn(
            case(
                [
                    (col.is_(None), 1),
                    (col == "", 1),
                ],
                else_=0,
            )
        )

    def df_fn(self, dfs=None):
        """
        Returns the pandas function for calculating the metric.

        Counts, across all the dataframe chunks in `dfs`, the values of the
        column that are null or equal to the empty string, mirroring the
        two branches of the SQL expression built in `fn`.
        """
        return sum(
            df[self.col.name].isnull().sum() + (df[self.col.name] == "").sum()
            for df in dfs
        )

View File

@ -199,7 +199,7 @@ class ProfilerSource(ProfilerSourceInterface):
db_service: Optional[DatabaseService], db_service: Optional[DatabaseService],
) -> ProfilerInterface: ) -> ProfilerInterface:
"""Create sqlalchemy profiler interface""" """Create sqlalchemy profiler interface"""
from metadata.profiler.interface.profiler_interface_factory import ( from metadata.profiler.interface.profiler_interface_factory import ( # pylint: disable=import-outside-toplevel
profiler_interface_factory, profiler_interface_factory,
) )

View File

@ -102,7 +102,7 @@ def create_sqlite_table():
name="John", name="John",
first_name="Jo", first_name="Jo",
fullname="John Doe", fullname="John Doe",
nickname="johnny b goode", nickname="",
age=30, age=30,
inserted_date=datetime.today() - timedelta(days=i), inserted_date=datetime.today() - timedelta(days=i),
), ),

View File

@ -66,8 +66,8 @@ EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d")
"COLUMN", "COLUMN",
( (
TestCaseResult, TestCaseResult,
"0",
"8", "8",
"14",
TestCaseStatus.Failed, TestCaseStatus.Failed,
20.0, 20.0,
10.0, 10.0,
@ -216,9 +216,9 @@ EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d")
"COLUMN", "COLUMN",
( (
TestCaseResult, TestCaseResult,
"10", "20",
None, None,
TestCaseStatus.Success, TestCaseStatus.Failed,
None, None,
None, None,
None, None,
@ -229,7 +229,7 @@ EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d")
"test_case_column_values_missing_count_to_be_equal_missing_values", "test_case_column_values_missing_count_to_be_equal_missing_values",
"columnValuesMissingCount", "columnValuesMissingCount",
"COLUMN", "COLUMN",
(TestCaseResult, "20", None, TestCaseStatus.Failed, None, None, None, None), (TestCaseResult, "30", None, TestCaseStatus.Failed, None, None, None, None),
), ),
( (
"test_case_column_values_not_in_set", "test_case_column_values_not_in_set",