diff --git a/ingestion/setup.py b/ingestion/setup.py index d3721f29d84..b9b515c3193 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -275,6 +275,7 @@ dev = { # For publishing "twine", "build", + *plugins["sample-data"], } diff --git a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesMissingCount.py b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesMissingCount.py index f3d61223460..082cdc9f34c 100644 --- a/ingestion/src/metadata/data_quality/validations/column/base/columnValuesMissingCount.py +++ b/ingestion/src/metadata/data_quality/validations/column/base/columnValuesMissingCount.py @@ -46,7 +46,10 @@ class BaseColumnValuesMissingCountValidator(BaseTestValidator): """ try: column: Union[SQALikeColumn, Column] = self._get_column_name() - null_res = self._run_results(Metrics.NULL_COUNT, column) + null_res = self._run_results( + Metrics.NULL_MISSING_COUNT, + column, + ) except (ValueError, RuntimeError) as exc: msg = f"Error computing {self.test_case.fullyQualifiedName}: {exc}" # type: ignore logger.debug(traceback.format_exc()) diff --git a/ingestion/src/metadata/profiler/metrics/registry.py b/ingestion/src/metadata/profiler/metrics/registry.py index a95b17f49f7..a2219563f03 100644 --- a/ingestion/src/metadata/profiler/metrics/registry.py +++ b/ingestion/src/metadata/profiler/metrics/registry.py @@ -42,6 +42,7 @@ from metadata.profiler.metrics.static.min_length import MinLength from metadata.profiler.metrics.static.not_like_count import NotLikeCount from metadata.profiler.metrics.static.not_regexp_match_count import NotRegexCount from metadata.profiler.metrics.static.null_count import NullCount +from metadata.profiler.metrics.static.null_missing_count import NullMissingCount from metadata.profiler.metrics.static.regexp_match_count import RegexCount from metadata.profiler.metrics.static.row_count import RowCount from metadata.profiler.metrics.static.stddev import StdDev @@ -103,3 +104,6 @@ class 
class NullMissingCount(StaticMetric):
    """
    NULL + Empty COUNT Metric

    Given a column, return the number of rows whose value is either NULL
    or the empty string.

    We are building a CASE WHEN structure:
    ```
    SUM(
        CASE
            WHEN <col> IS NULL THEN 1
            WHEN <col> = '' THEN 1
            ELSE 0
        END
    )
    ```
    """

    @classmethod
    def name(cls):
        """
        Returns the name of the metric.
        """
        # NOTE(review): this reuses the `nullCount` metric name instead of a
        # name of its own; presumably `NullCount` reports the same value -
        # confirm the two metrics cannot clash wherever results are keyed
        # by metric name.
        return MetricType.nullCount.value

    @property
    def metric_type(self):
        """
        Returns the type of the metric.
        """
        return int

    @_label
    def fn(self):
        """
        Returns the SQLAlchemy function for calculating the metric.

        A row contributes 1 to the sum when the column is NULL or equal to
        the empty string, and 0 otherwise.
        """
        # Build the column expression once and reuse it in both branches.
        col = column(self.col.name, self.col.type)
        return SumFn(
            case(
                [
                    (col.is_(None), 1),
                    (col == "", 1),
                ],
                else_=0,
            )
        )

    def df_fn(self, dfs=None):
        """
        Returns the pandas function for calculating the metric.

        Mirrors `fn`: for every dataframe in `dfs`, a value counts as missing
        when it is null or equal to the empty string. The per-dataframe
        counts are added together.
        """
        col_name = self.col.name
        return sum(
            # `|` combines the two boolean masks element-wise, so each row is
            # counted at most once.
            (df[col_name].isnull() | df[col_name].eq("")).sum()
            for df in dfs
        )
datetime.strptime("2021-07-03", "%Y-%m-%d") "COLUMN", ( TestCaseResult, - "10", + "20", None, - TestCaseStatus.Success, + TestCaseStatus.Failed, None, None, None, @@ -229,7 +229,7 @@ EXECUTION_DATE = datetime.strptime("2021-07-03", "%Y-%m-%d") "test_case_column_values_missing_count_to_be_equal_missing_values", "columnValuesMissingCount", "COLUMN", - (TestCaseResult, "20", None, TestCaseStatus.Failed, None, None, None, None), + (TestCaseResult, "30", None, TestCaseStatus.Failed, None, None, None, None), ), ( "test_case_column_values_not_in_set",