From 6a4df5f460d6661b3c4e7aca1ec8a413bf4197ae Mon Sep 17 00:00:00 2001 From: Teddy Date: Wed, 1 Mar 2023 16:38:50 +0100 Subject: [PATCH] fix: removed pandas dependencies for non pandas profiler/testSuite workflows (#10380) --- .../interfaces/datalake/datalake_test_suite_interface.py | 4 +--- .../metadata/interfaces/datalake/mixins/pandas_mixin.py | 5 +---- ingestion/src/metadata/profiler/metrics/core.py | 9 --------- .../src/metadata/profiler/metrics/static/column_count.py | 1 - .../src/metadata/profiler/metrics/static/column_names.py | 1 - ingestion/src/metadata/profiler/metrics/static/count.py | 1 - .../src/metadata/profiler/metrics/static/count_in_set.py | 1 - .../metadata/profiler/metrics/static/distinct_count.py | 1 - ingestion/src/metadata/profiler/metrics/static/max.py | 1 - .../src/metadata/profiler/metrics/static/max_length.py | 1 - ingestion/src/metadata/profiler/metrics/static/mean.py | 1 - ingestion/src/metadata/profiler/metrics/static/min.py | 1 - .../src/metadata/profiler/metrics/static/min_length.py | 1 - .../profiler/metrics/static/not_regexp_match_count.py | 1 - .../src/metadata/profiler/metrics/static/null_count.py | 1 - .../profiler/metrics/static/regexp_match_count.py | 1 - .../src/metadata/profiler/metrics/static/row_count.py | 1 - ingestion/src/metadata/profiler/metrics/static/stddev.py | 1 - ingestion/src/metadata/profiler/metrics/static/sum.py | 1 - ingestion/src/metadata/profiler/metrics/window/median.py | 1 - .../interface/pandas/pandas_profiler_interface.py | 3 +-- 21 files changed, 3 insertions(+), 35 deletions(-) diff --git a/ingestion/src/metadata/interfaces/datalake/datalake_test_suite_interface.py b/ingestion/src/metadata/interfaces/datalake/datalake_test_suite_interface.py index d740f482c1e..396d61c00ff 100644 --- a/ingestion/src/metadata/interfaces/datalake/datalake_test_suite_interface.py +++ b/ingestion/src/metadata/interfaces/datalake/datalake_test_suite_interface.py @@ -16,8 +16,6 @@ supporting sqlalchemy abstraction layer from datetime import datetime, timezone from typing import Optional -from pandas import DataFrame - from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) @@ -46,7 +44,7 @@ class DataLakeTestSuiteInterface(TestSuiteProtocol, PandasInterfaceMixin): ometa_client: OpenMetadata = None, service_connection_config: DatalakeConnection = None, table_entity=None, - df: DataFrame = None, + df=None, ): self.table_entity = table_entity self.df = df diff --git a/ingestion/src/metadata/interfaces/datalake/mixins/pandas_mixin.py b/ingestion/src/metadata/interfaces/datalake/mixins/pandas_mixin.py index 1dd8365d876..3835bddb66e 100644 --- a/ingestion/src/metadata/interfaces/datalake/mixins/pandas_mixin.py +++ b/ingestion/src/metadata/interfaces/datalake/mixins/pandas_mixin.py @@ -13,9 +13,6 @@ Interfaces with database for all database engine supporting sqlalchemy abstraction layer """ - -from pandas import DataFrame - from metadata.generated.schema.entity.data.table import PartitionIntervalType from metadata.test_suite.validations.table.pandas.tableRowInsertedCountToBeBetween import ( TableRowInsertedCountToBeBetweenValidator, @@ -25,7 +22,7 @@ from metadata.test_suite.validations.table.pandas.tableRowInsertedCountToBeBetwe class PandasInterfaceMixin: """Interface mixin grouping shared methods between test suite and profiler interfaces""" - def get_partitioned_df(self, df: DataFrame) -> DataFrame: + def get_partitioned_df(self, df): """Get partitioned dataframe Returns: diff --git a/ingestion/src/metadata/profiler/metrics/core.py b/ingestion/src/metadata/profiler/metrics/core.py index d8ee089ba83..029ee14c098 100644 --- a/ingestion/src/metadata/profiler/metrics/core.py +++ b/ingestion/src/metadata/profiler/metrics/core.py @@ -43,18 +43,9 @@ def _label(_fn): @wraps(_fn) def inner(self, *args, **kwargs): - import pandas as pd # pylint: disable=import-outside-toplevel - res = _fn(self, *args, **kwargs) # If the metric computation returns some value if res is not None: - try: - if pd.isnull(res): - res = None - except ValueError: - pass - if not hasattr(res, "label"): - return res return res.label(self.name()) return None diff --git a/ingestion/src/metadata/profiler/metrics/static/column_count.py b/ingestion/src/metadata/profiler/metrics/static/column_count.py index 8d48b2f37a1..6ff43415dcc 100644 --- a/ingestion/src/metadata/profiler/metrics/static/column_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/column_count.py @@ -82,7 +82,6 @@ class ColumnCount(StaticMetric): ) return ColunCountFn(literal(len(inspect(self.table).c))) - @_label def df_fn(self, df=None): """dataframe function""" from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/metrics/static/column_names.py b/ingestion/src/metadata/profiler/metrics/static/column_names.py index cc3d6d5335c..fa296c3d889 100644 --- a/ingestion/src/metadata/profiler/metrics/static/column_names.py +++ b/ingestion/src/metadata/profiler/metrics/static/column_names.py @@ -84,7 +84,6 @@ class ColumnNames(StaticMetric): col_names = ",".join(inspect(self.table).c.keys()) return ColunNameFn(literal(col_names, type_=sqlalchemy.types.String)) - @_label def df_fn(self, df=None): from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/metrics/static/count.py b/ingestion/src/metadata/profiler/metrics/static/count.py index 40ba2122ac1..f7fb8dabcb9 100644 --- a/ingestion/src/metadata/profiler/metrics/static/count.py +++ b/ingestion/src/metadata/profiler/metrics/static/count.py @@ -44,7 +44,6 @@ class Count(StaticMetric): """sqlalchemy function""" return func.count(column(self.col.name)) - @_label def df_fn(self, df=None): """pandas function""" from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/metrics/static/count_in_set.py b/ingestion/src/metadata/profiler/metrics/static/count_in_set.py index 7007c86855f..0ef0ea10029 100644 --- a/ingestion/src/metadata/profiler/metrics/static/count_in_set.py +++ b/ingestion/src/metadata/profiler/metrics/static/count_in_set.py @@ -63,7 +63,6 @@ class CountInSet(StaticMetric): logger.warning(f"Error trying to run countInSet for {self.col.name}: {exc}") return None - @_label def df_fn(self, df): """pandas function""" if not hasattr(self, "values"): diff --git a/ingestion/src/metadata/profiler/metrics/static/distinct_count.py b/ingestion/src/metadata/profiler/metrics/static/distinct_count.py index 1b377f8c220..f49f9d5dc57 100644 --- a/ingestion/src/metadata/profiler/metrics/static/distinct_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/distinct_count.py @@ -43,7 +43,6 @@ class DistinctCount(StaticMetric): def fn(self): return func.count(distinct(column(self.col.name))) - @_label def df_fn(self, df=None): from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/metrics/static/max.py b/ingestion/src/metadata/profiler/metrics/static/max.py index be1dc714d2e..6dfa1f1ccff 100644 --- a/ingestion/src/metadata/profiler/metrics/static/max.py +++ b/ingestion/src/metadata/profiler/metrics/static/max.py @@ -41,7 +41,6 @@ class Max(StaticMetric): return func.max(column(self.col.name)) # pylint: disable=import-outside-toplevel - @_label def df_fn(self, df=None): """pandas function""" from pandas import DataFrame diff --git a/ingestion/src/metadata/profiler/metrics/static/max_length.py b/ingestion/src/metadata/profiler/metrics/static/max_length.py index 0eb9ca26b8a..358fea2c16e 100644 --- a/ingestion/src/metadata/profiler/metrics/static/max_length.py +++ b/ingestion/src/metadata/profiler/metrics/static/max_length.py @@ -58,7 +58,6 @@ class MaxLength(StaticMetric): return None # pylint: disable=import-outside-toplevel - @_label def df_fn(self, df=None): """dataframe function""" import pandas as pd diff --git a/ingestion/src/metadata/profiler/metrics/static/mean.py b/ingestion/src/metadata/profiler/metrics/static/mean.py index 49c7309b7da..01538152d52 100644 --- a/ingestion/src/metadata/profiler/metrics/static/mean.py +++ b/ingestion/src/metadata/profiler/metrics/static/mean.py @@ -74,7 +74,6 @@ class Mean(StaticMetric): return None # pylint: disable=import-outside-toplevel - @_label def df_fn(self, df=None): """dataframe function""" from numpy import vectorize diff --git a/ingestion/src/metadata/profiler/metrics/static/min.py b/ingestion/src/metadata/profiler/metrics/static/min.py index 0c4f4840341..16294375343 100644 --- a/ingestion/src/metadata/profiler/metrics/static/min.py +++ b/ingestion/src/metadata/profiler/metrics/static/min.py @@ -41,7 +41,6 @@ class Min(StaticMetric): return func.min(column(self.col.name)) # pylint: disable=import-outside-toplevel - @_label def df_fn(self, df=None): """pandas function""" from pandas import DataFrame diff --git a/ingestion/src/metadata/profiler/metrics/static/min_length.py b/ingestion/src/metadata/profiler/metrics/static/min_length.py index 5d0d0b4e29d..e1cc0671ff4 100644 --- a/ingestion/src/metadata/profiler/metrics/static/min_length.py +++ b/ingestion/src/metadata/profiler/metrics/static/min_length.py @@ -58,7 +58,6 @@ class MinLength(StaticMetric): return None # pylint: disable=import-outside-toplevel - @_label def df_fn(self, df=None): """dataframe function""" from numpy import vectorize diff --git a/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py b/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py index 800617e8c39..670f1f5b15c 100644 --- a/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py @@ -55,7 +55,6 @@ class NotRegexCount(StaticMetric): ) ) - @_label def df_fn(self, df): """pandas function""" if not hasattr(self, "expression"): diff --git a/ingestion/src/metadata/profiler/metrics/static/null_count.py b/ingestion/src/metadata/profiler/metrics/static/null_count.py index 5678c90abd4..abb8cc04b2f 100644 --- a/ingestion/src/metadata/profiler/metrics/static/null_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/null_count.py @@ -50,7 +50,6 @@ class NullCount(StaticMetric): """sqlalchemy function""" return SumFn(case([(column(self.col.name).is_(None), 1)], else_=0)) - @_label def df_fn(self, df=None): """pandas function""" from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py b/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py index 8995e083a70..a867cdadff3 100644 --- a/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py @@ -52,7 +52,6 @@ class RegexCount(StaticMetric): case([(column(self.col.name).regexp_match(self.expression), 1)], else_=0) ) - @_label def df_fn(self, df): """pandas function""" diff --git a/ingestion/src/metadata/profiler/metrics/static/row_count.py b/ingestion/src/metadata/profiler/metrics/static/row_count.py index 7d8462f2147..be984462bb3 100644 --- a/ingestion/src/metadata/profiler/metrics/static/row_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/row_count.py @@ -48,7 +48,6 @@ class RowCount(StaticMetric): """sqlalchemy function""" return func.count() - @_label def df_fn(self, df=None): """pandas function""" from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/metrics/static/stddev.py b/ingestion/src/metadata/profiler/metrics/static/stddev.py index ead6570e4e1..34306267d53 100644 --- a/ingestion/src/metadata/profiler/metrics/static/stddev.py +++ b/ingestion/src/metadata/profiler/metrics/static/stddev.py @@ -90,7 +90,6 @@ class StdDev(StaticMetric): ) return None - @_label def df_fn(self, df=None): """pandas function""" from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/metrics/static/sum.py b/ingestion/src/metadata/profiler/metrics/static/sum.py index 4d9c22dcf4a..123d3911a77 100644 --- a/ingestion/src/metadata/profiler/metrics/static/sum.py +++ b/ingestion/src/metadata/profiler/metrics/static/sum.py @@ -42,7 +42,6 @@ class Sum(StaticMetric): return None - @_label def df_fn(self, df): """pandas function""" if is_quantifiable(self.col.type): diff --git a/ingestion/src/metadata/profiler/metrics/window/median.py b/ingestion/src/metadata/profiler/metrics/window/median.py index 936cb756c24..0c020df11cf 100644 --- a/ingestion/src/metadata/profiler/metrics/window/median.py +++ b/ingestion/src/metadata/profiler/metrics/window/median.py @@ -58,7 +58,6 @@ class Median(StaticMetric): ) return None - @_label def df_fn(self, df=None): """Dataframe function""" from pandas import DataFrame # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/profiler/interface/pandas/pandas_profiler_interface.py b/ingestion/src/metadata/profiler/profiler/interface/pandas/pandas_profiler_interface.py index d5137b5c64a..2ede0ad476b 100644 --- a/ingestion/src/metadata/profiler/profiler/interface/pandas/pandas_profiler_interface.py +++ b/ingestion/src/metadata/profiler/profiler/interface/pandas/pandas_profiler_interface.py @@ -19,7 +19,6 @@ from collections import defaultdict from datetime import datetime, timezone from typing import Dict, List -from pandas import DataFrame from sqlalchemy import Column from metadata.generated.schema.entity.data.table import DataType, TableData @@ -101,7 +100,7 @@ class PandasProfilerInterface(ProfilerProtocol, PandasInterfaceMixin): self, metric_type: str, metrics: List[Metrics], - dfs: List[DataFrame], + dfs: List, *args, **kwargs, ):