From 328658ebeaa36e5f9de5d4d562a7adaf020bd25a Mon Sep 17 00:00:00 2001 From: Suresh Srinivas Date: Sat, 28 Aug 2021 13:09:07 -0700 Subject: [PATCH 1/4] [WIP] profiler --- ingestion/examples/workflows/redshift.json | 6 +- ingestion/pipelines/mysql.json | 3 +- ingestion/requirements.txt | 3 +- ingestion/setup.py | 1 + .../ingestion/models/table_metadata.py | 47 +++ .../metadata/ingestion/source/sql_source.py | 39 ++- ingestion/src/metadata/utils/dataprofiler.py | 270 ++++++++++++++++++ 7 files changed, 360 insertions(+), 9 deletions(-) create mode 100644 ingestion/src/metadata/utils/dataprofiler.py diff --git a/ingestion/examples/workflows/redshift.json b/ingestion/examples/workflows/redshift.json index fd22db30e30..d71b831d36a 100644 --- a/ingestion/examples/workflows/redshift.json +++ b/ingestion/examples/workflows/redshift.json @@ -2,9 +2,9 @@ "source": { "type": "redshift", "config": { - "host_port": "cluster.name.region.redshift.amazonaws.com:5439", - "username": "username", - "password": "strong_password", + "host_port": "redshift-cluster-1.clot5cqn1cnb.us-west-2.redshift.amazonaws.com:5439", + "username": "awsuser", + "password": "focguC-kaqqe5-nepsok", "database": "warehouse", "service_name": "aws_redshift", "filter_pattern": { diff --git a/ingestion/pipelines/mysql.json b/ingestion/pipelines/mysql.json index 2b084c4cceb..0f1c54fa026 100644 --- a/ingestion/pipelines/mysql.json +++ b/ingestion/pipelines/mysql.json @@ -4,9 +4,10 @@ "config": { "username": "openmetadata_user", "password": "openmetadata_password", + "database": "openmetadata_db", "service_name": "local_mysql", "filter_pattern": { - "excludes": ["mysql.*", "information_schema.*"] + "excludes": ["mysql.*", "information_schema.*", "performance_schema.*", "sys.*"] } } }, diff --git a/ingestion/requirements.txt b/ingestion/requirements.txt index 07ad3a89aad..7cff288849a 100644 --- a/ingestion/requirements.txt +++ b/ingestion/requirements.txt @@ -16,4 +16,5 @@ confluent_kafka>=1.5.0 fastavro>=1.2.0 google~=3.0.0 okta~=2.0.0 -PyMySQL~=1.0.2 \ No newline at end of file +PyMySQL~=1.0.2 +great-expectations>=0.13.31 \ No newline at end of file diff --git a/ingestion/setup.py b/ingestion/setup.py index a30f0e3b667..0111d15a67a 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -64,6 +64,7 @@ base_requirements = { "sql-metadata~=2.0.0", "spacy==3.0.5", "requests~=2.25.1", + "great-expectations>=0.13.31", "en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web" } base_plugins = { diff --git a/ingestion/src/metadata/ingestion/models/table_metadata.py b/ingestion/src/metadata/ingestion/models/table_metadata.py index 6605471de76..eb01aa14842 100644 --- a/ingestion/src/metadata/ingestion/models/table_metadata.py +++ b/ingestion/src/metadata/ingestion/models/table_metadata.py @@ -258,3 +258,50 @@ class Dashboard(BaseModel): charts: List[str] service: EntityReference lastModified: int = None + + +class ValueFrequency(BaseModel): + """Profiler ValueFrequency""" + value: str + frequency: int + + +class Histogram(BaseModel): + """Histogram""" + boundaries: List[str] + heights: List[str] + + +class Quantile(BaseModel): + """Quantile""" + quantile: str + value: str + + +class DatasetColumnProfile(BaseModel): + """Dataset Column Profile stats """ + fqdn: str + unique_count: int = None + unique_proportion: int = None + null_count: int = None + null_proportion: int = None + min: str = None + max: str = None + mean: str = None + median: str = None + stddev: str = None + quantiles: List[Quantile] = None + distinct_value_frequencies: List[ValueFrequency] = None + histogram: List[Histogram] = None + sample_values: List[str] = None + + +class DatasetProfile(BaseModel): + """Dataset(table) stats""" + timestamp: int + table_name: str + row_count: int = None + col_count: int = None + col_profiles: List[DatasetColumnProfile] = None + + diff --git a/ingestion/src/metadata/ingestion/source/sql_source.py b/ingestion/src/metadata/ingestion/source/sql_source.py index b0f680a1f9b..d491dd36b50 100644 --- a/ingestion/src/metadata/ingestion/source/sql_source.py +++ b/ingestion/src/metadata/ingestion/source/sql_source.py @@ -29,7 +29,7 @@ from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.table import Table, Column, ColumnConstraint, TableType, TableData -from sqlalchemy import create_engine, inspect +from sqlalchemy import create_engine from sqlalchemy.engine.reflection import Inspector from sqlalchemy.sql import sqltypes as types from sqlalchemy.inspection import inspect @@ -37,8 +37,10 @@ from sqlalchemy.inspection import inspect from metadata.ingestion.api.common import IncludeFilterPattern, ConfigModel, Record from metadata.ingestion.api.common import WorkflowContext from metadata.ingestion.api.source import Source, SourceStatus +from metadata.ingestion.models.table_metadata import DatasetProfile from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig from metadata.utils.helpers import get_database_service_or_create +from metadata.utils.dataprofiler import DataProfiler logger: logging.Logger = logging.getLogger(__name__) @@ -72,6 +74,7 @@ class SQLConnectionConfig(ConfigModel): include_views: Optional[bool] = True include_tables: Optional[bool] = True generate_sample_data: Optional[bool] = True + data_profiler_enabled: Optional[bool] = True filter_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all() @abstractmethod @@ -152,8 +155,10 @@ def get_column_type(status: SQLSourceStatus, dataset_name: str, column_type: Any return type_class -class SQLSource(Source): + + +class SQLSource(Source): def __init__(self, config: SQLConnectionConfig, metadata_config: MetadataServerConfig, ctx: WorkflowContext): super().__init__(ctx) @@ -172,6 +177,9 @@ class SQLSource(Source): def create(cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext): pass + def _get_profiler_instance(self, inspector: Inspector) -> DataProfiler: + return DataProfiler(conn=inspector.bind, status=self.status) + def standardize_schema_table_names( self, schema: str, table: str ) -> Tuple[str, str]: @@ -211,7 +219,7 @@ class SQLSource(Source): schema, table_name = self.standardize_schema_table_names(schema, table_name) if not self.sql_config.filter_pattern.included(table_name): self.status.filter('{}.{}'.format(self.config.get_service_name(), table_name), - "Table pattern not allowed") + "Table pattern not allowed") continue self.status.scanned('{}.{}'.format(self.config.get_service_name(), table_name)) @@ -227,6 +235,11 @@ class SQLSource(Source): table_data = self.fetch_sample_data(schema, table_name) table_entity.sampleData = table_data + if self.config.data_profiler_enabled: + data_profiler = self._get_profiler_instance(inspector) + profile = self.run_data_profiler(data_profiler, table_name, schema) + logger.info(profile.json()) + table_and_db = OMetaDatabaseAndTable(table=table_entity, database=self._get_database(schema)) yield table_and_db except ValidationError as err: @@ -241,7 +254,7 @@ class SQLSource(Source): try: if not self.sql_config.filter_pattern.included(view_name): self.status.filter('{}.{}'.format(self.config.get_service_name(), view_name), - "View pattern not allowed") + "View pattern not allowed") continue try: view_definition = inspector.get_view_definition(view_name, schema) @@ -324,6 +337,24 @@ class SQLSource(Source): description = table_info["text"] return description + def run_data_profiler( + self, + profiler: DataProfiler, + table: str, + schema: str + ) -> DatasetProfile: + dataset_name = f"{schema}.{table}" + self.status.scanned(f"profile of {dataset_name}") + logger.info(f"Profiling {dataset_name} (this may take a while)") + profile = profiler.generate_profile( + pretty_name=dataset_name, + schema=schema, + table=table, + limit=50000, + offset=0) + logger.debug(f"Finished profiling {dataset_name}") + return profile + def close(self): if self.connection is not None: self.connection.close() diff --git a/ingestion/src/metadata/utils/dataprofiler.py b/ingestion/src/metadata/utils/dataprofiler.py new file mode 100644 index 00000000000..359598bd22c --- /dev/null +++ b/ingestion/src/metadata/utils/dataprofiler.py @@ -0,0 +1,270 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import contextlib +import dataclasses +import logging +import unittest.mock +import time +from typing import Any, Iterable, Optional, Callable, Tuple, TypeVar + +from great_expectations.core.expectation_validation_result import ( + ExpectationSuiteValidationResult, + ExpectationValidationResult, +) +from great_expectations.data_context import BaseDataContext +from great_expectations.data_context.types.base import ( + DataContextConfig, + DatasourceConfig, + InMemoryStoreBackendDefaults, +) +from great_expectations.datasource.sqlalchemy_datasource import SqlAlchemyDatasource + +from metadata.ingestion.api.source import SourceStatus +from metadata.ingestion.models.table_metadata import DatasetProfile, DatasetColumnProfile, Quantile, \ + Histogram, ValueFrequency + +logger: logging.Logger = logging.getLogger(__name__) + +T = TypeVar("T") +K = TypeVar("K") + + +def groupby_unsorted( + iterable: Iterable[T], key: Callable[[T], K] +) -> Iterable[Tuple[K, Iterable[T]]]: + """The default itertools.groupby() requires that the iterable is already sorted by the key. + This method is similar to groupby() but without the pre-sorted requirement.""" + + values = collections.defaultdict(list) + for v in iterable: + values[key(v)].append(v) + return values.items() + + +@contextlib.contextmanager +def _properly_init_datasource(conn): + underlying_datasource_init = SqlAlchemyDatasource.__init__ + + def sqlalchemy_datasource_init( + self: SqlAlchemyDatasource, *args: Any, **kwargs: Any + ) -> None: + underlying_datasource_init(self, *args, **kwargs, engine=conn) + self.drivername = conn.dialect.name + del self._datasource_config["engine"] + + with unittest.mock.patch( + "great_expectations.datasource.sqlalchemy_datasource.SqlAlchemyDatasource.__init__", + sqlalchemy_datasource_init, + ), unittest.mock.patch( + "great_expectations.data_context.store.validations_store.ValidationsStore.set" + ): + yield + + +@dataclasses.dataclass +class DataProfiler: + data_context: BaseDataContext + status: SourceStatus + datasource_name: str = "om_sqlalchemy_datasource" + + def __init__(self, conn, status): + self.conn = conn + self.status = status + + data_context_config = DataContextConfig( + datasources={ + self.datasource_name: DatasourceConfig( + class_name="SqlAlchemyDatasource", + credentials={ + "url": self.conn.engine.url, + }, + ) + }, + store_backend_defaults=InMemoryStoreBackendDefaults(), + anonymous_usage_statistics={ + "enabled": False, + }, + ) + + with _properly_init_datasource(self.conn): + self.data_context = BaseDataContext(project_config=data_context_config) + + def generate_profile( + self, + pretty_name: str, + schema: str = None, + table: str = None, + limit: int = None, + offset: int = None, + **kwargs: Any, + ) -> DatasetProfile: + with _properly_init_datasource(self.conn): + evrs = self._profile_data_asset( + { + "schema": schema, + "table": table, + "limit": limit, + "offset": offset, + **kwargs, + }, + pretty_name=pretty_name, + ) + profile = self._convert_evrs_to_profile(evrs, pretty_name=pretty_name) + return profile + + def _profile_data_asset( + self, + batch_kwargs: dict, + pretty_name: str, + ) -> ExpectationSuiteValidationResult: + # Internally, this uses the GE dataset profiler: + # great_expectations.profile.basic_dataset_profiler.BasicDatasetProfiler + + profile_results = self.data_context.profile_data_asset( + self.datasource_name, + batch_kwargs={ + "datasource": self.datasource_name, + **batch_kwargs, + }, + ) + assert profile_results["success"] + + assert len(profile_results["results"]) == 1 + _suite, evrs = profile_results["results"][0] + return evrs + + @staticmethod + def _get_column_from_evr(evr: ExpectationValidationResult) -> Optional[str]: + return evr.expectation_config.kwargs.get("column") + + def _convert_evrs_to_profile( + self, evrs: ExpectationSuiteValidationResult, pretty_name: str + ) -> DatasetProfile: + profile = None + column_profiles = [] + for col, evrs_for_col in groupby_unsorted( + evrs.results, key=self._get_column_from_evr + ): + if col is None: + profile = self._handle_convert_table_evrs(evrs_for_col, pretty_name=pretty_name) + else: + column_profile = self._handle_convert_column_evrs(col, evrs_for_col, pretty_name=pretty_name) + column_profiles.append(column_profile) + + if profile is not None: + profile.col_profiles = column_profiles + return profile + + def _handle_convert_table_evrs( + self, + table_evrs: Iterable[ExpectationValidationResult], + pretty_name: str, + ) -> DatasetProfile: + logger.info("generating table stats") + profile = DatasetProfile(timestamp=round(time.time() * 1000), table_name=pretty_name) + for evr in table_evrs: + exp: str = evr.expectation_config.expectation_type + res: dict = evr.result + if exp == "expect_table_row_count_to_be_between": + profile.row_count = res["observed_value"] + elif exp == "expect_table_columns_to_match_ordered_list": + profile.col_count = len(res["observed_value"]) + else: + self.status.warning( + f"profile of {pretty_name}", f"unknown table mapper {exp}" + ) + return profile + + def _handle_convert_column_evrs( + self, + column: str, + col_evrs: Iterable[ExpectationValidationResult], + pretty_name: str, + ) -> DatasetColumnProfile: + logger.info(f"Generating Column Stats for {column}") + column_profile = DatasetColumnProfile(fqdn=column) + for evr in col_evrs: + exp: str = evr.expectation_config.expectation_type + res: dict = evr.result + if not res: + self.status.warning( + f"profile of {pretty_name}", f"{exp} did not yield any results" + ) + continue + + if exp == "expect_column_unique_value_count_to_be_between": + column_profile.unique_count = res["observed_value"] + elif exp == "expect_column_proportion_of_unique_values_to_be_between": + column_profile.unique_proportion = res["observed_value"] + elif exp == "expect_column_values_to_not_be_null": + column_profile.null_count = res["unexpected_count"] + if ( + "unexpected_percent" in res + and res["unexpected_percent"] is not None + ): + column_profile.null_proportion = res["unexpected_percent"] / 100 + elif exp == "expect_column_values_to_not_match_regex": + pass + elif exp == "expect_column_mean_to_be_between": + column_profile.mean = str(res["observed_value"]) + elif exp == "expect_column_min_to_be_between": + column_profile.min = str(res["observed_value"]) + elif exp == "expect_column_max_to_be_between": + column_profile.max = str(res["observed_value"]) + elif exp == "expect_column_median_to_be_between": + column_profile.median = str(res["observed_value"]) + elif exp == "expect_column_stdev_to_be_between": + column_profile.stddev = str(res["observed_value"]) + elif exp == "expect_column_quantile_values_to_be_between": + if "observed_value" in res: + column_profile.quantiles = [ + Quantile(quantile=str(quantile), value=str(value)) + for quantile, value in zip( + res["observed_value"]["quantiles"], + res["observed_value"]["values"], + ) + ] + elif exp == "expect_column_values_to_be_in_set": + column_profile.sample_values = [ + str(v) for v in res["partial_unexpected_list"] + ] + elif exp == "expect_column_kl_divergence_to_be_less_than": + if "details" in res and "observed_partition" in res["details"]: + partition = res["details"]["observed_partition"] + column_profile.histogram = Histogram( + [str(v) for v in partition["bins"]], + [ + partition["tail_weights"][0], + *partition["weights"], + partition["tail_weights"][1], + ], + ) + elif exp == "expect_column_distinct_values_to_be_in_set": + if "details" in res and "value_counts" in res["details"]: + column_profile.distinct_value_frequencies = [ + ValueFrequency(value=str(value), frequency=count) + for value, count in res["details"]["value_counts"].items() + ] + elif exp == "expect_column_values_to_be_in_type_list": + pass + elif exp == "expect_column_values_to_be_unique": + pass + else: + self.status.warning( + f"profile of {pretty_name}", + f"warning: unknown column mapper {exp} in col {column}", + ) + return column_profile From 2369ddc8587621e65c4c6717965447765172fc8a Mon Sep 17 00:00:00 2001 From: Sriharsha Chintalapani Date: Wed, 8 Sep 2021 23:55:48 -0700 Subject: [PATCH 2/4] [WIP] Fix #446: Add DataProfiler to ingestion and APIs --- .../catalog/jdbi3/TableRepository.java | 26 ++ .../resources/databases/TableResource.java | 16 ++ .../json/schema/entity/data/table.json | 78 +++++ .../databases/TableResourceTest.java | 18 ++ ingestion/setup.py | 3 +- .../metadata/generated/data/tags/__init__.py | 2 +- .../generated/data/tags/personalDataTags.py | 4 +- .../metadata/generated/data/tags/piiTags.py | 4 +- .../metadata/generated/data/tags/tierTags.py | 2 +- .../metadata/generated/data/tags/userTags.py | 2 +- .../metadata/generated/schema/api/__init__.py | 2 +- .../generated/schema/api/catalogVersion.py | 2 +- .../generated/schema/api/data/__init__.py | 2 +- .../generated/schema/api/data/createChart.py | 2 +- .../schema/api/data/createDashboard.py | 2 +- .../schema/api/data/createDatabase.py | 2 +- .../generated/schema/api/data/createTable.py | 2 +- .../generated/schema/api/data/createTopic.py | 5 +- .../generated/schema/api/feed/__init__.py | 2 +- .../generated/schema/api/feed/createThread.py | 2 +- .../generated/schema/api/services/__init__.py | 2 +- .../api/services/createDashboardService.py | 2 +- .../api/services/createDatabaseService.py | 2 +- .../api/services/createMessagingService.py | 2 +- .../api/services/updateDashboardService.py | 2 +- .../api/services/updateDatabaseService.py | 2 +- .../api/services/updateMessagingService.py | 2 +- .../metadata/generated/schema/api/setOwner.py | 2 +- .../generated/schema/api/tags/__init__.py | 2 +- .../generated/schema/api/tags/createTag.py | 2 +- .../schema/api/tags/createTagCategory.py | 2 +- .../generated/schema/api/teams/__init__.py | 2 +- .../generated/schema/api/teams/createTeam.py | 2 +- .../generated/schema/api/teams/createUser.py | 2 +- .../generated/schema/entity/__init__.py | 2 +- .../metadata/generated/schema/entity/bots.py | 2 +- .../generated/schema/entity/data/__init__.py | 2 +- .../generated/schema/entity/data/chart.py | 2 +- .../generated/schema/entity/data/dashboard.py | 2 +- .../generated/schema/entity/data/database.py | 2 +- .../generated/schema/entity/data/metrics.py | 2 +- .../generated/schema/entity/data/pipeline.py | 2 +- .../generated/schema/entity/data/report.py | 2 +- .../generated/schema/entity/data/table.py | 50 +++- .../generated/schema/entity/data/topic.py | 5 +- .../generated/schema/entity/feed/__init__.py | 2 +- .../generated/schema/entity/feed/thread.py | 2 +- .../schema/entity/services/__init__.py | 2 +- .../entity/services/dashboardService.py | 2 +- .../schema/entity/services/databaseService.py | 2 +- .../entity/services/messagingService.py | 2 +- .../generated/schema/entity/tags/__init__.py | 2 +- .../schema/entity/tags/tagCategory.py | 2 +- .../generated/schema/entity/teams/__init__.py | 2 +- .../generated/schema/entity/teams/team.py | 2 +- .../generated/schema/entity/teams/user.py | 2 +- .../generated/schema/type/__init__.py | 2 +- .../generated/schema/type/auditLog.py | 2 +- .../metadata/generated/schema/type/basic.py | 2 +- .../schema/type/collectionDescriptor.py | 2 +- .../generated/schema/type/dailyCount.py | 2 +- .../generated/schema/type/entityReference.py | 2 +- .../generated/schema/type/entityUsage.py | 2 +- .../generated/schema/type/jdbcConnection.py | 2 +- .../metadata/generated/schema/type/paging.py | 23 ++ .../metadata/generated/schema/type/profile.py | 2 +- .../generated/schema/type/schedule.py | 2 +- .../generated/schema/type/tagLabel.py | 2 +- .../generated/schema/type/usageDetails.py | 2 +- .../metadata/ingestion/source/sql_source.py | 58 ++-- ingestion/src/metadata/profiler/__init__.py | 0 .../src/metadata/profiler/dataprofiler.py | 207 ++++++++++++++ ingestion/src/metadata/profiler/util.py | 30 ++ ingestion/src/metadata/utils/dataprofiler.py | 270 ------------------ 74 files changed, 545 insertions(+), 368 deletions(-) create mode 100644 ingestion/src/metadata/generated/schema/type/paging.py create mode 100644 ingestion/src/metadata/profiler/__init__.py create mode 100644 ingestion/src/metadata/profiler/dataprofiler.py create mode 100644 ingestion/src/metadata/profiler/util.py delete mode 100644 ingestion/src/metadata/utils/dataprofiler.py diff --git a/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java b/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java index e527805d8a9..223232a6c90 100644 --- a/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java +++ b/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java @@ -36,6 +36,7 @@ import org.openmetadata.catalog.type.EntityReference; import org.openmetadata.catalog.type.JoinedWith; import org.openmetadata.catalog.type.TableData; import org.openmetadata.catalog.type.TableJoins; +import org.openmetadata.catalog.type.TableProfile; import org.openmetadata.catalog.type.TagLabel; import org.openmetadata.catalog.util.EntityUtil; import org.openmetadata.catalog.util.EntityUtil.Fields; @@ -273,6 +274,25 @@ public abstract class TableRepository { JsonUtils.pojoToJson(tableData)); } + @Transaction + public void addTableProfileData(String tableId, List tableProfiles) throws IOException { + // Validate the request content + Table table = EntityUtil.validate(tableId, tableDAO().findById(tableId), Table.class); + List storedTableProfiles = getTableProfile(table); + Map storedMapTableProfiles = new HashMap<>(); + for (TableProfile profile: storedTableProfiles) { + storedMapTableProfiles.put(profile.getProfileDate(), profile); + } + + for (TableProfile profile: tableProfiles) { + storedMapTableProfiles.put(profile.getProfileDate(), profile); + } + List updatedProfiles = new ArrayList(storedMapTableProfiles.values()); + + entityExtensionDAO().insert(tableId, "table.tableProfile", "tableProfile", + JsonUtils.pojoToJson(updatedProfiles)); + } + @Transaction public void deleteFollower(String tableId, String userId) { EntityUtil.validateUser(userDAO(), userId); @@ -436,6 +456,7 @@ public abstract class TableRepository { table.setJoins(fields.contains("joins") ? getJoins(table) : null); table.setSampleData(fields.contains("sampleData") ? getSampleData(table) : null); table.setViewDefinition(fields.contains("viewDefinition") ? table.getViewDefinition() : null); + table.setTableProfile(fields.contains("tableProfile") ? getTableProfile(table): null); return table; } @@ -650,6 +671,11 @@ public abstract class TableRepository { TableData.class); } + private List getTableProfile(Table table) throws IOException { + return JsonUtils.readObjects(entityExtensionDAO().getExtension(table.getId().toString(), + "table.tableProfile"), + TableProfile.class); + } public interface TableDAO { @SqlUpdate("INSERT INTO table_entity (json) VALUES (:json)") diff --git a/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java b/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java index 057086e5dda..fca74404e86 100644 --- a/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java +++ b/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java @@ -35,6 +35,7 @@ import org.openmetadata.catalog.resources.Collection; import org.openmetadata.catalog.security.CatalogAuthorizer; import org.openmetadata.catalog.security.SecurityUtil; import org.openmetadata.catalog.type.EntityReference; +import org.openmetadata.catalog.type.TableProfile; import org.openmetadata.catalog.util.EntityUtil; import org.openmetadata.catalog.util.EntityUtil.Fields; import org.openmetadata.catalog.util.RestUtil; @@ -349,6 +350,21 @@ public class TableResource { return addHref(uriInfo, table); } + @PUT + @Path("/{id}/tableProfile") + @Operation(summary = "Add table profile data", tags = "tables", + description = "Add table profile data to the table.") + public Table addDataProfiler(@Context UriInfo uriInfo, + @Context SecurityContext securityContext, + @Parameter(description = "Id of the table", schema = @Schema(type = "string")) + @PathParam("id") String id, List tableProfiles) throws IOException, ParseException { + SecurityUtil.checkAdminOrBotRole(authorizer, securityContext); + Fields fields = new Fields(FIELD_LIST, "tableProfile"); + dao.addTableProfileData(id, tableProfiles); + Table table = dao.get(id, fields); + return addHref(uriInfo, table); + } + @DELETE @Path("/{id}/followers/{userId}") @Operation(summary = "Remove a follower", tags = "tables", diff --git a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json index 9d194c52fb3..b021d385345 100644 --- a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json +++ b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json @@ -240,6 +240,76 @@ } }, "additionalProperties": false + }, + "columnProfile": { + "type": "object", + "javaType": "org.openmetadata.catalog.type.ColumnProfile", + "description": "This schema defines the type to capture the table's column profile.", + "properties": { + "name": { + "description": "Column Name.", + "type": "string" + }, + "uniqueCount": { + "description": "No. of unique values in the column", + "type": "number" + }, + "uniqueProportion": { + "description": "Proportion of number of unique values in a column", + "type": "number" + }, + "nullCount": { + "description": "No.of null values in a column", + "type": "number" + }, + "nullProportion": { + "description": "No.of null value proportion in columns", + "type": "number" + }, + "min": { + "description": "Minimum value in a column. Applies to column with number values. It will be 0 for any other types", + "type": "number" + }, + "max": { + "description": "Maximum value in a column. Applies to column with number values. It will be 0 for any other types", + "type": "number" + }, + "mean": { + "description": "Avg value in a column. Applies to column with number values. It will be 0 for any other types", + "type": "number" + }, + "stddev": { + "description": "Standard deviation of a column.", + "type": "number" + } + } + }, + "tableProfile": { + "type": "object", + "javaType": "org.openmetadata.catalog.type.TableProfile", + "description": "This schema defines the type to capture the table's data profile.", + "properties": { + "profileDate": { + "description": "Data one which profile is taken.", + "$ref": "../../type/basic.json#/definitions/date" + }, + "columnCount": { + "description": "No.of columns in the table.", + "type": "number" + }, + "rowCount": { + "description": "No.of rows in the table.", + "type": "number" + }, + "columnProfile": { + "description": "List of local column profiles of the table.", + "type": "array", + "items": { + "$ref": "#/definitions/columnProfile" + } + } + }, + "additionalProperties": false } }, "properties": { @@ -321,6 +391,14 @@ "description": "Sample data for a table.", "$ref": "#/definitions/tableData", "default": null + }, + "tableProfile": { + "description": "Data profile for a table.", + "type": "array", + "items": { + "$ref": "#/definitions/tableProfile" + }, + "default": null } }, "required": [ diff --git a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java index 861a99a6560..962422dc813 100644 --- a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java +++ b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java @@ -610,6 +610,24 @@ public class TableResourceTest extends CatalogApplicationTest { "TableType View, SecureView or MaterializedView"); } + @Test + public void put_tableProfile_200(TestInfo test) throws HttpResponseException { + Table table = createAndCheckTable(create(test), adminAuthHeaders()); + List columns = Arrays.asList("c1", "c2", "c3"); + + // Add 3 rows of sample data for 3 columns + List> rows = Arrays.asList(Arrays.asList("c1Value1", 1, true), + Arrays.asList("c1Value2", null, false), + Arrays.asList("c1Value3", 3, true)); + + TableData tableData = new TableData().withColumns(columns).withRows(rows); + putSampleData(table.getId(), tableData, adminAuthHeaders()); + + table = getTable(table.getId(), "sampleData", adminAuthHeaders()); + assertEquals(tableData, table.getSampleData()); + } + + @Test public void get_nonExistentTable_404_notFound() { HttpResponseException exception = assertThrows(HttpResponseException.class, () -> diff --git a/ingestion/setup.py b/ingestion/setup.py index 0111d15a67a..4eab765c79e 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -58,8 +58,7 @@ base_requirements = { "email-validator>=1.0.3", "wheel~=0.36.2", "python-jose==3.3.0", - "okta==1.7.0", - "pandas~=1.3.1", + "okta>=1.7.0", "sqlalchemy>=1.3.24", "sql-metadata~=2.0.0", "spacy==3.0.5", diff --git a/ingestion/src/metadata/generated/data/tags/__init__.py b/ingestion/src/metadata/generated/data/tags/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/data/tags/__init__.py +++ b/ingestion/src/metadata/generated/data/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/data/tags/personalDataTags.py b/ingestion/src/metadata/generated/data/tags/personalDataTags.py index f561667dd81..9ad3620a01e 100644 --- a/ingestion/src/metadata/generated/data/tags/personalDataTags.py +++ b/ingestion/src/metadata/generated/data/tags/personalDataTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/personalDataTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations @@ -12,5 +12,5 @@ from pydantic import BaseModel, Field class Model(BaseModel): __root__: Any = Field( ..., - description='Tags related classifying **Personal data** as defined by **GDPR.**\n\n\n\n_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your GDPR policy document in this description._', + description='Tags related classifying **Personal data** as defined by **GDPR.**

_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your GDPR policy document in this description._', ) diff --git a/ingestion/src/metadata/generated/data/tags/piiTags.py b/ingestion/src/metadata/generated/data/tags/piiTags.py index 8632b988d83..3c92d23b2b7 100644 --- a/ingestion/src/metadata/generated/data/tags/piiTags.py +++ b/ingestion/src/metadata/generated/data/tags/piiTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/piiTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations @@ -12,5 +12,5 @@ from pydantic import BaseModel, Field class Model(BaseModel): __root__: Any = Field( ..., - description='Personally Identifiable Information information that, when used alone or with other relevant data, can identify an individual.\n\n\n\n_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your PII policy document in this description._', + description='Personally Identifiable Information information that, when used alone or with other relevant data, can identify an individual.

_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your PII policy document in this description._', ) diff --git a/ingestion/src/metadata/generated/data/tags/tierTags.py b/ingestion/src/metadata/generated/data/tags/tierTags.py index 81cab8a2d3e..5558bd819f7 100644 --- a/ingestion/src/metadata/generated/data/tags/tierTags.py +++ b/ingestion/src/metadata/generated/data/tags/tierTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/tierTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/data/tags/userTags.py b/ingestion/src/metadata/generated/data/tags/userTags.py index 01900c3ecea..3e574d93006 100644 --- a/ingestion/src/metadata/generated/data/tags/userTags.py +++ b/ingestion/src/metadata/generated/data/tags/userTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/userTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/__init__.py b/ingestion/src/metadata/generated/schema/api/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/api/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/catalogVersion.py b/ingestion/src/metadata/generated/schema/api/catalogVersion.py index 22261343910..1e7c3c03b9b 100644 --- a/ingestion/src/metadata/generated/schema/api/catalogVersion.py +++ b/ingestion/src/metadata/generated/schema/api/catalogVersion.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/catalogVersion.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/__init__.py b/ingestion/src/metadata/generated/schema/api/data/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/api/data/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/data/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/data/createChart.py b/ingestion/src/metadata/generated/schema/api/data/createChart.py index 275497e1cd9..3d01f973e93 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createChart.py +++ b/ingestion/src/metadata/generated/schema/api/data/createChart.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createChart.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createDashboard.py b/ingestion/src/metadata/generated/schema/api/data/createDashboard.py index eb54e1a8ce6..5089a87df7d 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createDashboard.py +++ b/ingestion/src/metadata/generated/schema/api/data/createDashboard.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createDashboard.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createDatabase.py b/ingestion/src/metadata/generated/schema/api/data/createDatabase.py index 873c2d4e0cc..4d029474b1f 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createDatabase.py +++ b/ingestion/src/metadata/generated/schema/api/data/createDatabase.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createDatabase.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createTable.py b/ingestion/src/metadata/generated/schema/api/data/createTable.py index 1fbcce9bc96..bde335327e5 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createTable.py +++ b/ingestion/src/metadata/generated/schema/api/data/createTable.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createTable.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createTopic.py b/ingestion/src/metadata/generated/schema/api/data/createTopic.py index 97307991d17..ae8e49878ba 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createTopic.py +++ b/ingestion/src/metadata/generated/schema/api/data/createTopic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createTopic.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations @@ -38,8 +38,7 @@ class CreateTopic(BaseModel): description='Topic clean up policy. For Kafka - `cleanup.policy` configuration.', ) replicationFactor: Optional[int] = Field( - None, - description='Replication Factor in integer (more than 1).', + None, description='Replication Factor in integer (more than 1).' ) retentionTime: Optional[float] = Field( None, diff --git a/ingestion/src/metadata/generated/schema/api/feed/__init__.py b/ingestion/src/metadata/generated/schema/api/feed/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/api/feed/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/feed/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/feed/createThread.py b/ingestion/src/metadata/generated/schema/api/feed/createThread.py index 8a09d019e7f..54c19c6bfc9 100644 --- a/ingestion/src/metadata/generated/schema/api/feed/createThread.py +++ b/ingestion/src/metadata/generated/schema/api/feed/createThread.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/feed/createThread.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/__init__.py b/ingestion/src/metadata/generated/schema/api/services/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/api/services/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/services/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py b/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py index 39a5bfd2413..375beecf1b9 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createDashboardService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py b/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py index 260df61a82c..c364749f21a 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createDatabaseService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py b/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py index e318a6d16bf..b5831d4f7a6 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createMessagingService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py b/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py index 560493feb7b..e5096c6234e 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateDashboardService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py b/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py index 031d2770e37..85d44d46ab8 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateDatabaseService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py b/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py index ce64422bdde..d3aa0d22082 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateMessagingService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/setOwner.py b/ingestion/src/metadata/generated/schema/api/setOwner.py index 150f08d2156..4b98e3da2ad 100644 --- a/ingestion/src/metadata/generated/schema/api/setOwner.py +++ b/ingestion/src/metadata/generated/schema/api/setOwner.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/setOwner.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/tags/__init__.py b/ingestion/src/metadata/generated/schema/api/tags/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/tags/createTag.py b/ingestion/src/metadata/generated/schema/api/tags/createTag.py index 5a4bcf9558b..e1cf6c6148b 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/createTag.py +++ b/ingestion/src/metadata/generated/schema/api/tags/createTag.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/tags/createTag.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py b/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py index 8542311d5bd..3fcde21259d 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py +++ b/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/tags/createTagCategory.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/teams/__init__.py b/ingestion/src/metadata/generated/schema/api/teams/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/teams/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/teams/createTeam.py b/ingestion/src/metadata/generated/schema/api/teams/createTeam.py index ee4c92e71fc..5095eddee3e 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/createTeam.py +++ b/ingestion/src/metadata/generated/schema/api/teams/createTeam.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/teams/createTeam.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/teams/createUser.py b/ingestion/src/metadata/generated/schema/api/teams/createUser.py index 023cd75056d..106675a3c82 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/createUser.py +++ b/ingestion/src/metadata/generated/schema/api/teams/createUser.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/teams/createUser.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/__init__.py b/ingestion/src/metadata/generated/schema/entity/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/entity/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/bots.py b/ingestion/src/metadata/generated/schema/entity/bots.py index ebe1d06cf3d..106b6bb98fd 100644 --- a/ingestion/src/metadata/generated/schema/entity/bots.py +++ b/ingestion/src/metadata/generated/schema/entity/bots.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/bots.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/__init__.py b/ingestion/src/metadata/generated/schema/entity/data/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/data/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/data/chart.py b/ingestion/src/metadata/generated/schema/entity/data/chart.py index 249c82a46bf..37be9ddb313 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/chart.py +++ b/ingestion/src/metadata/generated/schema/entity/data/chart.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/chart.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/dashboard.py b/ingestion/src/metadata/generated/schema/entity/data/dashboard.py index a0729f64840..bf10e03d931 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/dashboard.py +++ b/ingestion/src/metadata/generated/schema/entity/data/dashboard.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/dashboard.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/database.py b/ingestion/src/metadata/generated/schema/entity/data/database.py index c49d76ce02d..50a7a7771ad 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/database.py +++ b/ingestion/src/metadata/generated/schema/entity/data/database.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/database.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/metrics.py b/ingestion/src/metadata/generated/schema/entity/data/metrics.py index 5fc27a3a1e6..fbcd44878ab 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/metrics.py +++ b/ingestion/src/metadata/generated/schema/entity/data/metrics.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/metrics.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/pipeline.py b/ingestion/src/metadata/generated/schema/entity/data/pipeline.py index b4d39be0f92..90a1436c900 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/pipeline.py +++ b/ingestion/src/metadata/generated/schema/entity/data/pipeline.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/pipeline.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/report.py b/ingestion/src/metadata/generated/schema/entity/data/report.py index 5b4b609584e..8e2edcc8362 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/report.py +++ b/ingestion/src/metadata/generated/schema/entity/data/report.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/report.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/table.py b/ingestion/src/metadata/generated/schema/entity/data/table.py index c5d369cda01..d17f2707ca4 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/table.py +++ b/ingestion/src/metadata/generated/schema/entity/data/table.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/table.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations @@ -124,6 +124,35 @@ class TableData(BaseModel): ) +class ColumnProfile(BaseModel): + name: Optional[str] = Field(None, description='Column Name.') + uniqueCount: Optional[float] = Field( + None, description='No. of unique values in the column' + ) + uniqueProportion: Optional[float] = Field( + None, description='Proportion of number of unique values in a column' + ) + nullCount: Optional[float] = Field( + None, description='No.of null values in a column' + ) + nullProportion: Optional[float] = Field( + None, description='No.of null value proportion in columns' + ) + min: Optional[float] = Field( + None, + description='Minimum value in a column. Applies to column with number values. It will be 0 for any other types', + ) + max: Optional[float] = Field( + None, + description='Maximum value in a column. Applies to column with number values. It will be 0 for any other types', + ) + mean: Optional[float] = Field( + None, + description='Avg value in a column. Applies to column with number values. It will be 0 for any other types', + ) + stddev: Optional[float] = Field(None, description='Standard deviation of a column.') + + class TableJoins(BaseModel): class Config: extra = Extra.forbid @@ -135,6 +164,22 @@ class TableJoins(BaseModel): columnJoins: Optional[List[ColumnJoins]] = None +class TableProfile(BaseModel): + class Config: + extra = Extra.forbid + + profileDate: Optional[basic.Date] = Field( + None, description='Data one which profile is taken.' + ) + columnCount: Optional[float] = Field( + None, description='No.of columns in the table.' + ) + rowCount: Optional[float] = Field(None, description='No.of rows in the table.') + column_profile: Optional[List[ColumnProfile]] = Field( + None, description='List of local column profiles of the table.' + ) + + class Column(BaseModel): name: ColumnName columnDataType: ColumnDataType = Field( @@ -194,3 +239,6 @@ class Table(BaseModel): sampleData: Optional[TableData] = Field( None, description='Sample data for a table.' ) + dataProfile: Optional[TableProfile] = Field( + None, description='Data profile for a table.' + ) diff --git a/ingestion/src/metadata/generated/schema/entity/data/topic.py b/ingestion/src/metadata/generated/schema/entity/data/topic.py index 78e16b8b613..527bcef2a19 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/topic.py +++ b/ingestion/src/metadata/generated/schema/entity/data/topic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/topic.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations @@ -64,6 +64,9 @@ class Topic(BaseModel): None, description='Retention time in milliseconds. For Kafka - `retention.ms` configuration.', ) + replicationFactor: Optional[int] = Field( + None, description='Replication Factor in integer (more than 1).' + ) maximumMessageSize: Optional[int] = Field( None, description='Maximum message size in bytes. For Kafka - `max.message.bytes` configuration.', diff --git a/ingestion/src/metadata/generated/schema/entity/feed/__init__.py b/ingestion/src/metadata/generated/schema/entity/feed/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/entity/feed/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/feed/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/feed/thread.py b/ingestion/src/metadata/generated/schema/entity/feed/thread.py index 90d10fe75e7..cc853bd5ed0 100644 --- a/ingestion/src/metadata/generated/schema/entity/feed/thread.py +++ b/ingestion/src/metadata/generated/schema/entity/feed/thread.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/feed/thread.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/__init__.py b/ingestion/src/metadata/generated/schema/entity/services/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/services/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py b/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py index 2e7dcb13ba5..b73c0cb1b67 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/dashboardService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/databaseService.py b/ingestion/src/metadata/generated/schema/entity/services/databaseService.py index 6969ce9e78d..ce59c700905 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/databaseService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/databaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/databaseService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/messagingService.py b/ingestion/src/metadata/generated/schema/entity/services/messagingService.py index 4d9887a22af..e257a35d44f 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/messagingService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/messagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/messagingService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/tags/__init__.py b/ingestion/src/metadata/generated/schema/entity/tags/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/entity/tags/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py b/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py index 31183207f94..5e717673761 100644 --- a/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py +++ b/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/tags/tagCategory.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/teams/__init__.py b/ingestion/src/metadata/generated/schema/entity/teams/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/teams/team.py b/ingestion/src/metadata/generated/schema/entity/teams/team.py index 423fa81d5db..fec1581c10f 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/team.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/team.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/teams/team.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/teams/user.py b/ingestion/src/metadata/generated/schema/entity/teams/user.py index 102defbb41e..ae60bd0dcce 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/user.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/user.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/teams/user.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/__init__.py b/ingestion/src/metadata/generated/schema/type/__init__.py index 72cdaaed7c3..0f1c77f35c0 100644 --- a/ingestion/src/metadata/generated/schema/type/__init__.py +++ b/ingestion/src/metadata/generated/schema/type/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 diff --git a/ingestion/src/metadata/generated/schema/type/auditLog.py b/ingestion/src/metadata/generated/schema/type/auditLog.py index 9c15806268b..2d665b6292c 100644 --- a/ingestion/src/metadata/generated/schema/type/auditLog.py +++ b/ingestion/src/metadata/generated/schema/type/auditLog.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/auditLog.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/basic.py b/ingestion/src/metadata/generated/schema/type/basic.py index 8019c8a7856..dd852384b89 100644 --- a/ingestion/src/metadata/generated/schema/type/basic.py +++ b/ingestion/src/metadata/generated/schema/type/basic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/basic.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py b/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py index 35e35acd29b..5c7d19d512f 100644 --- a/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py +++ b/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/collectionDescriptor.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/dailyCount.py b/ingestion/src/metadata/generated/schema/type/dailyCount.py index fafa104390b..2b59d88b82a 100644 --- a/ingestion/src/metadata/generated/schema/type/dailyCount.py +++ b/ingestion/src/metadata/generated/schema/type/dailyCount.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/dailyCount.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/entityReference.py b/ingestion/src/metadata/generated/schema/type/entityReference.py index 55c3d90ef1f..f85aeab87d7 100644 --- a/ingestion/src/metadata/generated/schema/type/entityReference.py +++ b/ingestion/src/metadata/generated/schema/type/entityReference.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/entityReference.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/entityUsage.py b/ingestion/src/metadata/generated/schema/type/entityUsage.py index a3782be7172..e1d03c36ea6 100644 --- a/ingestion/src/metadata/generated/schema/type/entityUsage.py +++ b/ingestion/src/metadata/generated/schema/type/entityUsage.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/entityUsage.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/jdbcConnection.py b/ingestion/src/metadata/generated/schema/type/jdbcConnection.py index bc401ab98a0..e32a15b5cd1 100644 --- a/ingestion/src/metadata/generated/schema/type/jdbcConnection.py +++ b/ingestion/src/metadata/generated/schema/type/jdbcConnection.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/jdbcConnection.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/paging.py b/ingestion/src/metadata/generated/schema/type/paging.py new file mode 100644 index 00000000000..a35b5b7cccd --- /dev/null +++ b/ingestion/src/metadata/generated/schema/type/paging.py @@ -0,0 +1,23 @@ +# generated by datamodel-codegen: +# filename: schema/type/paging.json +# timestamp: 2021-09-09T06:21:38+00:00 + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, Field + + +class Paging(BaseModel): + before: Optional[str] = Field( + None, + description='Before cursor used for getting the previous page (see API pagination for details).', + ) + after: Optional[str] = Field( + None, + description='After cursor used for getting the next page (see API pagination for details).', + ) + total: int = Field( + ..., description='Total number of entries available to page through.' + ) diff --git a/ingestion/src/metadata/generated/schema/type/profile.py b/ingestion/src/metadata/generated/schema/type/profile.py index 701fb2c1693..f39746e7809 100644 --- a/ingestion/src/metadata/generated/schema/type/profile.py +++ b/ingestion/src/metadata/generated/schema/type/profile.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/profile.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/schedule.py b/ingestion/src/metadata/generated/schema/type/schedule.py index bfaa72c8296..d51c9f38420 100644 --- a/ingestion/src/metadata/generated/schema/type/schedule.py +++ b/ingestion/src/metadata/generated/schema/type/schedule.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/schedule.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/tagLabel.py b/ingestion/src/metadata/generated/schema/type/tagLabel.py index e6b3b682e18..b0cde5f16f6 100644 --- a/ingestion/src/metadata/generated/schema/type/tagLabel.py +++ b/ingestion/src/metadata/generated/schema/type/tagLabel.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/tagLabel.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/usageDetails.py b/ingestion/src/metadata/generated/schema/type/usageDetails.py index 8c39334b03a..acba4e13c36 100644 --- a/ingestion/src/metadata/generated/schema/type/usageDetails.py +++ b/ingestion/src/metadata/generated/schema/type/usageDetails.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/usageDetails.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T06:21:38+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/ingestion/source/sql_source.py b/ingestion/src/metadata/ingestion/source/sql_source.py index d491dd36b50..2c50fc1c574 100644 --- a/ingestion/src/metadata/ingestion/source/sql_source.py +++ b/ingestion/src/metadata/ingestion/source/sql_source.py @@ -40,7 +40,7 @@ from metadata.ingestion.api.source import Source, SourceStatus from metadata.ingestion.models.table_metadata import DatasetProfile from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig from metadata.utils.helpers import get_database_service_or_create -from metadata.utils.dataprofiler import DataProfiler +from metadata.profiler.dataprofiler import DataProfiler logger: logging.Logger = logging.getLogger(__name__) @@ -75,6 +75,8 @@ class SQLConnectionConfig(ConfigModel): include_tables: Optional[bool] = True generate_sample_data: Optional[bool] = True data_profiler_enabled: Optional[bool] = True + data_profiler_offset: Optional[int] = 0 + data_profiler_limit: Optional[int] = 50000 filter_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all() @abstractmethod @@ -155,7 +157,15 @@ def get_column_type(status: SQLSourceStatus, dataset_name: str, column_type: Any return type_class - +def _get_table_description(schema: str, table: str, inspector: Inspector) -> str: + description = None + try: + table_info: dict = inspector.get_table_comment(table, schema) + except Exception as err: + logger.error(f"Table Description Error : {err}") + else: + description = table_info["text"] + return description class SQLSource(Source): @@ -167,8 +177,12 @@ class SQLSource(Source): self.service = get_database_service_or_create(config, metadata_config) self.status = SQLSourceStatus() self.sql_config = self.config - self.engine = create_engine(self.sql_config.get_connection_url(), **self.sql_config.options) + self.connection_string = self.sql_config.get_connection_url() + self.engine = create_engine(self.connection_string, **self.sql_config.options) self.connection = self.engine.connect() + if self.config.data_profiler_enabled: + self.data_profiler = DataProfiler(status=self.status, + connection_str=self.connection_string) def prepare(self): pass @@ -177,9 +191,6 @@ class SQLSource(Source): def create(cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext): pass - def _get_profiler_instance(self, inspector: Inspector) -> DataProfiler: - return DataProfiler(conn=inspector.bind, status=self.status) - def standardize_schema_table_names( self, schema: str, table: str ) -> Tuple[str, str]: @@ -187,7 +198,7 @@ class SQLSource(Source): def fetch_sample_data(self, schema: str, table: str): try: - query = self.config.query.format(schema,table) + query = self.config.query.format(schema, table) logger.info(query) results = self.connection.execute(query) cols = list(results.keys()) @@ -223,7 +234,7 @@ class SQLSource(Source): continue self.status.scanned('{}.{}'.format(self.config.get_service_name(), table_name)) - description = self._get_table_description(schema, table_name, inspector) + description = _get_table_description(schema, table_name, inspector) table_columns = self._get_columns(schema, table_name, inspector) table_entity = Table(id=uuid.uuid4(), @@ -236,8 +247,7 @@ class SQLSource(Source): table_entity.sampleData = table_data if self.config.data_profiler_enabled: - data_profiler = self._get_profiler_instance(inspector) - profile = self.run_data_profiler(data_profiler, table_name, schema) + profile = self.run_data_profiler(table_name, schema) logger.info(profile.json()) table_and_db = OMetaDatabaseAndTable(table=table_entity, database=self._get_database(schema)) @@ -262,7 +272,7 @@ class SQLSource(Source): except NotImplementedError: view_definition = "" - description = self._get_table_description(schema, view_name, inspector) + description = _get_table_description(schema, view_name, inspector) table_columns = self._get_columns(schema, view_name, inspector) table = Table(id=uuid.uuid4(), name=view_name, @@ -327,31 +337,21 @@ class SQLSource(Source): return table_columns - def _get_table_description(self, schema: str, table: str, inspector: Inspector) -> str: - description = None - try: - table_info: dict = inspector.get_table_comment(table, schema) - except Exception as err: - logger.error(f"Table Description Error : {err}") - else: - description = table_info["text"] - return description - def run_data_profiler( self, - profiler: DataProfiler, table: str, schema: str ) -> DatasetProfile: dataset_name = f"{schema}.{table}" self.status.scanned(f"profile of {dataset_name}") - logger.info(f"Profiling {dataset_name} (this may take a while)") - profile = profiler.generate_profile( - pretty_name=dataset_name, - schema=schema, - table=table, - limit=50000, - offset=0) + logger.info(f"Running Profiling for {dataset_name}. " + f"If you haven't configured offset and limit this process can take longer") + profile = self.data_profiler.run_profiler( + dataset_name=dataset_name, + schema=schema, + table=table, + limit=self.sql_config.data_profiler_limit, + offset=self.sql_config.data_profiler_offset) logger.debug(f"Finished profiling {dataset_name}") return profile diff --git a/ingestion/src/metadata/profiler/__init__.py b/ingestion/src/metadata/profiler/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ingestion/src/metadata/profiler/dataprofiler.py b/ingestion/src/metadata/profiler/dataprofiler.py new file mode 100644 index 00000000000..67843bc05de --- /dev/null +++ b/ingestion/src/metadata/profiler/dataprofiler.py @@ -0,0 +1,207 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +from typing import Any, Iterable, Optional + +from great_expectations.core.expectation_validation_result import ( + ExpectationSuiteValidationResult, + ExpectationValidationResult, +) +from great_expectations.data_context import BaseDataContext +from great_expectations.data_context.types.base import ( + DataContextConfig, + DatasourceConfig, + InMemoryStoreBackendDefaults, +) +from metadata.ingestion.api.source import SourceStatus +from metadata.ingestion.models.table_metadata import DatasetProfile, DatasetColumnProfile, ValueFrequency +from metadata.profiler.util import group_by + +logger: logging.Logger = logging.getLogger(__name__) + + +class DataProfiler: + data_context: BaseDataContext + status: SourceStatus + datasource_name: str = "om_data_source" + + def __init__(self, connection_str, status): + self.status = status + self.connection_str = connection_str + data_context_config = DataContextConfig( + datasources={ + self.datasource_name: DatasourceConfig( + class_name="SqlAlchemyDatasource", + credentials={ + "url": self.connection_str, + }, + ) + }, + store_backend_defaults=InMemoryStoreBackendDefaults(), + anonymous_usage_statistics={ + "enabled": False, + }, + ) + + self.data_context = BaseDataContext(project_config=data_context_config) + + def run_profiler( + self, + dataset_name: str, + schema: str = None, + table: str = None, + limit: int = None, + offset: int = None, + **kwargs: Any, + ) -> DatasetProfile: + profile_test_results = self._profile_data_asset( + { + "schema": schema, + "table": table, + "limit": limit, + "offset": offset, + **kwargs, + } + ) + profile = self._parse_test_results_to_table_profile(profile_test_results, dataset_name=dataset_name) + return profile + + def _profile_data_asset( + self, + batch_kwargs: dict + ) -> ExpectationSuiteValidationResult: + + profile_results = self.data_context.profile_data_asset( + self.datasource_name, + batch_kwargs={ + "datasource": self.datasource_name, + **batch_kwargs, + }, + ) + assert profile_results["success"] + + assert len(profile_results["results"]) == 1 + test_suite, test_results = profile_results["results"][0] + return test_results + + @staticmethod + def _get_column_from_result(result: ExpectationValidationResult) -> Optional[str]: + return result.expectation_config.kwargs.get("column") + + def _parse_test_results_to_table_profile( + self, profile_test_results: ExpectationSuiteValidationResult, dataset_name: str + ) -> DatasetProfile: + profile = None + column_profiles = [] + for col, col_test_result in group_by( + profile_test_results.results, key=self._get_column_from_result + ): + if col is None: + profile = self._parse_table_test_results(col_test_result, dataset_name=dataset_name) + else: + column_profile = self._parse_column_test_results(col, col_test_result, dataset_name=dataset_name) + column_profiles.append(column_profile) + + if profile is not None: + profile.col_profiles = column_profiles + return profile + + def _parse_table_test_results( + self, + table_test_results: Iterable[ExpectationValidationResult], + dataset_name: str, + ) -> DatasetProfile: + logger.info("generating table stats") + profile = DatasetProfile(timestamp=round(time.time() * 1000), table_name=dataset_name) + for table_result in table_test_results: + expectation: str = table_result.expectation_config.expectation_type + result: dict = table_result.result + if expectation == "expect_table_row_count_to_be_between": + profile.row_count = result["observed_value"] + elif expectation == "expect_table_columns_to_match_ordered_list": + profile.col_count = len(result["observed_value"]) + else: + self.status.warning( + f"profile of {dataset_name}", f"unknown table mapper {expectation}" + ) + return profile + + def _parse_column_test_results( + self, + column: str, + col_test_results: Iterable[ExpectationValidationResult], + dataset_name: str, + ) -> DatasetColumnProfile: + logger.info(f"Generating Column Stats for {column}") + column_profile = DatasetColumnProfile(fqdn=column) + for col_result in col_test_results: + expectation: str = col_result.expectation_config.expectation_type + result: dict = col_result.result + if not result: + self.status.warning( + f"profile of {dataset_name}", f"{expectation} did not yield any results" + ) + continue + + if expectation == "expect_column_unique_value_count_to_be_between": + column_profile.unique_count = result["observed_value"] + elif expectation == "expect_column_proportion_of_unique_values_to_be_between": + column_profile.unique_proportion = result["observed_value"] + elif expectation == "expect_column_values_to_not_be_null": + column_profile.null_count = result["unexpected_count"] + if ( + "unexpected_percent" in result + and result["unexpected_percent"] is not None + ): + column_profile.null_proportion = result["unexpected_percent"] / 100 + elif expectation == "expect_column_values_to_not_match_regex": + pass + elif expectation == "expect_column_mean_to_be_between": + column_profile.mean = str(result["observed_value"]) + elif expectation == "expect_column_min_to_be_between": + column_profile.min = str(result["observed_value"]) + elif expectation == "expect_column_max_to_be_between": + column_profile.max = str(result["observed_value"]) + elif expectation == "expect_column_median_to_be_between": + column_profile.median = str(result["observed_value"]) + elif expectation == "expect_column_stdev_to_be_between": + column_profile.stddev = str(result["observed_value"]) + elif expectation == "expect_column_quantile_values_to_be_between": + pass + elif expectation == "expect_column_values_to_be_in_set": + #column_profile.sample_values = [ + # str(v) for v in result["partial_unexpected_list"] + #] + pass + elif expectation == "expect_column_kl_divergence_to_be_less_than": + pass + elif expectation == "expect_column_distinct_values_to_be_in_set": + if "details" in result and "value_counts" in result["details"]: + column_profile.distinct_value_frequencies = [ + ValueFrequency(value=str(value), frequency=count) + for value, count in result["details"]["value_counts"].items() + ] + elif expectation == "expect_column_values_to_be_in_type_list": + pass + elif expectation == "expect_column_values_to_be_unique": + pass + else: + self.status.warning( + f"profile of {dataset_name}", + f"warning: unknown column mapper {expectation} in col {column}", + ) + return column_profile diff --git a/ingestion/src/metadata/profiler/util.py b/ingestion/src/metadata/profiler/util.py new file mode 100644 index 00000000000..f5494d0739d --- /dev/null +++ b/ingestion/src/metadata/profiler/util.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +from typing import TypeVar, Iterable, Callable, Tuple + +T = TypeVar("T") +K = TypeVar("K") + + +def group_by( + iterable: Iterable[T], key: Callable[[T], K] +) -> Iterable[Tuple[K, Iterable[T]]]: + + values = collections.defaultdict(list) + for v in iterable: + values[key(v)].append(v) + return values.items() \ No newline at end of file diff --git a/ingestion/src/metadata/utils/dataprofiler.py b/ingestion/src/metadata/utils/dataprofiler.py deleted file mode 100644 index 359598bd22c..00000000000 --- a/ingestion/src/metadata/utils/dataprofiler.py +++ /dev/null @@ -1,270 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import contextlib -import dataclasses -import logging -import unittest.mock -import time -from typing import Any, Iterable, Optional, Callable, Tuple, TypeVar - -from great_expectations.core.expectation_validation_result import ( - ExpectationSuiteValidationResult, - ExpectationValidationResult, -) -from great_expectations.data_context import BaseDataContext -from great_expectations.data_context.types.base import ( - DataContextConfig, - DatasourceConfig, - InMemoryStoreBackendDefaults, -) -from great_expectations.datasource.sqlalchemy_datasource import SqlAlchemyDatasource - -from metadata.ingestion.api.source import SourceStatus -from metadata.ingestion.models.table_metadata import DatasetProfile, DatasetColumnProfile, Quantile, \ - Histogram, ValueFrequency - -logger: logging.Logger = logging.getLogger(__name__) - -T = TypeVar("T") -K = TypeVar("K") - - -def groupby_unsorted( - iterable: Iterable[T], key: Callable[[T], K] -) -> Iterable[Tuple[K, Iterable[T]]]: - """The default itertools.groupby() requires that the iterable is already sorted by the key. - This method is similar to groupby() but without the pre-sorted requirement.""" - - values = collections.defaultdict(list) - for v in iterable: - values[key(v)].append(v) - return values.items() - - -@contextlib.contextmanager -def _properly_init_datasource(conn): - underlying_datasource_init = SqlAlchemyDatasource.__init__ - - def sqlalchemy_datasource_init( - self: SqlAlchemyDatasource, *args: Any, **kwargs: Any - ) -> None: - underlying_datasource_init(self, *args, **kwargs, engine=conn) - self.drivername = conn.dialect.name - del self._datasource_config["engine"] - - with unittest.mock.patch( - "great_expectations.datasource.sqlalchemy_datasource.SqlAlchemyDatasource.__init__", - sqlalchemy_datasource_init, - ), unittest.mock.patch( - "great_expectations.data_context.store.validations_store.ValidationsStore.set" - ): - yield - - -@dataclasses.dataclass -class DataProfiler: - data_context: BaseDataContext - status: SourceStatus - datasource_name: str = "om_sqlalchemy_datasource" - - def __init__(self, conn, status): - self.conn = conn - self.status = status - - data_context_config = DataContextConfig( - datasources={ - self.datasource_name: DatasourceConfig( - class_name="SqlAlchemyDatasource", - credentials={ - "url": self.conn.engine.url, - }, - ) - }, - store_backend_defaults=InMemoryStoreBackendDefaults(), - anonymous_usage_statistics={ - "enabled": False, - }, - ) - - with _properly_init_datasource(self.conn): - self.data_context = BaseDataContext(project_config=data_context_config) - - def generate_profile( - self, - pretty_name: str, - schema: str = None, - table: str = None, - limit: int = None, - offset: int = None, - **kwargs: Any, - ) -> DatasetProfile: - with _properly_init_datasource(self.conn): - evrs = self._profile_data_asset( - { - "schema": schema, - "table": table, - "limit": limit, - "offset": offset, - **kwargs, - }, - pretty_name=pretty_name, - ) - profile = self._convert_evrs_to_profile(evrs, pretty_name=pretty_name) - return profile - - def _profile_data_asset( - self, - batch_kwargs: dict, - pretty_name: str, - ) -> ExpectationSuiteValidationResult: - # Internally, this uses the GE dataset profiler: - # great_expectations.profile.basic_dataset_profiler.BasicDatasetProfiler - - profile_results = self.data_context.profile_data_asset( - self.datasource_name, - batch_kwargs={ - "datasource": self.datasource_name, - **batch_kwargs, - }, - ) - assert profile_results["success"] - - assert len(profile_results["results"]) == 1 - _suite, evrs = profile_results["results"][0] - return evrs - - @staticmethod - def _get_column_from_evr(evr: ExpectationValidationResult) -> Optional[str]: - return evr.expectation_config.kwargs.get("column") - - def _convert_evrs_to_profile( - self, evrs: ExpectationSuiteValidationResult, pretty_name: str - ) -> DatasetProfile: - profile = None - column_profiles = [] - for col, evrs_for_col in groupby_unsorted( - evrs.results, key=self._get_column_from_evr - ): - if col is None: - profile = self._handle_convert_table_evrs(evrs_for_col, pretty_name=pretty_name) - else: - column_profile = self._handle_convert_column_evrs(col, evrs_for_col, pretty_name=pretty_name) - column_profiles.append(column_profile) - - if profile is not None: - profile.col_profiles = column_profiles - return profile - - def _handle_convert_table_evrs( - self, - table_evrs: Iterable[ExpectationValidationResult], - pretty_name: str, - ) -> DatasetProfile: - logger.info("generating table stats") - profile = DatasetProfile(timestamp=round(time.time() * 1000), table_name=pretty_name) - for evr in table_evrs: - exp: str = evr.expectation_config.expectation_type - res: dict = evr.result - if exp == "expect_table_row_count_to_be_between": - profile.row_count = res["observed_value"] - elif exp == "expect_table_columns_to_match_ordered_list": - profile.col_count = len(res["observed_value"]) - else: - self.status.warning( - f"profile of {pretty_name}", f"unknown table mapper {exp}" - ) - return profile - - def _handle_convert_column_evrs( - self, - column: str, - col_evrs: Iterable[ExpectationValidationResult], - pretty_name: str, - ) -> DatasetColumnProfile: - logger.info(f"Generating Column Stats for {column}") - column_profile = DatasetColumnProfile(fqdn=column) - for evr in col_evrs: - exp: str = evr.expectation_config.expectation_type - res: dict = evr.result - if not res: - self.status.warning( - f"profile of {pretty_name}", f"{exp} did not yield any results" - ) - continue - - if exp == "expect_column_unique_value_count_to_be_between": - column_profile.unique_count = res["observed_value"] - elif exp == "expect_column_proportion_of_unique_values_to_be_between": - column_profile.unique_proportion = res["observed_value"] - elif exp == "expect_column_values_to_not_be_null": - column_profile.null_count = res["unexpected_count"] - if ( - "unexpected_percent" in res - and res["unexpected_percent"] is not None - ): - column_profile.null_proportion = res["unexpected_percent"] / 100 - elif exp == "expect_column_values_to_not_match_regex": - pass - elif exp == "expect_column_mean_to_be_between": - column_profile.mean = str(res["observed_value"]) - elif exp == "expect_column_min_to_be_between": - column_profile.min = str(res["observed_value"]) - elif exp == "expect_column_max_to_be_between": - column_profile.max = str(res["observed_value"]) - elif exp == "expect_column_median_to_be_between": - column_profile.median = str(res["observed_value"]) - elif exp == "expect_column_stdev_to_be_between": - column_profile.stddev = str(res["observed_value"]) - elif exp == "expect_column_quantile_values_to_be_between": - if "observed_value" in res: - column_profile.quantiles = [ - Quantile(quantile=str(quantile), value=str(value)) - for quantile, value in zip( - res["observed_value"]["quantiles"], - res["observed_value"]["values"], - ) - ] - elif exp == "expect_column_values_to_be_in_set": - column_profile.sample_values = [ - str(v) for v in res["partial_unexpected_list"] - ] - elif exp == "expect_column_kl_divergence_to_be_less_than": - if "details" in res and "observed_partition" in res["details"]: - partition = res["details"]["observed_partition"] - column_profile.histogram = Histogram( - [str(v) for v in partition["bins"]], - [ - partition["tail_weights"][0], - *partition["weights"], - partition["tail_weights"][1], - ], - ) - elif exp == "expect_column_distinct_values_to_be_in_set": - if "details" in res and "value_counts" in res["details"]: - column_profile.distinct_value_frequencies = [ - ValueFrequency(value=str(value), frequency=count) - for value, count in res["details"]["value_counts"].items() - ] - elif exp == "expect_column_values_to_be_in_type_list": - pass - elif exp == "expect_column_values_to_be_unique": - pass - else: - self.status.warning( - f"profile of {pretty_name}", - f"warning: unknown column mapper {exp} in col {column}", - ) - return column_profile From 4998d491676018cef78990c7e9bd1eb43cfcef13 Mon Sep 17 00:00:00 2001 From: Sriharsha Chintalapani Date: Thu, 9 Sep 2021 10:51:07 -0700 Subject: [PATCH 3/4] Fix #446: Add DataProfiler to ingestion and APIs --- .../catalog/jdbi3/TableRepository.java | 19 ++++-- .../resources/databases/TableResource.java | 6 +- .../json/schema/entity/data/table.json | 16 +++-- .../databases/TableResourceTest.java | 68 ++++++++++++++++--- .../metadata/generated/data/tags/__init__.py | 2 +- .../generated/data/tags/personalDataTags.py | 2 +- .../metadata/generated/data/tags/piiTags.py | 2 +- .../metadata/generated/data/tags/tierTags.py | 2 +- .../metadata/generated/data/tags/userTags.py | 2 +- .../metadata/generated/schema/api/__init__.py | 2 +- .../generated/schema/api/catalogVersion.py | 2 +- .../generated/schema/api/data/__init__.py | 2 +- .../generated/schema/api/data/createChart.py | 2 +- .../schema/api/data/createDashboard.py | 2 +- .../schema/api/data/createDatabase.py | 2 +- .../generated/schema/api/data/createTable.py | 2 +- .../generated/schema/api/data/createTopic.py | 2 +- .../generated/schema/api/feed/__init__.py | 2 +- .../generated/schema/api/feed/createThread.py | 2 +- .../generated/schema/api/services/__init__.py | 2 +- .../api/services/createDashboardService.py | 2 +- .../api/services/createDatabaseService.py | 2 +- .../api/services/createMessagingService.py | 2 +- .../api/services/updateDashboardService.py | 2 +- .../api/services/updateDatabaseService.py | 2 +- .../api/services/updateMessagingService.py | 2 +- .../metadata/generated/schema/api/setOwner.py | 2 +- .../generated/schema/api/tags/__init__.py | 2 +- .../generated/schema/api/tags/createTag.py | 2 +- .../schema/api/tags/createTagCategory.py | 2 +- .../generated/schema/api/teams/__init__.py | 2 +- .../generated/schema/api/teams/createTeam.py | 2 +- .../generated/schema/api/teams/createUser.py | 2 +- .../generated/schema/entity/__init__.py | 2 +- .../metadata/generated/schema/entity/bots.py | 2 +- .../generated/schema/entity/data/__init__.py | 2 +- .../generated/schema/entity/data/chart.py | 2 +- .../generated/schema/entity/data/dashboard.py | 2 +- .../generated/schema/entity/data/database.py | 2 +- .../generated/schema/entity/data/metrics.py | 2 +- .../generated/schema/entity/data/pipeline.py | 2 +- .../generated/schema/entity/data/report.py | 2 +- .../generated/schema/entity/data/table.py | 22 ++---- .../generated/schema/entity/data/topic.py | 2 +- .../generated/schema/entity/feed/__init__.py | 2 +- .../generated/schema/entity/feed/thread.py | 2 +- .../schema/entity/services/__init__.py | 2 +- .../entity/services/dashboardService.py | 2 +- .../schema/entity/services/databaseService.py | 2 +- .../entity/services/messagingService.py | 2 +- .../generated/schema/entity/tags/__init__.py | 2 +- .../schema/entity/tags/tagCategory.py | 2 +- .../generated/schema/entity/teams/__init__.py | 2 +- .../generated/schema/entity/teams/team.py | 2 +- .../generated/schema/entity/teams/user.py | 2 +- .../generated/schema/type/__init__.py | 2 +- .../generated/schema/type/auditLog.py | 2 +- .../metadata/generated/schema/type/basic.py | 2 +- .../schema/type/collectionDescriptor.py | 2 +- .../generated/schema/type/dailyCount.py | 2 +- .../generated/schema/type/entityReference.py | 2 +- .../generated/schema/type/entityUsage.py | 2 +- .../generated/schema/type/jdbcConnection.py | 2 +- .../metadata/generated/schema/type/paging.py | 2 +- .../metadata/generated/schema/type/profile.py | 2 +- .../generated/schema/type/schedule.py | 2 +- .../generated/schema/type/tagLabel.py | 2 +- .../generated/schema/type/usageDetails.py | 2 +- .../ingestion/ometa/openmetadata_rest.py | 13 ++-- .../metadata/ingestion/sink/metadata_rest.py | 8 ++- .../metadata/ingestion/source/sql_source.py | 7 +- .../src/metadata/profiler/dataprofiler.py | 35 +++++----- 72 files changed, 191 insertions(+), 129 deletions(-) diff --git a/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java b/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java index 223232a6c90..57d77d7b4f0 100644 --- a/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java +++ b/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java @@ -31,6 +31,7 @@ import org.openmetadata.catalog.resources.databases.TableResource; import org.openmetadata.catalog.resources.databases.TableResource.TableList; import org.openmetadata.catalog.type.Column; import org.openmetadata.catalog.type.ColumnJoin; +import org.openmetadata.catalog.type.ColumnProfile; import org.openmetadata.catalog.type.DailyCount; import org.openmetadata.catalog.type.EntityReference; import org.openmetadata.catalog.type.JoinedWith; @@ -275,19 +276,23 @@ public abstract class TableRepository { } @Transaction - public void addTableProfileData(String tableId, List tableProfiles) throws IOException { + public void addTableProfileData(String tableId, TableProfile tableProfile) throws IOException { // Validate the request content Table table = EntityUtil.validate(tableId, tableDAO().findById(tableId), Table.class); + List storedTableProfiles = getTableProfile(table); Map storedMapTableProfiles = new HashMap<>(); - for (TableProfile profile: storedTableProfiles) { - storedMapTableProfiles.put(profile.getProfileDate(), profile); + if (storedTableProfiles != null) { + for (TableProfile profile : storedTableProfiles) { + storedMapTableProfiles.put(profile.getProfileDate(), profile); + } } - - for (TableProfile profile: tableProfiles) { - storedMapTableProfiles.put(profile.getProfileDate(), profile); + //validate all the columns + for (ColumnProfile columnProfile: tableProfile.getColumnProfile()) { + validateColumn(table, columnProfile.getName()); } - List updatedProfiles = new ArrayList(storedMapTableProfiles.values()); + storedMapTableProfiles.put(tableProfile.getProfileDate(), tableProfile); + List updatedProfiles = new ArrayList<>(storedMapTableProfiles.values()); entityExtensionDAO().insert(tableId, "table.tableProfile", "tableProfile", JsonUtils.pojoToJson(updatedProfiles)); diff --git a/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java b/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java index fca74404e86..a7e3ddd38a1 100644 --- a/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java +++ b/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java @@ -117,7 +117,7 @@ public class TableResource { } static final String FIELDS = "columns,tableConstraints,usageSummary,owner," + - "database,tags,followers,joins,sampleData,viewDefinition"; + "database,tags,followers,joins,sampleData,viewDefinition,tableProfile"; public static final List FIELD_LIST = Arrays.asList(FIELDS.replaceAll(" ", "") .split(",")); @@ -357,10 +357,10 @@ public class TableResource { public Table addDataProfiler(@Context UriInfo uriInfo, @Context SecurityContext securityContext, @Parameter(description = "Id of the table", schema = @Schema(type = "string")) - @PathParam("id") String id, List tableProfiles) throws IOException, ParseException { + @PathParam("id") String id, TableProfile tableProfile) throws IOException, ParseException { SecurityUtil.checkAdminOrBotRole(authorizer, securityContext); Fields fields = new Fields(FIELD_LIST, "tableProfile"); - dao.addTableProfileData(id, tableProfiles); + dao.addTableProfileData(id, tableProfile); Table table = dao.get(id, fields); return addHref(uriInfo, table); } diff --git a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json index b021d385345..d8a09867a95 100644 --- a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json +++ b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json @@ -267,16 +267,20 @@ "type": "number" }, "min": { - "description": "Minimum value in a column. Applies to column with number values. It will be 0 for any other types", - "type": "number" + "description": "Minimum value in a column.", + "type": "string" }, "max": { - "description": "Maximum value in a column. Applies to column with number values. It will be 0 for any other types", - "type": "number" + "description": "Maximum value in a column.", + "type": "string" }, "mean": { - "description": "Avg value in a column. Applies to column with number values. It will be 0 for any other types", - "type": "number" + "description": "Avg value in a column.", + "type": "string" + }, + "median": { + "description": "Median value in a column.", + "type": "string" }, "stddev": { "description": "Standard deviation of a column.", diff --git a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java index 962422dc813..b7908aa3ab5 100644 --- a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java +++ b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java @@ -44,12 +44,14 @@ import org.openmetadata.catalog.resources.teams.UserResourceTest; import org.openmetadata.catalog.type.Column; import org.openmetadata.catalog.type.ColumnDataType; import org.openmetadata.catalog.type.ColumnJoin; +import org.openmetadata.catalog.type.ColumnProfile; import org.openmetadata.catalog.type.EntityReference; import org.openmetadata.catalog.type.JoinedWith; import org.openmetadata.catalog.type.TableConstraint; import org.openmetadata.catalog.type.TableConstraint.ConstraintType; import org.openmetadata.catalog.type.TableData; import org.openmetadata.catalog.type.TableJoins; +import org.openmetadata.catalog.type.TableProfile; import org.openmetadata.catalog.type.TableType; import org.openmetadata.catalog.type.TagLabel; import org.openmetadata.catalog.util.EntityUtil; @@ -614,19 +616,49 @@ public class TableResourceTest extends CatalogApplicationTest { public void put_tableProfile_200(TestInfo test) throws HttpResponseException { Table table = createAndCheckTable(create(test), adminAuthHeaders()); List columns = Arrays.asList("c1", "c2", "c3"); + ColumnProfile c1Profile = new ColumnProfile().withName("c1").withMax("100.0").withMin("10.0").withUniqueCount(100.0); + ColumnProfile c2Profile = new ColumnProfile().withName("c2").withMax("99.0").withMin("20.0").withUniqueCount(89.0); + ColumnProfile c3Profile = new ColumnProfile().withName("c3").withMax("75.0").withMin("25.0").withUniqueCount(77.0); + // Add column profiles + List columnProfiles = List.of(c1Profile, c2Profile, c3Profile); + TableProfile tableProfile = new TableProfile().withRowCount(6.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-09"); + putTableProfileData(table.getId(), tableProfile, adminAuthHeaders()); - // Add 3 rows of sample data for 3 columns - List> rows = Arrays.asList(Arrays.asList("c1Value1", 1, true), - Arrays.asList("c1Value2", null, false), - Arrays.asList("c1Value3", 3, true)); + table = getTable(table.getId(), "tableProfile", adminAuthHeaders()); + verifyTableProfileData(table.getTableProfile(), List.of(tableProfile)); - TableData tableData = new TableData().withColumns(columns).withRows(rows); - putSampleData(table.getId(), tableData, adminAuthHeaders()); + // Add new date for TableProfile + TableProfile newTableProfile = new TableProfile().withRowCount(7.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-08"); + putTableProfileData(table.getId(), newTableProfile, adminAuthHeaders()); + table = getTable(table.getId(), "tableProfile", adminAuthHeaders()); + verifyTableProfileData(table.getTableProfile(), List.of(newTableProfile, tableProfile)); - table = getTable(table.getId(), "sampleData", adminAuthHeaders()); - assertEquals(tableData, table.getSampleData()); + // Replace table profile for a date + TableProfile newTableProfile1 = new TableProfile().withRowCount(21.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-08"); + putTableProfileData(table.getId(), newTableProfile1, adminAuthHeaders()); + table = getTable(table.getId(), "tableProfile", adminAuthHeaders()); + verifyTableProfileData(table.getTableProfile(), List.of(newTableProfile1, tableProfile)); } + @Test + public void put_tableInvalidTableProfileData_4xx(TestInfo test) throws HttpResponseException { + Table table = createAndCheckTable(create(test), adminAuthHeaders()); + + ColumnProfile c1Profile = new ColumnProfile().withName("c1").withMax("100").withMin("10.0") + .withUniqueCount(100.0); + ColumnProfile c2Profile = new ColumnProfile().withName("c2").withMax("99.0").withMin("20.0").withUniqueCount(89.0); + ColumnProfile c3Profile = new ColumnProfile().withName("invalidColumn").withMax("75") + .withMin("25").withUniqueCount(77.0); + List columnProfiles = List.of(c1Profile, c2Profile, c3Profile); + TableProfile tableProfile = new TableProfile().withRowCount(6.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-09"); + HttpResponseException exception = assertThrows(HttpResponseException.class, () + -> putTableProfileData(table.getId(), tableProfile, adminAuthHeaders())); + TestUtils.assertResponseContains(exception, BAD_REQUEST, "Invalid column name invalidColumn"); + } @Test public void get_nonExistentTable_404_notFound() { @@ -1243,6 +1275,13 @@ public class TableResourceTest extends CatalogApplicationTest { TestUtils.put(target, data, OK, authHeaders); } + public static void putTableProfileData(UUID tableId, TableProfile data, Map authHeaders) + throws HttpResponseException { + WebTarget target = CatalogApplicationTest.getResource("tables/" + tableId + "/tableProfile"); + TestUtils.put(target, data, OK, authHeaders); + } + + private void deleteTable(UUID id, Map authHeaders) throws HttpResponseException { TestUtils.delete(CatalogApplicationTest.getResource("tables/" + id), authHeaders); @@ -1373,4 +1412,17 @@ public class TableResourceTest extends CatalogApplicationTest { public static String getTableName(TestInfo test, int index) { return String.format("table%d_%s", index, test.getDisplayName()); } + + private void verifyTableProfileData(List actualProfiles, List expectedProfiles) { + assertEquals(actualProfiles.size(), expectedProfiles.size()); + Map tableProfileMap = new HashMap<>(); + for(TableProfile profile: actualProfiles) { + tableProfileMap.put(profile.getProfileDate(), profile); + } + for(TableProfile tableProfile: expectedProfiles) { + TableProfile storedProfile = tableProfileMap.get(tableProfile.getProfileDate()); + assertNotNull(storedProfile); + assertEquals(tableProfile, storedProfile); + } + } } diff --git a/ingestion/src/metadata/generated/data/tags/__init__.py b/ingestion/src/metadata/generated/data/tags/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/data/tags/__init__.py +++ b/ingestion/src/metadata/generated/data/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/data/tags/personalDataTags.py b/ingestion/src/metadata/generated/data/tags/personalDataTags.py index 9ad3620a01e..7454a3e2db6 100644 --- a/ingestion/src/metadata/generated/data/tags/personalDataTags.py +++ b/ingestion/src/metadata/generated/data/tags/personalDataTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/personalDataTags.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/data/tags/piiTags.py b/ingestion/src/metadata/generated/data/tags/piiTags.py index 3c92d23b2b7..c2c4c403ee5 100644 --- a/ingestion/src/metadata/generated/data/tags/piiTags.py +++ b/ingestion/src/metadata/generated/data/tags/piiTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/piiTags.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/data/tags/tierTags.py b/ingestion/src/metadata/generated/data/tags/tierTags.py index 5558bd819f7..fa121533465 100644 --- a/ingestion/src/metadata/generated/data/tags/tierTags.py +++ b/ingestion/src/metadata/generated/data/tags/tierTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/tierTags.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/data/tags/userTags.py b/ingestion/src/metadata/generated/data/tags/userTags.py index 3e574d93006..de5de310349 100644 --- a/ingestion/src/metadata/generated/data/tags/userTags.py +++ b/ingestion/src/metadata/generated/data/tags/userTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/userTags.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/__init__.py b/ingestion/src/metadata/generated/schema/api/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/catalogVersion.py b/ingestion/src/metadata/generated/schema/api/catalogVersion.py index 1e7c3c03b9b..809d24f0949 100644 --- a/ingestion/src/metadata/generated/schema/api/catalogVersion.py +++ b/ingestion/src/metadata/generated/schema/api/catalogVersion.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/catalogVersion.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/__init__.py b/ingestion/src/metadata/generated/schema/api/data/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/data/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/data/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/data/createChart.py b/ingestion/src/metadata/generated/schema/api/data/createChart.py index 3d01f973e93..c849b35915f 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createChart.py +++ b/ingestion/src/metadata/generated/schema/api/data/createChart.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createChart.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createDashboard.py b/ingestion/src/metadata/generated/schema/api/data/createDashboard.py index 5089a87df7d..b0542b28890 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createDashboard.py +++ b/ingestion/src/metadata/generated/schema/api/data/createDashboard.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createDashboard.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createDatabase.py b/ingestion/src/metadata/generated/schema/api/data/createDatabase.py index 4d029474b1f..981bd22ee35 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createDatabase.py +++ b/ingestion/src/metadata/generated/schema/api/data/createDatabase.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createDatabase.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createTable.py b/ingestion/src/metadata/generated/schema/api/data/createTable.py index bde335327e5..9ef094897ce 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createTable.py +++ b/ingestion/src/metadata/generated/schema/api/data/createTable.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createTable.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createTopic.py b/ingestion/src/metadata/generated/schema/api/data/createTopic.py index ae8e49878ba..7b990410669 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createTopic.py +++ b/ingestion/src/metadata/generated/schema/api/data/createTopic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createTopic.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/feed/__init__.py b/ingestion/src/metadata/generated/schema/api/feed/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/feed/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/feed/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/feed/createThread.py b/ingestion/src/metadata/generated/schema/api/feed/createThread.py index 54c19c6bfc9..85475220bc4 100644 --- a/ingestion/src/metadata/generated/schema/api/feed/createThread.py +++ b/ingestion/src/metadata/generated/schema/api/feed/createThread.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/feed/createThread.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/__init__.py b/ingestion/src/metadata/generated/schema/api/services/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/services/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/services/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py b/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py index 375beecf1b9..9cc975de60e 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createDashboardService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py b/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py index c364749f21a..a579a98d964 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createDatabaseService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py b/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py index b5831d4f7a6..24779c4977b 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createMessagingService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py b/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py index e5096c6234e..c2ebd7fee81 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateDashboardService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py b/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py index 85d44d46ab8..24cdec50e51 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateDatabaseService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py b/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py index d3aa0d22082..b0a41c638f3 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateMessagingService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/setOwner.py b/ingestion/src/metadata/generated/schema/api/setOwner.py index 4b98e3da2ad..a329b72174c 100644 --- a/ingestion/src/metadata/generated/schema/api/setOwner.py +++ b/ingestion/src/metadata/generated/schema/api/setOwner.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/setOwner.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/tags/__init__.py b/ingestion/src/metadata/generated/schema/api/tags/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/tags/createTag.py b/ingestion/src/metadata/generated/schema/api/tags/createTag.py index e1cf6c6148b..31a4eaa6f45 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/createTag.py +++ b/ingestion/src/metadata/generated/schema/api/tags/createTag.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/tags/createTag.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py b/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py index 3fcde21259d..dcb6aaf9c86 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py +++ b/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/tags/createTagCategory.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/teams/__init__.py b/ingestion/src/metadata/generated/schema/api/teams/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/teams/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/teams/createTeam.py b/ingestion/src/metadata/generated/schema/api/teams/createTeam.py index 5095eddee3e..1ed8173451e 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/createTeam.py +++ b/ingestion/src/metadata/generated/schema/api/teams/createTeam.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/teams/createTeam.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/teams/createUser.py b/ingestion/src/metadata/generated/schema/api/teams/createUser.py index 106675a3c82..07ef77981d3 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/createUser.py +++ b/ingestion/src/metadata/generated/schema/api/teams/createUser.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/teams/createUser.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/__init__.py b/ingestion/src/metadata/generated/schema/entity/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/bots.py b/ingestion/src/metadata/generated/schema/entity/bots.py index 106b6bb98fd..ed970070549 100644 --- a/ingestion/src/metadata/generated/schema/entity/bots.py +++ b/ingestion/src/metadata/generated/schema/entity/bots.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/bots.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/__init__.py b/ingestion/src/metadata/generated/schema/entity/data/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/data/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/data/chart.py b/ingestion/src/metadata/generated/schema/entity/data/chart.py index 37be9ddb313..82dcf229ffc 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/chart.py +++ b/ingestion/src/metadata/generated/schema/entity/data/chart.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/chart.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/dashboard.py b/ingestion/src/metadata/generated/schema/entity/data/dashboard.py index bf10e03d931..7035fc3b34f 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/dashboard.py +++ b/ingestion/src/metadata/generated/schema/entity/data/dashboard.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/dashboard.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/database.py b/ingestion/src/metadata/generated/schema/entity/data/database.py index 50a7a7771ad..9e072387987 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/database.py +++ b/ingestion/src/metadata/generated/schema/entity/data/database.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/database.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/metrics.py b/ingestion/src/metadata/generated/schema/entity/data/metrics.py index fbcd44878ab..a3ef45f8fb0 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/metrics.py +++ b/ingestion/src/metadata/generated/schema/entity/data/metrics.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/metrics.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/pipeline.py b/ingestion/src/metadata/generated/schema/entity/data/pipeline.py index 90a1436c900..a802334f293 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/pipeline.py +++ b/ingestion/src/metadata/generated/schema/entity/data/pipeline.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/pipeline.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/report.py b/ingestion/src/metadata/generated/schema/entity/data/report.py index 8e2edcc8362..b50e1302922 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/report.py +++ b/ingestion/src/metadata/generated/schema/entity/data/report.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/report.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/table.py b/ingestion/src/metadata/generated/schema/entity/data/table.py index d17f2707ca4..a8ee201bf8d 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/table.py +++ b/ingestion/src/metadata/generated/schema/entity/data/table.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/table.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations @@ -138,18 +138,10 @@ class ColumnProfile(BaseModel): nullProportion: Optional[float] = Field( None, description='No.of null value proportion in columns' ) - min: Optional[float] = Field( - None, - description='Minimum value in a column. Applies to column with number values. It will be 0 for any other types', - ) - max: Optional[float] = Field( - None, - description='Maximum value in a column. Applies to column with number values. It will be 0 for any other types', - ) - mean: Optional[float] = Field( - None, - description='Avg value in a column. Applies to column with number values. It will be 0 for any other types', - ) + min: Optional[str] = Field(None, description='Minimum value in a column.') + max: Optional[str] = Field(None, description='Maximum value in a column.') + mean: Optional[str] = Field(None, description='Avg value in a column.') + median: Optional[str] = Field(None, description='Median value in a column.') stddev: Optional[float] = Field(None, description='Standard deviation of a column.') @@ -175,7 +167,7 @@ class TableProfile(BaseModel): None, description='No.of columns in the table.' ) rowCount: Optional[float] = Field(None, description='No.of rows in the table.') - column_profile: Optional[List[ColumnProfile]] = Field( + columnProfile: Optional[List[ColumnProfile]] = Field( None, description='List of local column profiles of the table.' ) @@ -239,6 +231,6 @@ class Table(BaseModel): sampleData: Optional[TableData] = Field( None, description='Sample data for a table.' ) - dataProfile: Optional[TableProfile] = Field( + tableProfile: Optional[List[TableProfile]] = Field( None, description='Data profile for a table.' ) diff --git a/ingestion/src/metadata/generated/schema/entity/data/topic.py b/ingestion/src/metadata/generated/schema/entity/data/topic.py index 527bcef2a19..790212905f5 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/topic.py +++ b/ingestion/src/metadata/generated/schema/entity/data/topic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/topic.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/feed/__init__.py b/ingestion/src/metadata/generated/schema/entity/feed/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/feed/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/feed/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/feed/thread.py b/ingestion/src/metadata/generated/schema/entity/feed/thread.py index cc853bd5ed0..c75c26f202f 100644 --- a/ingestion/src/metadata/generated/schema/entity/feed/thread.py +++ b/ingestion/src/metadata/generated/schema/entity/feed/thread.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/feed/thread.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/__init__.py b/ingestion/src/metadata/generated/schema/entity/services/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/services/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py b/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py index b73c0cb1b67..55a3f7d3621 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/dashboardService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/databaseService.py b/ingestion/src/metadata/generated/schema/entity/services/databaseService.py index ce59c700905..1dd6327ad31 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/databaseService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/databaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/databaseService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/messagingService.py b/ingestion/src/metadata/generated/schema/entity/services/messagingService.py index e257a35d44f..171f726fe89 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/messagingService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/messagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/messagingService.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/tags/__init__.py b/ingestion/src/metadata/generated/schema/entity/tags/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/tags/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py b/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py index 5e717673761..9f8e1aef884 100644 --- a/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py +++ b/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/tags/tagCategory.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/teams/__init__.py b/ingestion/src/metadata/generated/schema/entity/teams/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/teams/team.py b/ingestion/src/metadata/generated/schema/entity/teams/team.py index fec1581c10f..b357f675af5 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/team.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/team.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/teams/team.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/teams/user.py b/ingestion/src/metadata/generated/schema/entity/teams/user.py index ae60bd0dcce..389515cf16d 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/user.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/user.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/teams/user.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/__init__.py b/ingestion/src/metadata/generated/schema/type/__init__.py index 0f1c77f35c0..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/type/__init__.py +++ b/ingestion/src/metadata/generated/schema/type/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/type/auditLog.py b/ingestion/src/metadata/generated/schema/type/auditLog.py index 2d665b6292c..61533d0462e 100644 --- a/ingestion/src/metadata/generated/schema/type/auditLog.py +++ b/ingestion/src/metadata/generated/schema/type/auditLog.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/auditLog.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/basic.py b/ingestion/src/metadata/generated/schema/type/basic.py index dd852384b89..fefc9d382c6 100644 --- a/ingestion/src/metadata/generated/schema/type/basic.py +++ b/ingestion/src/metadata/generated/schema/type/basic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/basic.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py b/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py index 5c7d19d512f..a000a642c0c 100644 --- a/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py +++ b/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/collectionDescriptor.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/dailyCount.py b/ingestion/src/metadata/generated/schema/type/dailyCount.py index 2b59d88b82a..7fc3d4b2f43 100644 --- a/ingestion/src/metadata/generated/schema/type/dailyCount.py +++ b/ingestion/src/metadata/generated/schema/type/dailyCount.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/dailyCount.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/entityReference.py b/ingestion/src/metadata/generated/schema/type/entityReference.py index f85aeab87d7..ae11cf76a96 100644 --- a/ingestion/src/metadata/generated/schema/type/entityReference.py +++ b/ingestion/src/metadata/generated/schema/type/entityReference.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/entityReference.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/entityUsage.py b/ingestion/src/metadata/generated/schema/type/entityUsage.py index e1d03c36ea6..d38ca958d31 100644 --- a/ingestion/src/metadata/generated/schema/type/entityUsage.py +++ b/ingestion/src/metadata/generated/schema/type/entityUsage.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/entityUsage.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/jdbcConnection.py b/ingestion/src/metadata/generated/schema/type/jdbcConnection.py index e32a15b5cd1..74818d079c1 100644 --- a/ingestion/src/metadata/generated/schema/type/jdbcConnection.py +++ b/ingestion/src/metadata/generated/schema/type/jdbcConnection.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/jdbcConnection.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/paging.py b/ingestion/src/metadata/generated/schema/type/paging.py index a35b5b7cccd..688a3b492dd 100644 --- a/ingestion/src/metadata/generated/schema/type/paging.py +++ b/ingestion/src/metadata/generated/schema/type/paging.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/paging.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/profile.py b/ingestion/src/metadata/generated/schema/type/profile.py index f39746e7809..c1e83abf930 100644 --- a/ingestion/src/metadata/generated/schema/type/profile.py +++ b/ingestion/src/metadata/generated/schema/type/profile.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/profile.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/schedule.py b/ingestion/src/metadata/generated/schema/type/schedule.py index d51c9f38420..430cb05b528 100644 --- a/ingestion/src/metadata/generated/schema/type/schedule.py +++ b/ingestion/src/metadata/generated/schema/type/schedule.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/schedule.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/tagLabel.py b/ingestion/src/metadata/generated/schema/type/tagLabel.py index b0cde5f16f6..977e161d8f1 100644 --- a/ingestion/src/metadata/generated/schema/type/tagLabel.py +++ b/ingestion/src/metadata/generated/schema/type/tagLabel.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/tagLabel.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/usageDetails.py b/ingestion/src/metadata/generated/schema/type/usageDetails.py index acba4e13c36..02369ba6eb2 100644 --- a/ingestion/src/metadata/generated/schema/type/usageDetails.py +++ b/ingestion/src/metadata/generated/schema/type/usageDetails.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/usageDetails.json -# timestamp: 2021-09-09T06:21:38+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py b/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py index a73d5e3b05a..045b69f78d8 100644 --- a/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py +++ b/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py @@ -28,7 +28,7 @@ from metadata.generated.schema.api.services.createMessagingService import Create from metadata.generated.schema.entity.data.chart import Chart from metadata.generated.schema.entity.data.dashboard import Dashboard from metadata.generated.schema.entity.data.database import Database -from metadata.generated.schema.entity.data.table import Table, TableData, TableJoins +from metadata.generated.schema.entity.data.table import Table, TableData, TableJoins, TableProfile from metadata.generated.schema.entity.data.topic import Topic from metadata.generated.schema.entity.services.dashboardService import DashboardService from metadata.generated.schema.entity.services.databaseService import DatabaseService @@ -56,7 +56,7 @@ TableEntities = List[Table] Tags = List[Tag] Topics = List[Topic] Dashboards = List[Dashboard] - +TableProfiles = List[TableProfile] class MetadataServerConfig(ConfigModel): api_endpoint: str @@ -241,10 +241,15 @@ class OpenMetadataAPIClient(object): else: return [Table(**t) for t in resp['data']] - def ingest_sample_data(self, id, sample_data): - resp = self.client.put('/tables/{}/sampleData'.format(id.__root__), data=sample_data.json()) + def ingest_sample_data(self, table_id, sample_data): + resp = self.client.put('/tables/{}/sampleData'.format(table_id.__root__), data=sample_data.json()) return TableData(**resp['sampleData']) + def ingest_table_profile_data(self, table_id, table_profile): + print(table_profile.json()) + resp = self.client.put('/tables/{}/tableProfile'.format(table_id.__root__), data=table_profile.json()) + return [TableProfile(**t) for t in resp['tableProfile']] + def get_table_by_id(self, table_id: str, fields: [] = ['columns']) -> Table: """Get Table By ID""" params = {'fields': ",".join(fields)} diff --git a/ingestion/src/metadata/ingestion/sink/metadata_rest.py b/ingestion/src/metadata/ingestion/sink/metadata_rest.py index e1a25445d7f..45c7aea1f08 100644 --- a/ingestion/src/metadata/ingestion/sink/metadata_rest.py +++ b/ingestion/src/metadata/ingestion/sink/metadata_rest.py @@ -14,6 +14,7 @@ # limitations under the License. import logging +from typing import List from pydantic import ValidationError @@ -30,7 +31,7 @@ from metadata.ingestion.api.sink import Sink, SinkStatus from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable from metadata.ingestion.models.table_metadata import Chart, Dashboard from metadata.ingestion.ometa.client import APIError -from metadata.ingestion.ometa.openmetadata_rest import OpenMetadataAPIClient, MetadataServerConfig +from metadata.ingestion.ometa.openmetadata_rest import OpenMetadataAPIClient, MetadataServerConfig, TableProfiles logger = logging.getLogger(__name__) @@ -102,7 +103,10 @@ class MetadataRestSink(Sink): created_table = self.client.create_or_update_table(table_request) if table_and_db.table.sampleData is not None: - self.client.ingest_sample_data(id=created_table.id, sample_data=table_and_db.table.sampleData) + self.client.ingest_sample_data(table_id=created_table.id, sample_data=table_and_db.table.sampleData) + if table_and_db.table.tableProfile is not None: + self.client.ingest_table_profile_data(table_id=created_table.id, + table_profile=table_and_db.table.tableProfile) logger.info( 'Successfully ingested table {}.{}'. diff --git a/ingestion/src/metadata/ingestion/source/sql_source.py b/ingestion/src/metadata/ingestion/source/sql_source.py index 2c50fc1c574..5feaa9198d5 100644 --- a/ingestion/src/metadata/ingestion/source/sql_source.py +++ b/ingestion/src/metadata/ingestion/source/sql_source.py @@ -28,7 +28,8 @@ from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.entity.data.database import Database -from metadata.generated.schema.entity.data.table import Table, Column, ColumnConstraint, TableType, TableData +from metadata.generated.schema.entity.data.table import Table, Column, ColumnConstraint, TableType, TableData, \ + TableProfile from sqlalchemy import create_engine from sqlalchemy.engine.reflection import Inspector from sqlalchemy.sql import sqltypes as types @@ -248,7 +249,7 @@ class SQLSource(Source): if self.config.data_profiler_enabled: profile = self.run_data_profiler(table_name, schema) - logger.info(profile.json()) + table_entity.tableProfile = profile table_and_db = OMetaDatabaseAndTable(table=table_entity, database=self._get_database(schema)) yield table_and_db @@ -341,7 +342,7 @@ class SQLSource(Source): self, table: str, schema: str - ) -> DatasetProfile: + ) -> TableProfile: dataset_name = f"{schema}.{table}" self.status.scanned(f"profile of {dataset_name}") logger.info(f"Running Profiling for {dataset_name}. " diff --git a/ingestion/src/metadata/profiler/dataprofiler.py b/ingestion/src/metadata/profiler/dataprofiler.py index 67843bc05de..9cce386b864 100644 --- a/ingestion/src/metadata/profiler/dataprofiler.py +++ b/ingestion/src/metadata/profiler/dataprofiler.py @@ -15,6 +15,7 @@ import logging import time +from datetime import datetime from typing import Any, Iterable, Optional from great_expectations.core.expectation_validation_result import ( @@ -27,6 +28,8 @@ from great_expectations.data_context.types.base import ( DatasourceConfig, InMemoryStoreBackendDefaults, ) + +from metadata.generated.schema.entity.data.table import TableProfile, ColumnProfile from metadata.ingestion.api.source import SourceStatus from metadata.ingestion.models.table_metadata import DatasetProfile, DatasetColumnProfile, ValueFrequency from metadata.profiler.util import group_by @@ -67,7 +70,7 @@ class DataProfiler: limit: int = None, offset: int = None, **kwargs: Any, - ) -> DatasetProfile: + ) -> TableProfile: profile_test_results = self._profile_data_asset( { "schema": schema, @@ -104,7 +107,7 @@ class DataProfiler: def _parse_test_results_to_table_profile( self, profile_test_results: ExpectationSuiteValidationResult, dataset_name: str - ) -> DatasetProfile: + ) -> TableProfile: profile = None column_profiles = [] for col, col_test_result in group_by( @@ -117,23 +120,23 @@ class DataProfiler: column_profiles.append(column_profile) if profile is not None: - profile.col_profiles = column_profiles + profile.columnProfile = column_profiles return profile def _parse_table_test_results( self, table_test_results: Iterable[ExpectationValidationResult], dataset_name: str, - ) -> DatasetProfile: + ) -> TableProfile: logger.info("generating table stats") - profile = DatasetProfile(timestamp=round(time.time() * 1000), table_name=dataset_name) + profile = TableProfile(profileDate=datetime.now(). strftime("%Y-%m-%d")) for table_result in table_test_results: expectation: str = table_result.expectation_config.expectation_type result: dict = table_result.result if expectation == "expect_table_row_count_to_be_between": - profile.row_count = result["observed_value"] + profile.rowCount = result['observed_value'] elif expectation == "expect_table_columns_to_match_ordered_list": - profile.col_count = len(result["observed_value"]) + profile.columnCount = len(result["observed_value"]) else: self.status.warning( f"profile of {dataset_name}", f"unknown table mapper {expectation}" @@ -145,9 +148,9 @@ class DataProfiler: column: str, col_test_results: Iterable[ExpectationValidationResult], dataset_name: str, - ) -> DatasetColumnProfile: + ) -> ColumnProfile: logger.info(f"Generating Column Stats for {column}") - column_profile = DatasetColumnProfile(fqdn=column) + column_profile = ColumnProfile(name=column) for col_result in col_test_results: expectation: str = col_result.expectation_config.expectation_type result: dict = col_result.result @@ -158,16 +161,16 @@ class DataProfiler: continue if expectation == "expect_column_unique_value_count_to_be_between": - column_profile.unique_count = result["observed_value"] + column_profile.uniqueCount = result["observed_value"] elif expectation == "expect_column_proportion_of_unique_values_to_be_between": - column_profile.unique_proportion = result["observed_value"] + column_profile.uniqueProportion = result["observed_value"] elif expectation == "expect_column_values_to_not_be_null": - column_profile.null_count = result["unexpected_count"] + column_profile.nullCount = result["unexpected_count"] if ( "unexpected_percent" in result and result["unexpected_percent"] is not None ): - column_profile.null_proportion = result["unexpected_percent"] / 100 + column_profile.nullProportion = result["unexpected_percent"] / 100 elif expectation == "expect_column_values_to_not_match_regex": pass elif expectation == "expect_column_mean_to_be_between": @@ -190,11 +193,7 @@ class DataProfiler: elif expectation == "expect_column_kl_divergence_to_be_less_than": pass elif expectation == "expect_column_distinct_values_to_be_in_set": - if "details" in result and "value_counts" in result["details"]: - column_profile.distinct_value_frequencies = [ - ValueFrequency(value=str(value), frequency=count) - for value, count in result["details"]["value_counts"].items() - ] + pass elif expectation == "expect_column_values_to_be_in_type_list": pass elif expectation == "expect_column_values_to_be_unique": From f31651b60bc7d39a81206a13c11fa66009f127fa Mon Sep 17 00:00:00 2001 From: Sriharsha Chintalapani Date: Thu, 9 Sep 2021 10:53:15 -0700 Subject: [PATCH 4/4] Fix #446: Add DataProfiler to ingestion and APIs --- .../catalog/resources/databases/TableResourceTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java index b7908aa3ab5..246e73846ac 100644 --- a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java +++ b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java @@ -616,7 +616,8 @@ public class TableResourceTest extends CatalogApplicationTest { public void put_tableProfile_200(TestInfo test) throws HttpResponseException { Table table = createAndCheckTable(create(test), adminAuthHeaders()); List columns = Arrays.asList("c1", "c2", "c3"); - ColumnProfile c1Profile = new ColumnProfile().withName("c1").withMax("100.0").withMin("10.0").withUniqueCount(100.0); + ColumnProfile c1Profile = new ColumnProfile().withName("c1").withMax("100.0") + .withMin("10.0").withUniqueCount(100.0); ColumnProfile c2Profile = new ColumnProfile().withName("c2").withMax("99.0").withMin("20.0").withUniqueCount(89.0); ColumnProfile c3Profile = new ColumnProfile().withName("c3").withMax("75.0").withMin("25.0").withUniqueCount(77.0); // Add column profiles