diff --git a/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java b/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java index e527805d8a9..57d77d7b4f0 100644 --- a/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java +++ b/catalog-rest-service/src/main/java/org/openmetadata/catalog/jdbi3/TableRepository.java @@ -31,11 +31,13 @@ import org.openmetadata.catalog.resources.databases.TableResource; import org.openmetadata.catalog.resources.databases.TableResource.TableList; import org.openmetadata.catalog.type.Column; import org.openmetadata.catalog.type.ColumnJoin; +import org.openmetadata.catalog.type.ColumnProfile; import org.openmetadata.catalog.type.DailyCount; import org.openmetadata.catalog.type.EntityReference; import org.openmetadata.catalog.type.JoinedWith; import org.openmetadata.catalog.type.TableData; import org.openmetadata.catalog.type.TableJoins; +import org.openmetadata.catalog.type.TableProfile; import org.openmetadata.catalog.type.TagLabel; import org.openmetadata.catalog.util.EntityUtil; import org.openmetadata.catalog.util.EntityUtil.Fields; @@ -273,6 +275,29 @@ public abstract class TableRepository { JsonUtils.pojoToJson(tableData)); } + @Transaction + public void addTableProfileData(String tableId, TableProfile tableProfile) throws IOException { + // Validate the request content + Table table = EntityUtil.validate(tableId, tableDAO().findById(tableId), Table.class); + + List storedTableProfiles = getTableProfile(table); + Map storedMapTableProfiles = new HashMap<>(); + if (storedTableProfiles != null) { + for (TableProfile profile : storedTableProfiles) { + storedMapTableProfiles.put(profile.getProfileDate(), profile); + } + } + //validate all the columns + for (ColumnProfile columnProfile: tableProfile.getColumnProfile()) { + validateColumn(table, columnProfile.getName()); + } + storedMapTableProfiles.put(tableProfile.getProfileDate(), tableProfile); + List updatedProfiles = new ArrayList<>(storedMapTableProfiles.values()); + + entityExtensionDAO().insert(tableId, "table.tableProfile", "tableProfile", + JsonUtils.pojoToJson(updatedProfiles)); + } + @Transaction public void deleteFollower(String tableId, String userId) { EntityUtil.validateUser(userDAO(), userId); @@ -436,6 +461,7 @@ public abstract class TableRepository { table.setJoins(fields.contains("joins") ? getJoins(table) : null); table.setSampleData(fields.contains("sampleData") ? getSampleData(table) : null); table.setViewDefinition(fields.contains("viewDefinition") ? table.getViewDefinition() : null); + table.setTableProfile(fields.contains("tableProfile") ? getTableProfile(table): null); return table; } @@ -650,6 +676,11 @@ public abstract class TableRepository { TableData.class); } + private List getTableProfile(Table table) throws IOException { + return JsonUtils.readObjects(entityExtensionDAO().getExtension(table.getId().toString(), + "table.tableProfile"), + TableProfile.class); + } public interface TableDAO { @SqlUpdate("INSERT INTO table_entity (json) VALUES (:json)") diff --git a/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java b/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java index 057086e5dda..a7e3ddd38a1 100644 --- a/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java +++ b/catalog-rest-service/src/main/java/org/openmetadata/catalog/resources/databases/TableResource.java @@ -35,6 +35,7 @@ import org.openmetadata.catalog.resources.Collection; import org.openmetadata.catalog.security.CatalogAuthorizer; import org.openmetadata.catalog.security.SecurityUtil; import org.openmetadata.catalog.type.EntityReference; +import org.openmetadata.catalog.type.TableProfile; import org.openmetadata.catalog.util.EntityUtil; import org.openmetadata.catalog.util.EntityUtil.Fields; import org.openmetadata.catalog.util.RestUtil; @@ -116,7 +117,7 @@ public class TableResource { } static final String FIELDS = "columns,tableConstraints,usageSummary,owner," + - "database,tags,followers,joins,sampleData,viewDefinition"; + "database,tags,followers,joins,sampleData,viewDefinition,tableProfile"; public static final List FIELD_LIST = Arrays.asList(FIELDS.replaceAll(" ", "") .split(",")); @@ -349,6 +350,21 @@ public class TableResource { return addHref(uriInfo, table); } + @PUT + @Path("/{id}/tableProfile") + @Operation(summary = "Add table profile data", tags = "tables", + description = "Add table profile data to the table.") + public Table addDataProfiler(@Context UriInfo uriInfo, + @Context SecurityContext securityContext, + @Parameter(description = "Id of the table", schema = @Schema(type = "string")) + @PathParam("id") String id, TableProfile tableProfile) throws IOException, ParseException { + SecurityUtil.checkAdminOrBotRole(authorizer, securityContext); + Fields fields = new Fields(FIELD_LIST, "tableProfile"); + dao.addTableProfileData(id, tableProfile); + Table table = dao.get(id, fields); + return addHref(uriInfo, table); + } + @DELETE @Path("/{id}/followers/{userId}") @Operation(summary = "Remove a follower", tags = "tables", diff --git a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json index 9d194c52fb3..d8a09867a95 100644 --- a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json +++ b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json @@ -240,6 +240,80 @@ } }, "additionalProperties": false + }, + "columnProfile": { + "type": "object", + "javaType": "org.openmetadata.catalog.type.ColumnProfile", + "description": "This schema defines the type to capture the table's column profile.", + "properties": { + "name": { + "description": "Column Name.", + "type": "string" + }, + "uniqueCount": { + "description": "No. of unique values in the column", + "type": "number" + }, + "uniqueProportion": { + "description": "Proportion of number of unique values in a column", + "type": "number" + }, + "nullCount": { + "description": "No.of null values in a column", + "type": "number" + }, + "nullProportion": { + "description": "No.of null value proportion in columns", + "type": "number" + }, + "min": { + "description": "Minimum value in a column.", + "type": "string" + }, + "max": { + "description": "Maximum value in a column.", + "type": "string" + }, + "mean": { + "description": "Avg value in a column.", + "type": "string" + }, + "median": { + "description": "Median value in a column.", + "type": "string" + }, + "stddev": { + "description": "Standard deviation of a column.", + "type": "number" + } + } + }, + "tableProfile": { + "type": "object", + "javaType": "org.openmetadata.catalog.type.TableProfile", + "description": "This schema defines the type to capture the table's data profile.", + "properties": { + "profileDate": { + "description": "Data one which profile is taken.", + "$ref": "../../type/basic.json#/definitions/date" + }, + "columnCount": { + "description": "No.of columns in the table.", + "type": "number" + }, + "rowCount": { + "description": "No.of rows in the table.", + "type": "number" + }, + "columnProfile": { + "description": "List of local column profiles of the table.", + "type": "array", + "items": { + "$ref": "#/definitions/columnProfile" + } + } + }, + "additionalProperties": false } }, "properties": { @@ -321,6 +395,14 @@ "description": "Sample data for a table.", "$ref": "#/definitions/tableData", "default": null + }, + "tableProfile": { + "description": "Data profile for a table.", + "type": "array", + "items": { + "$ref": "#/definitions/tableProfile" + }, + "default": null } }, "required": [ diff --git a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java index 861a99a6560..246e73846ac 100644 --- a/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java +++ b/catalog-rest-service/src/test/java/org/openmetadata/catalog/resources/databases/TableResourceTest.java @@ -44,12 +44,14 @@ import org.openmetadata.catalog.resources.teams.UserResourceTest; import org.openmetadata.catalog.type.Column; import org.openmetadata.catalog.type.ColumnDataType; import org.openmetadata.catalog.type.ColumnJoin; +import org.openmetadata.catalog.type.ColumnProfile; import org.openmetadata.catalog.type.EntityReference; import org.openmetadata.catalog.type.JoinedWith; import org.openmetadata.catalog.type.TableConstraint; import org.openmetadata.catalog.type.TableConstraint.ConstraintType; import org.openmetadata.catalog.type.TableData; import org.openmetadata.catalog.type.TableJoins; +import org.openmetadata.catalog.type.TableProfile; import org.openmetadata.catalog.type.TableType; import org.openmetadata.catalog.type.TagLabel; import org.openmetadata.catalog.util.EntityUtil; @@ -610,6 +612,55 @@ public class TableResourceTest extends CatalogApplicationTest { "TableType View, SecureView or MaterializedView"); } + @Test + public void put_tableProfile_200(TestInfo test) throws HttpResponseException { + Table table = createAndCheckTable(create(test), adminAuthHeaders()); + List columns = Arrays.asList("c1", "c2", "c3"); + ColumnProfile c1Profile = new ColumnProfile().withName("c1").withMax("100.0") + .withMin("10.0").withUniqueCount(100.0); + ColumnProfile c2Profile = new ColumnProfile().withName("c2").withMax("99.0").withMin("20.0").withUniqueCount(89.0); + ColumnProfile c3Profile = new ColumnProfile().withName("c3").withMax("75.0").withMin("25.0").withUniqueCount(77.0); + // Add column profiles + List columnProfiles = List.of(c1Profile, c2Profile, c3Profile); + TableProfile tableProfile = new TableProfile().withRowCount(6.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-09"); + putTableProfileData(table.getId(), tableProfile, adminAuthHeaders()); + + table = getTable(table.getId(), "tableProfile", adminAuthHeaders()); + verifyTableProfileData(table.getTableProfile(), List.of(tableProfile)); + + // Add new date for TableProfile + TableProfile newTableProfile = new TableProfile().withRowCount(7.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-08"); + putTableProfileData(table.getId(), newTableProfile, adminAuthHeaders()); + table = getTable(table.getId(), "tableProfile", adminAuthHeaders()); + verifyTableProfileData(table.getTableProfile(), List.of(newTableProfile, tableProfile)); + + // Replace table profile for a date + TableProfile newTableProfile1 = new TableProfile().withRowCount(21.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-08"); + putTableProfileData(table.getId(), newTableProfile1, adminAuthHeaders()); + table = getTable(table.getId(), "tableProfile", adminAuthHeaders()); + verifyTableProfileData(table.getTableProfile(), List.of(newTableProfile1, tableProfile)); + } + + @Test + public void put_tableInvalidTableProfileData_4xx(TestInfo test) throws HttpResponseException { + Table table = createAndCheckTable(create(test), adminAuthHeaders()); + + ColumnProfile c1Profile = new ColumnProfile().withName("c1").withMax("100").withMin("10.0") + .withUniqueCount(100.0); + ColumnProfile c2Profile = new ColumnProfile().withName("c2").withMax("99.0").withMin("20.0").withUniqueCount(89.0); + ColumnProfile c3Profile = new ColumnProfile().withName("invalidColumn").withMax("75") + .withMin("25").withUniqueCount(77.0); + List columnProfiles = List.of(c1Profile, c2Profile, c3Profile); + TableProfile tableProfile = new TableProfile().withRowCount(6.0).withColumnCount(3.0) + .withColumnProfile(columnProfiles).withProfileDate("2021-09-09"); + HttpResponseException exception = assertThrows(HttpResponseException.class, () + -> putTableProfileData(table.getId(), tableProfile, adminAuthHeaders())); + TestUtils.assertResponseContains(exception, BAD_REQUEST, "Invalid column name invalidColumn"); + } + @Test public void get_nonExistentTable_404_notFound() { HttpResponseException exception = assertThrows(HttpResponseException.class, () -> @@ -1225,6 +1276,13 @@ public class TableResourceTest extends CatalogApplicationTest { TestUtils.put(target, data, OK, authHeaders); } + public static void putTableProfileData(UUID tableId, TableProfile data, Map authHeaders) + throws HttpResponseException { + WebTarget target = CatalogApplicationTest.getResource("tables/" + tableId + "/tableProfile"); + TestUtils.put(target, data, OK, authHeaders); + } + + private void deleteTable(UUID id, Map authHeaders) throws HttpResponseException { TestUtils.delete(CatalogApplicationTest.getResource("tables/" + id), authHeaders); @@ -1355,4 +1413,17 @@ public class TableResourceTest extends CatalogApplicationTest { public static String getTableName(TestInfo test, int index) { return String.format("table%d_%s", index, test.getDisplayName()); } + + private void verifyTableProfileData(List actualProfiles, List expectedProfiles) { + assertEquals(actualProfiles.size(), expectedProfiles.size()); + Map tableProfileMap = new HashMap<>(); + for(TableProfile profile: actualProfiles) { + tableProfileMap.put(profile.getProfileDate(), profile); + } + for(TableProfile tableProfile: expectedProfiles) { + TableProfile storedProfile = tableProfileMap.get(tableProfile.getProfileDate()); + assertNotNull(storedProfile); + assertEquals(tableProfile, storedProfile); + } + } } diff --git a/ingestion/examples/workflows/redshift.json b/ingestion/examples/workflows/redshift.json index fd22db30e30..d71b831d36a 100644 --- a/ingestion/examples/workflows/redshift.json +++ b/ingestion/examples/workflows/redshift.json @@ -2,9 +2,9 @@ "source": { "type": "redshift", "config": { - "host_port": "cluster.name.region.redshift.amazonaws.com:5439", - "username": "username", - "password": "strong_password", + "host_port": "redshift-cluster-1.clot5cqn1cnb.us-west-2.redshift.amazonaws.com:5439", + "username": "awsuser", + "password": "focguC-kaqqe5-nepsok", "database": "warehouse", "service_name": "aws_redshift", "filter_pattern": { diff --git a/ingestion/pipelines/mysql.json b/ingestion/pipelines/mysql.json index 2b084c4cceb..0f1c54fa026 100644 --- a/ingestion/pipelines/mysql.json +++ b/ingestion/pipelines/mysql.json @@ -4,9 +4,10 @@ "config": { "username": "openmetadata_user", "password": "openmetadata_password", + "database": "openmetadata_db", "service_name": "local_mysql", "filter_pattern": { - "excludes": ["mysql.*", "information_schema.*"] + "excludes": ["mysql.*", "information_schema.*", "performance_schema.*", "sys.*"] } } }, diff --git a/ingestion/requirements.txt b/ingestion/requirements.txt index 07ad3a89aad..7cff288849a 100644 --- a/ingestion/requirements.txt +++ b/ingestion/requirements.txt @@ -16,4 +16,5 @@ confluent_kafka>=1.5.0 fastavro>=1.2.0 google~=3.0.0 okta~=2.0.0 -PyMySQL~=1.0.2 \ No newline at end of file +PyMySQL~=1.0.2 +great-expectations>=0.13.31 \ No newline at end of file diff --git a/ingestion/setup.py b/ingestion/setup.py index a30f0e3b667..4eab765c79e 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -58,12 +58,12 @@ base_requirements = { "email-validator>=1.0.3", "wheel~=0.36.2", "python-jose==3.3.0", - "okta==1.7.0", - "pandas~=1.3.1", + "okta>=1.7.0", "sqlalchemy>=1.3.24", "sql-metadata~=2.0.0", "spacy==3.0.5", "requests~=2.25.1", + "great-expectations>=0.13.31", "en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz#egg=en_core_web" } base_plugins = { diff --git a/ingestion/src/metadata/generated/data/tags/__init__.py b/ingestion/src/metadata/generated/data/tags/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/data/tags/__init__.py +++ b/ingestion/src/metadata/generated/data/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/data/tags/personalDataTags.py b/ingestion/src/metadata/generated/data/tags/personalDataTags.py index f561667dd81..7454a3e2db6 100644 --- a/ingestion/src/metadata/generated/data/tags/personalDataTags.py +++ b/ingestion/src/metadata/generated/data/tags/personalDataTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/personalDataTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations @@ -12,5 +12,5 @@ from pydantic import BaseModel, Field class Model(BaseModel): __root__: Any = Field( ..., - description='Tags related classifying **Personal data** as defined by **GDPR.**\n\n\n\n_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your GDPR policy document in this description._', + description='Tags related classifying **Personal data** as defined by **GDPR.**

_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your GDPR policy document in this description._', ) diff --git a/ingestion/src/metadata/generated/data/tags/piiTags.py b/ingestion/src/metadata/generated/data/tags/piiTags.py index 8632b988d83..c2c4c403ee5 100644 --- a/ingestion/src/metadata/generated/data/tags/piiTags.py +++ b/ingestion/src/metadata/generated/data/tags/piiTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/piiTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations @@ -12,5 +12,5 @@ from pydantic import BaseModel, Field class Model(BaseModel): __root__: Any = Field( ..., - description='Personally Identifiable Information information that, when used alone or with other relevant data, can identify an individual.\n\n\n\n_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your PII policy document in this description._', + description='Personally Identifiable Information information that, when used alone or with other relevant data, can identify an individual.

_Note to Legal - This tag category is provided as a starting point. Please review and update the tags based on your company policy. Also, add a reference to your PII policy document in this description._', ) diff --git a/ingestion/src/metadata/generated/data/tags/tierTags.py b/ingestion/src/metadata/generated/data/tags/tierTags.py index 81cab8a2d3e..fa121533465 100644 --- a/ingestion/src/metadata/generated/data/tags/tierTags.py +++ b/ingestion/src/metadata/generated/data/tags/tierTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/tierTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/data/tags/userTags.py b/ingestion/src/metadata/generated/data/tags/userTags.py index 01900c3ecea..de5de310349 100644 --- a/ingestion/src/metadata/generated/data/tags/userTags.py +++ b/ingestion/src/metadata/generated/data/tags/userTags.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: data/tags/userTags.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/__init__.py b/ingestion/src/metadata/generated/schema/api/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/catalogVersion.py b/ingestion/src/metadata/generated/schema/api/catalogVersion.py index 22261343910..809d24f0949 100644 --- a/ingestion/src/metadata/generated/schema/api/catalogVersion.py +++ b/ingestion/src/metadata/generated/schema/api/catalogVersion.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/catalogVersion.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/__init__.py b/ingestion/src/metadata/generated/schema/api/data/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/data/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/data/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/data/createChart.py b/ingestion/src/metadata/generated/schema/api/data/createChart.py index 275497e1cd9..c849b35915f 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createChart.py +++ b/ingestion/src/metadata/generated/schema/api/data/createChart.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createChart.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createDashboard.py b/ingestion/src/metadata/generated/schema/api/data/createDashboard.py index eb54e1a8ce6..b0542b28890 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createDashboard.py +++ b/ingestion/src/metadata/generated/schema/api/data/createDashboard.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createDashboard.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createDatabase.py b/ingestion/src/metadata/generated/schema/api/data/createDatabase.py index 873c2d4e0cc..981bd22ee35 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createDatabase.py +++ b/ingestion/src/metadata/generated/schema/api/data/createDatabase.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createDatabase.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createTable.py b/ingestion/src/metadata/generated/schema/api/data/createTable.py index 1fbcce9bc96..9ef094897ce 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createTable.py +++ b/ingestion/src/metadata/generated/schema/api/data/createTable.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createTable.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/data/createTopic.py b/ingestion/src/metadata/generated/schema/api/data/createTopic.py index 97307991d17..7b990410669 100644 --- a/ingestion/src/metadata/generated/schema/api/data/createTopic.py +++ b/ingestion/src/metadata/generated/schema/api/data/createTopic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/data/createTopic.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations @@ -38,8 +38,7 @@ class CreateTopic(BaseModel): description='Topic clean up policy. For Kafka - `cleanup.policy` configuration.', ) replicationFactor: Optional[int] = Field( - None, - description='Replication Factor in integer (more than 1).', + None, description='Replication Factor in integer (more than 1).' ) retentionTime: Optional[float] = Field( None, diff --git a/ingestion/src/metadata/generated/schema/api/feed/__init__.py b/ingestion/src/metadata/generated/schema/api/feed/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/feed/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/feed/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/feed/createThread.py b/ingestion/src/metadata/generated/schema/api/feed/createThread.py index 8a09d019e7f..85475220bc4 100644 --- a/ingestion/src/metadata/generated/schema/api/feed/createThread.py +++ b/ingestion/src/metadata/generated/schema/api/feed/createThread.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/feed/createThread.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/__init__.py b/ingestion/src/metadata/generated/schema/api/services/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/services/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/services/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py b/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py index 39a5bfd2413..9cc975de60e 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createDashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createDashboardService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py b/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py index 260df61a82c..a579a98d964 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createDatabaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createDatabaseService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py b/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py index e318a6d16bf..24779c4977b 100644 --- a/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py +++ b/ingestion/src/metadata/generated/schema/api/services/createMessagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/createMessagingService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py b/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py index 560493feb7b..c2ebd7fee81 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateDashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateDashboardService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py b/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py index 031d2770e37..24cdec50e51 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateDatabaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateDatabaseService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py b/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py index ce64422bdde..b0a41c638f3 100644 --- a/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py +++ b/ingestion/src/metadata/generated/schema/api/services/updateMessagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/services/updateMessagingService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/setOwner.py b/ingestion/src/metadata/generated/schema/api/setOwner.py index 150f08d2156..a329b72174c 100644 --- a/ingestion/src/metadata/generated/schema/api/setOwner.py +++ b/ingestion/src/metadata/generated/schema/api/setOwner.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/setOwner.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/tags/__init__.py b/ingestion/src/metadata/generated/schema/api/tags/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/tags/createTag.py b/ingestion/src/metadata/generated/schema/api/tags/createTag.py index 5a4bcf9558b..31a4eaa6f45 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/createTag.py +++ b/ingestion/src/metadata/generated/schema/api/tags/createTag.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/tags/createTag.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py b/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py index 8542311d5bd..dcb6aaf9c86 100644 --- a/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py +++ b/ingestion/src/metadata/generated/schema/api/tags/createTagCategory.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/tags/createTagCategory.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/teams/__init__.py b/ingestion/src/metadata/generated/schema/api/teams/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/__init__.py +++ b/ingestion/src/metadata/generated/schema/api/teams/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/api/teams/createTeam.py b/ingestion/src/metadata/generated/schema/api/teams/createTeam.py index ee4c92e71fc..1ed8173451e 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/createTeam.py +++ b/ingestion/src/metadata/generated/schema/api/teams/createTeam.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/teams/createTeam.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/api/teams/createUser.py b/ingestion/src/metadata/generated/schema/api/teams/createUser.py index 023cd75056d..07ef77981d3 100644 --- a/ingestion/src/metadata/generated/schema/api/teams/createUser.py +++ b/ingestion/src/metadata/generated/schema/api/teams/createUser.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/api/teams/createUser.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/__init__.py b/ingestion/src/metadata/generated/schema/entity/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/bots.py b/ingestion/src/metadata/generated/schema/entity/bots.py index ebe1d06cf3d..ed970070549 100644 --- a/ingestion/src/metadata/generated/schema/entity/bots.py +++ b/ingestion/src/metadata/generated/schema/entity/bots.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/bots.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/__init__.py b/ingestion/src/metadata/generated/schema/entity/data/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/data/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/data/chart.py b/ingestion/src/metadata/generated/schema/entity/data/chart.py index 249c82a46bf..82dcf229ffc 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/chart.py +++ b/ingestion/src/metadata/generated/schema/entity/data/chart.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/chart.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/dashboard.py b/ingestion/src/metadata/generated/schema/entity/data/dashboard.py index a0729f64840..7035fc3b34f 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/dashboard.py +++ b/ingestion/src/metadata/generated/schema/entity/data/dashboard.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/dashboard.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/database.py b/ingestion/src/metadata/generated/schema/entity/data/database.py index c49d76ce02d..9e072387987 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/database.py +++ b/ingestion/src/metadata/generated/schema/entity/data/database.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/database.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/metrics.py b/ingestion/src/metadata/generated/schema/entity/data/metrics.py index 5fc27a3a1e6..a3ef45f8fb0 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/metrics.py +++ b/ingestion/src/metadata/generated/schema/entity/data/metrics.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/metrics.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/pipeline.py b/ingestion/src/metadata/generated/schema/entity/data/pipeline.py index b4d39be0f92..a802334f293 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/pipeline.py +++ b/ingestion/src/metadata/generated/schema/entity/data/pipeline.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/pipeline.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/report.py b/ingestion/src/metadata/generated/schema/entity/data/report.py index 5b4b609584e..b50e1302922 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/report.py +++ b/ingestion/src/metadata/generated/schema/entity/data/report.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/report.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/data/table.py b/ingestion/src/metadata/generated/schema/entity/data/table.py index c5d369cda01..a8ee201bf8d 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/table.py +++ b/ingestion/src/metadata/generated/schema/entity/data/table.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/table.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations @@ -124,6 +124,27 @@ class TableData(BaseModel): ) +class ColumnProfile(BaseModel): + name: Optional[str] = Field(None, description='Column Name.') + uniqueCount: Optional[float] = Field( + None, description='No. of unique values in the column' + ) + uniqueProportion: Optional[float] = Field( + None, description='Proportion of number of unique values in a column' + ) + nullCount: Optional[float] = Field( + None, description='No.of null values in a column' + ) + nullProportion: Optional[float] = Field( + None, description='No.of null value proportion in columns' + ) + min: Optional[str] = Field(None, description='Minimum value in a column.') + max: Optional[str] = Field(None, description='Maximum value in a column.') + mean: Optional[str] = Field(None, description='Avg value in a column.') + median: Optional[str] = Field(None, description='Median value in a column.') + stddev: Optional[float] = Field(None, description='Standard deviation of a column.') + + class TableJoins(BaseModel): class Config: extra = Extra.forbid @@ -135,6 +156,22 @@ class TableJoins(BaseModel): columnJoins: Optional[List[ColumnJoins]] = None +class TableProfile(BaseModel): + class Config: + extra = Extra.forbid + + profileDate: Optional[basic.Date] = Field( + None, description='Data one which profile is taken.' + ) + columnCount: Optional[float] = Field( + None, description='No.of columns in the table.' + ) + rowCount: Optional[float] = Field(None, description='No.of rows in the table.') + columnProfile: Optional[List[ColumnProfile]] = Field( + None, description='List of local column profiles of the table.' + ) + + class Column(BaseModel): name: ColumnName columnDataType: ColumnDataType = Field( @@ -194,3 +231,6 @@ class Table(BaseModel): sampleData: Optional[TableData] = Field( None, description='Sample data for a table.' ) + tableProfile: Optional[List[TableProfile]] = Field( + None, description='Data profile for a table.' + ) diff --git a/ingestion/src/metadata/generated/schema/entity/data/topic.py b/ingestion/src/metadata/generated/schema/entity/data/topic.py index 78e16b8b613..790212905f5 100644 --- a/ingestion/src/metadata/generated/schema/entity/data/topic.py +++ b/ingestion/src/metadata/generated/schema/entity/data/topic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/data/topic.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations @@ -64,6 +64,9 @@ class Topic(BaseModel): None, description='Retention time in milliseconds. For Kafka - `retention.ms` configuration.', ) + replicationFactor: Optional[int] = Field( + None, description='Replication Factor in integer (more than 1).' + ) maximumMessageSize: Optional[int] = Field( None, description='Maximum message size in bytes. For Kafka - `max.message.bytes` configuration.', diff --git a/ingestion/src/metadata/generated/schema/entity/feed/__init__.py b/ingestion/src/metadata/generated/schema/entity/feed/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/feed/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/feed/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/feed/thread.py b/ingestion/src/metadata/generated/schema/entity/feed/thread.py index 90d10fe75e7..c75c26f202f 100644 --- a/ingestion/src/metadata/generated/schema/entity/feed/thread.py +++ b/ingestion/src/metadata/generated/schema/entity/feed/thread.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/feed/thread.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/__init__.py b/ingestion/src/metadata/generated/schema/entity/services/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/services/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py b/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py index 2e7dcb13ba5..55a3f7d3621 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/dashboardService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/dashboardService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/databaseService.py b/ingestion/src/metadata/generated/schema/entity/services/databaseService.py index 6969ce9e78d..1dd6327ad31 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/databaseService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/databaseService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/databaseService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/services/messagingService.py b/ingestion/src/metadata/generated/schema/entity/services/messagingService.py index 4d9887a22af..171f726fe89 100644 --- a/ingestion/src/metadata/generated/schema/entity/services/messagingService.py +++ b/ingestion/src/metadata/generated/schema/entity/services/messagingService.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/services/messagingService.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/tags/__init__.py b/ingestion/src/metadata/generated/schema/entity/tags/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/tags/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/tags/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py b/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py index 31183207f94..9f8e1aef884 100644 --- a/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py +++ b/ingestion/src/metadata/generated/schema/entity/tags/tagCategory.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/tags/tagCategory.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/teams/__init__.py b/ingestion/src/metadata/generated/schema/entity/teams/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/__init__.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/entity/teams/team.py b/ingestion/src/metadata/generated/schema/entity/teams/team.py index 423fa81d5db..b357f675af5 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/team.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/team.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/teams/team.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/entity/teams/user.py b/ingestion/src/metadata/generated/schema/entity/teams/user.py index 102defbb41e..389515cf16d 100644 --- a/ingestion/src/metadata/generated/schema/entity/teams/user.py +++ b/ingestion/src/metadata/generated/schema/entity/teams/user.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/entity/teams/user.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/__init__.py b/ingestion/src/metadata/generated/schema/type/__init__.py index 72cdaaed7c3..2bb2db246ef 100644 --- a/ingestion/src/metadata/generated/schema/type/__init__.py +++ b/ingestion/src/metadata/generated/schema/type/__init__.py @@ -1,3 +1,3 @@ # generated by datamodel-codegen: # filename: json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 diff --git a/ingestion/src/metadata/generated/schema/type/auditLog.py b/ingestion/src/metadata/generated/schema/type/auditLog.py index 9c15806268b..61533d0462e 100644 --- a/ingestion/src/metadata/generated/schema/type/auditLog.py +++ b/ingestion/src/metadata/generated/schema/type/auditLog.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/auditLog.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/basic.py b/ingestion/src/metadata/generated/schema/type/basic.py index 8019c8a7856..fefc9d382c6 100644 --- a/ingestion/src/metadata/generated/schema/type/basic.py +++ b/ingestion/src/metadata/generated/schema/type/basic.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/basic.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py b/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py index 35e35acd29b..a000a642c0c 100644 --- a/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py +++ b/ingestion/src/metadata/generated/schema/type/collectionDescriptor.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/collectionDescriptor.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/dailyCount.py b/ingestion/src/metadata/generated/schema/type/dailyCount.py index fafa104390b..7fc3d4b2f43 100644 --- a/ingestion/src/metadata/generated/schema/type/dailyCount.py +++ b/ingestion/src/metadata/generated/schema/type/dailyCount.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/dailyCount.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/entityReference.py b/ingestion/src/metadata/generated/schema/type/entityReference.py index 55c3d90ef1f..ae11cf76a96 100644 --- a/ingestion/src/metadata/generated/schema/type/entityReference.py +++ b/ingestion/src/metadata/generated/schema/type/entityReference.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/entityReference.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/entityUsage.py b/ingestion/src/metadata/generated/schema/type/entityUsage.py index a3782be7172..d38ca958d31 100644 --- a/ingestion/src/metadata/generated/schema/type/entityUsage.py +++ b/ingestion/src/metadata/generated/schema/type/entityUsage.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/entityUsage.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/jdbcConnection.py b/ingestion/src/metadata/generated/schema/type/jdbcConnection.py index bc401ab98a0..74818d079c1 100644 --- a/ingestion/src/metadata/generated/schema/type/jdbcConnection.py +++ b/ingestion/src/metadata/generated/schema/type/jdbcConnection.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/jdbcConnection.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/paging.py b/ingestion/src/metadata/generated/schema/type/paging.py new file mode 100644 index 00000000000..688a3b492dd --- /dev/null +++ b/ingestion/src/metadata/generated/schema/type/paging.py @@ -0,0 +1,23 @@ +# generated by datamodel-codegen: +# filename: schema/type/paging.json +# timestamp: 2021-09-09T17:49:00+00:00 + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, Field + + +class Paging(BaseModel): + before: Optional[str] = Field( + None, + description='Before cursor used for getting the previous page (see API pagination for details).', + ) + after: Optional[str] = Field( + None, + description='After cursor used for getting the next page (see API pagination for details).', + ) + total: int = Field( + ..., description='Total number of entries available to page through.' + ) diff --git a/ingestion/src/metadata/generated/schema/type/profile.py b/ingestion/src/metadata/generated/schema/type/profile.py index 701fb2c1693..c1e83abf930 100644 --- a/ingestion/src/metadata/generated/schema/type/profile.py +++ b/ingestion/src/metadata/generated/schema/type/profile.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/profile.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/schedule.py b/ingestion/src/metadata/generated/schema/type/schedule.py index bfaa72c8296..430cb05b528 100644 --- a/ingestion/src/metadata/generated/schema/type/schedule.py +++ b/ingestion/src/metadata/generated/schema/type/schedule.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/schedule.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/tagLabel.py b/ingestion/src/metadata/generated/schema/type/tagLabel.py index e6b3b682e18..977e161d8f1 100644 --- a/ingestion/src/metadata/generated/schema/type/tagLabel.py +++ b/ingestion/src/metadata/generated/schema/type/tagLabel.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/tagLabel.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/generated/schema/type/usageDetails.py b/ingestion/src/metadata/generated/schema/type/usageDetails.py index 8c39334b03a..02369ba6eb2 100644 --- a/ingestion/src/metadata/generated/schema/type/usageDetails.py +++ b/ingestion/src/metadata/generated/schema/type/usageDetails.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: schema/type/usageDetails.json -# timestamp: 2021-09-02T03:08:31+00:00 +# timestamp: 2021-09-09T17:49:00+00:00 from __future__ import annotations diff --git a/ingestion/src/metadata/ingestion/models/table_metadata.py b/ingestion/src/metadata/ingestion/models/table_metadata.py index 6605471de76..eb01aa14842 100644 --- a/ingestion/src/metadata/ingestion/models/table_metadata.py +++ b/ingestion/src/metadata/ingestion/models/table_metadata.py @@ -258,3 +258,50 @@ class Dashboard(BaseModel): charts: List[str] service: EntityReference lastModified: int = None + + +class ValueFrequency(BaseModel): + """Profiler ValueFrequency""" + value: str + frequency: int + + +class Histogram(BaseModel): + """Histogram""" + boundaries: List[str] + heights: List[str] + + +class Quantile(BaseModel): + """Quantile""" + quantile: str + value: str + + +class DatasetColumnProfile(BaseModel): + """Dataset Column Profile stats """ + fqdn: str + unique_count: int = None + unique_proportion: int = None + null_count: int = None + null_proportion: int = None + min: str = None + max: str = None + mean: str = None + median: str = None + stddev: str = None + quantiles: List[Quantile] = None + distinct_value_frequencies: List[ValueFrequency] = None + histogram: List[Histogram] = None + sample_values: List[str] = None + + +class DatasetProfile(BaseModel): + """Dataset(table) stats""" + timestamp: int + table_name: str + row_count: int = None + col_count: int = None + col_profiles: List[DatasetColumnProfile] = None + + diff --git a/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py b/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py index a73d5e3b05a..045b69f78d8 100644 --- a/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py +++ b/ingestion/src/metadata/ingestion/ometa/openmetadata_rest.py @@ -28,7 +28,7 @@ from metadata.generated.schema.api.services.createMessagingService import Create from metadata.generated.schema.entity.data.chart import Chart from metadata.generated.schema.entity.data.dashboard import Dashboard from metadata.generated.schema.entity.data.database import Database -from metadata.generated.schema.entity.data.table import Table, TableData, TableJoins +from metadata.generated.schema.entity.data.table import Table, TableData, TableJoins, TableProfile from metadata.generated.schema.entity.data.topic import Topic from metadata.generated.schema.entity.services.dashboardService import DashboardService from metadata.generated.schema.entity.services.databaseService import DatabaseService @@ -56,7 +56,7 @@ TableEntities = List[Table] Tags = List[Tag] Topics = List[Topic] Dashboards = List[Dashboard] - +TableProfiles = List[TableProfile] class MetadataServerConfig(ConfigModel): api_endpoint: str @@ -241,10 +241,15 @@ class OpenMetadataAPIClient(object): else: return [Table(**t) for t in resp['data']] - def ingest_sample_data(self, id, sample_data): - resp = self.client.put('/tables/{}/sampleData'.format(id.__root__), data=sample_data.json()) + def ingest_sample_data(self, table_id, sample_data): + resp = self.client.put('/tables/{}/sampleData'.format(table_id.__root__), data=sample_data.json()) return TableData(**resp['sampleData']) + def ingest_table_profile_data(self, table_id, table_profile): + print(table_profile.json()) + resp = self.client.put('/tables/{}/tableProfile'.format(table_id.__root__), data=table_profile.json()) + return [TableProfile(**t) for t in resp['tableProfile']] + def get_table_by_id(self, table_id: str, fields: [] = ['columns']) -> Table: """Get Table By ID""" params = {'fields': ",".join(fields)} diff --git a/ingestion/src/metadata/ingestion/sink/metadata_rest.py b/ingestion/src/metadata/ingestion/sink/metadata_rest.py index e1a25445d7f..45c7aea1f08 100644 --- a/ingestion/src/metadata/ingestion/sink/metadata_rest.py +++ b/ingestion/src/metadata/ingestion/sink/metadata_rest.py @@ -14,6 +14,7 @@ # limitations under the License. import logging +from typing import List from pydantic import ValidationError @@ -30,7 +31,7 @@ from metadata.ingestion.api.sink import Sink, SinkStatus from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable from metadata.ingestion.models.table_metadata import Chart, Dashboard from metadata.ingestion.ometa.client import APIError -from metadata.ingestion.ometa.openmetadata_rest import OpenMetadataAPIClient, MetadataServerConfig +from metadata.ingestion.ometa.openmetadata_rest import OpenMetadataAPIClient, MetadataServerConfig, TableProfiles logger = logging.getLogger(__name__) @@ -102,7 +103,10 @@ class MetadataRestSink(Sink): created_table = self.client.create_or_update_table(table_request) if table_and_db.table.sampleData is not None: - self.client.ingest_sample_data(id=created_table.id, sample_data=table_and_db.table.sampleData) + self.client.ingest_sample_data(table_id=created_table.id, sample_data=table_and_db.table.sampleData) + if table_and_db.table.tableProfile is not None: + self.client.ingest_table_profile_data(table_id=created_table.id, + table_profile=table_and_db.table.tableProfile) logger.info( 'Successfully ingested table {}.{}'. diff --git a/ingestion/src/metadata/ingestion/source/sql_source.py b/ingestion/src/metadata/ingestion/source/sql_source.py index b0f680a1f9b..5feaa9198d5 100644 --- a/ingestion/src/metadata/ingestion/source/sql_source.py +++ b/ingestion/src/metadata/ingestion/source/sql_source.py @@ -28,8 +28,9 @@ from metadata.generated.schema.type.entityReference import EntityReference from metadata.generated.schema.entity.data.database import Database -from metadata.generated.schema.entity.data.table import Table, Column, ColumnConstraint, TableType, TableData -from sqlalchemy import create_engine, inspect +from metadata.generated.schema.entity.data.table import Table, Column, ColumnConstraint, TableType, TableData, \ + TableProfile +from sqlalchemy import create_engine from sqlalchemy.engine.reflection import Inspector from sqlalchemy.sql import sqltypes as types from sqlalchemy.inspection import inspect @@ -37,8 +38,10 @@ from sqlalchemy.inspection import inspect from metadata.ingestion.api.common import IncludeFilterPattern, ConfigModel, Record from metadata.ingestion.api.common import WorkflowContext from metadata.ingestion.api.source import Source, SourceStatus +from metadata.ingestion.models.table_metadata import DatasetProfile from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig from metadata.utils.helpers import get_database_service_or_create +from metadata.profiler.dataprofiler import DataProfiler logger: logging.Logger = logging.getLogger(__name__) @@ -72,6 +75,9 @@ class SQLConnectionConfig(ConfigModel): include_views: Optional[bool] = True include_tables: Optional[bool] = True generate_sample_data: Optional[bool] = True + data_profiler_enabled: Optional[bool] = True + data_profiler_offset: Optional[int] = 0 + data_profiler_limit: Optional[int] = 50000 filter_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all() @abstractmethod @@ -152,8 +158,18 @@ def get_column_type(status: SQLSourceStatus, dataset_name: str, column_type: Any return type_class -class SQLSource(Source): +def _get_table_description(schema: str, table: str, inspector: Inspector) -> str: + description = None + try: + table_info: dict = inspector.get_table_comment(table, schema) + except Exception as err: + logger.error(f"Table Description Error : {err}") + else: + description = table_info["text"] + return description + +class SQLSource(Source): def __init__(self, config: SQLConnectionConfig, metadata_config: MetadataServerConfig, ctx: WorkflowContext): super().__init__(ctx) @@ -162,8 +178,12 @@ class SQLSource(Source): self.service = get_database_service_or_create(config, metadata_config) self.status = SQLSourceStatus() self.sql_config = self.config - self.engine = create_engine(self.sql_config.get_connection_url(), **self.sql_config.options) + self.connection_string = self.sql_config.get_connection_url() + self.engine = create_engine(self.connection_string, **self.sql_config.options) self.connection = self.engine.connect() + if self.config.data_profiler_enabled: + self.data_profiler = DataProfiler(status=self.status, + connection_str=self.connection_string) def prepare(self): pass @@ -179,7 +199,7 @@ class SQLSource(Source): def fetch_sample_data(self, schema: str, table: str): try: - query = self.config.query.format(schema,table) + query = self.config.query.format(schema, table) logger.info(query) results = self.connection.execute(query) cols = list(results.keys()) @@ -211,11 +231,11 @@ class SQLSource(Source): schema, table_name = self.standardize_schema_table_names(schema, table_name) if not self.sql_config.filter_pattern.included(table_name): self.status.filter('{}.{}'.format(self.config.get_service_name(), table_name), - "Table pattern not allowed") + "Table pattern not allowed") continue self.status.scanned('{}.{}'.format(self.config.get_service_name(), table_name)) - description = self._get_table_description(schema, table_name, inspector) + description = _get_table_description(schema, table_name, inspector) table_columns = self._get_columns(schema, table_name, inspector) table_entity = Table(id=uuid.uuid4(), @@ -227,6 +247,10 @@ class SQLSource(Source): table_data = self.fetch_sample_data(schema, table_name) table_entity.sampleData = table_data + if self.config.data_profiler_enabled: + profile = self.run_data_profiler(table_name, schema) + table_entity.tableProfile = profile + table_and_db = OMetaDatabaseAndTable(table=table_entity, database=self._get_database(schema)) yield table_and_db except ValidationError as err: @@ -241,7 +265,7 @@ class SQLSource(Source): try: if not self.sql_config.filter_pattern.included(view_name): self.status.filter('{}.{}'.format(self.config.get_service_name(), view_name), - "View pattern not allowed") + "View pattern not allowed") continue try: view_definition = inspector.get_view_definition(view_name, schema) @@ -249,7 +273,7 @@ class SQLSource(Source): except NotImplementedError: view_definition = "" - description = self._get_table_description(schema, view_name, inspector) + description = _get_table_description(schema, view_name, inspector) table_columns = self._get_columns(schema, view_name, inspector) table = Table(id=uuid.uuid4(), name=view_name, @@ -314,15 +338,23 @@ class SQLSource(Source): return table_columns - def _get_table_description(self, schema: str, table: str, inspector: Inspector) -> str: - description = None - try: - table_info: dict = inspector.get_table_comment(table, schema) - except Exception as err: - logger.error(f"Table Description Error : {err}") - else: - description = table_info["text"] - return description + def run_data_profiler( + self, + table: str, + schema: str + ) -> TableProfile: + dataset_name = f"{schema}.{table}" + self.status.scanned(f"profile of {dataset_name}") + logger.info(f"Running Profiling for {dataset_name}. " + f"If you haven't configured offset and limit this process can take longer") + profile = self.data_profiler.run_profiler( + dataset_name=dataset_name, + schema=schema, + table=table, + limit=self.sql_config.data_profiler_limit, + offset=self.sql_config.data_profiler_offset) + logger.debug(f"Finished profiling {dataset_name}") + return profile def close(self): if self.connection is not None: diff --git a/ingestion/src/metadata/profiler/__init__.py b/ingestion/src/metadata/profiler/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ingestion/src/metadata/profiler/dataprofiler.py b/ingestion/src/metadata/profiler/dataprofiler.py new file mode 100644 index 00000000000..9cce386b864 --- /dev/null +++ b/ingestion/src/metadata/profiler/dataprofiler.py @@ -0,0 +1,206 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +from datetime import datetime +from typing import Any, Iterable, Optional + +from great_expectations.core.expectation_validation_result import ( + ExpectationSuiteValidationResult, + ExpectationValidationResult, +) +from great_expectations.data_context import BaseDataContext +from great_expectations.data_context.types.base import ( + DataContextConfig, + DatasourceConfig, + InMemoryStoreBackendDefaults, +) + +from metadata.generated.schema.entity.data.table import TableProfile, ColumnProfile +from metadata.ingestion.api.source import SourceStatus +from metadata.ingestion.models.table_metadata import DatasetProfile, DatasetColumnProfile, ValueFrequency +from metadata.profiler.util import group_by + +logger: logging.Logger = logging.getLogger(__name__) + + +class DataProfiler: + data_context: BaseDataContext + status: SourceStatus + datasource_name: str = "om_data_source" + + def __init__(self, connection_str, status): + self.status = status + self.connection_str = connection_str + data_context_config = DataContextConfig( + datasources={ + self.datasource_name: DatasourceConfig( + class_name="SqlAlchemyDatasource", + credentials={ + "url": self.connection_str, + }, + ) + }, + store_backend_defaults=InMemoryStoreBackendDefaults(), + anonymous_usage_statistics={ + "enabled": False, + }, + ) + + self.data_context = BaseDataContext(project_config=data_context_config) + + def run_profiler( + self, + dataset_name: str, + schema: str = None, + table: str = None, + limit: int = None, + offset: int = None, + **kwargs: Any, + ) -> TableProfile: + profile_test_results = self._profile_data_asset( + { + "schema": schema, + "table": table, + "limit": limit, + "offset": offset, + **kwargs, + } + ) + profile = self._parse_test_results_to_table_profile(profile_test_results, dataset_name=dataset_name) + return profile + + def _profile_data_asset( + self, + batch_kwargs: dict + ) -> ExpectationSuiteValidationResult: + + profile_results = self.data_context.profile_data_asset( + self.datasource_name, + batch_kwargs={ + "datasource": self.datasource_name, + **batch_kwargs, + }, + ) + assert profile_results["success"] + + assert len(profile_results["results"]) == 1 + test_suite, test_results = profile_results["results"][0] + return test_results + + @staticmethod + def _get_column_from_result(result: ExpectationValidationResult) -> Optional[str]: + return result.expectation_config.kwargs.get("column") + + def _parse_test_results_to_table_profile( + self, profile_test_results: ExpectationSuiteValidationResult, dataset_name: str + ) -> TableProfile: + profile = None + column_profiles = [] + for col, col_test_result in group_by( + profile_test_results.results, key=self._get_column_from_result + ): + if col is None: + profile = self._parse_table_test_results(col_test_result, dataset_name=dataset_name) + else: + column_profile = self._parse_column_test_results(col, col_test_result, dataset_name=dataset_name) + column_profiles.append(column_profile) + + if profile is not None: + profile.columnProfile = column_profiles + return profile + + def _parse_table_test_results( + self, + table_test_results: Iterable[ExpectationValidationResult], + dataset_name: str, + ) -> TableProfile: + logger.info("generating table stats") + profile = TableProfile(profileDate=datetime.now(). strftime("%Y-%m-%d")) + for table_result in table_test_results: + expectation: str = table_result.expectation_config.expectation_type + result: dict = table_result.result + if expectation == "expect_table_row_count_to_be_between": + profile.rowCount = result['observed_value'] + elif expectation == "expect_table_columns_to_match_ordered_list": + profile.columnCount = len(result["observed_value"]) + else: + self.status.warning( + f"profile of {dataset_name}", f"unknown table mapper {expectation}" + ) + return profile + + def _parse_column_test_results( + self, + column: str, + col_test_results: Iterable[ExpectationValidationResult], + dataset_name: str, + ) -> ColumnProfile: + logger.info(f"Generating Column Stats for {column}") + column_profile = ColumnProfile(name=column) + for col_result in col_test_results: + expectation: str = col_result.expectation_config.expectation_type + result: dict = col_result.result + if not result: + self.status.warning( + f"profile of {dataset_name}", f"{expectation} did not yield any results" + ) + continue + + if expectation == "expect_column_unique_value_count_to_be_between": + column_profile.uniqueCount = result["observed_value"] + elif expectation == "expect_column_proportion_of_unique_values_to_be_between": + column_profile.uniqueProportion = result["observed_value"] + elif expectation == "expect_column_values_to_not_be_null": + column_profile.nullCount = result["unexpected_count"] + if ( + "unexpected_percent" in result + and result["unexpected_percent"] is not None + ): + column_profile.nullProportion = result["unexpected_percent"] / 100 + elif expectation == "expect_column_values_to_not_match_regex": + pass + elif expectation == "expect_column_mean_to_be_between": + column_profile.mean = str(result["observed_value"]) + elif expectation == "expect_column_min_to_be_between": + column_profile.min = str(result["observed_value"]) + elif expectation == "expect_column_max_to_be_between": + column_profile.max = str(result["observed_value"]) + elif expectation == "expect_column_median_to_be_between": + column_profile.median = str(result["observed_value"]) + elif expectation == "expect_column_stdev_to_be_between": + column_profile.stddev = str(result["observed_value"]) + elif expectation == "expect_column_quantile_values_to_be_between": + pass + elif expectation == "expect_column_values_to_be_in_set": + #column_profile.sample_values = [ + # str(v) for v in result["partial_unexpected_list"] + #] + pass + elif expectation == "expect_column_kl_divergence_to_be_less_than": + pass + elif expectation == "expect_column_distinct_values_to_be_in_set": + pass + elif expectation == "expect_column_values_to_be_in_type_list": + pass + elif expectation == "expect_column_values_to_be_unique": + pass + else: + self.status.warning( + f"profile of {dataset_name}", + f"warning: unknown column mapper {expectation} in col {column}", + ) + return column_profile diff --git a/ingestion/src/metadata/profiler/util.py b/ingestion/src/metadata/profiler/util.py new file mode 100644 index 00000000000..f5494d0739d --- /dev/null +++ b/ingestion/src/metadata/profiler/util.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +from typing import TypeVar, Iterable, Callable, Tuple + +T = TypeVar("T") +K = TypeVar("K") + + +def group_by( + iterable: Iterable[T], key: Callable[[T], K] +) -> Iterable[Tuple[K, Iterable[T]]]: + + values = collections.defaultdict(list) + for v in iterable: + values[key(v)].append(v) + return values.items() \ No newline at end of file