feat(open assertion spec): MVP for Snowflake DMF Assertions: update models, add assertions cli with snowflake integration (#10602)

Mayuri Nehate 2024-06-01 00:33:22 +05:30 committed by GitHub
parent 648fd459eb
commit 81b655c82d
77 changed files with 5351 additions and 16 deletions

View File

@ -20,6 +20,7 @@ public class Constants {
public static final String LINEAGE_SCHEMA_FILE = "lineage.graphql";
public static final String PROPERTIES_SCHEMA_FILE = "properties.graphql";
public static final String FORMS_SCHEMA_FILE = "forms.graphql";
public static final String ASSERTIONS_SCHEMA_FILE = "assertions.graphql";
public static final String INCIDENTS_SCHEMA_FILE = "incident.graphql";
public static final String CONNECTIONS_SCHEMA_FILE = "connection.graphql";
public static final String BROWSE_PATH_DELIMITER = "/";

View File

@ -118,7 +118,12 @@ import com.linkedin.datahub.graphql.resolvers.MeResolver;
import com.linkedin.datahub.graphql.resolvers.assertion.AssertionRunEventResolver;
import com.linkedin.datahub.graphql.resolvers.assertion.DeleteAssertionResolver;
import com.linkedin.datahub.graphql.resolvers.assertion.EntityAssertionsResolver;
import com.linkedin.datahub.graphql.resolvers.auth.*;
import com.linkedin.datahub.graphql.resolvers.auth.CreateAccessTokenResolver;
import com.linkedin.datahub.graphql.resolvers.auth.DebugAccessResolver;
import com.linkedin.datahub.graphql.resolvers.auth.GetAccessTokenMetadataResolver;
import com.linkedin.datahub.graphql.resolvers.auth.GetAccessTokenResolver;
import com.linkedin.datahub.graphql.resolvers.auth.ListAccessTokensResolver;
import com.linkedin.datahub.graphql.resolvers.auth.RevokeAccessTokenResolver;
import com.linkedin.datahub.graphql.resolvers.browse.BrowsePathsResolver;
import com.linkedin.datahub.graphql.resolvers.browse.BrowseResolver;
import com.linkedin.datahub.graphql.resolvers.browse.EntityBrowsePathsResolver;
@ -814,6 +819,7 @@ public class GmsGraphQLEngine {
.addSchema(fileBasedSchema(PROPERTIES_SCHEMA_FILE))
.addSchema(fileBasedSchema(FORMS_SCHEMA_FILE))
.addSchema(fileBasedSchema(CONNECTIONS_SCHEMA_FILE))
.addSchema(fileBasedSchema(ASSERTIONS_SCHEMA_FILE))
.addSchema(fileBasedSchema(INCIDENTS_SCHEMA_FILE));
for (GmsGraphQLPlugin plugin : this.graphQLPlugins) {

View File

@ -98,6 +98,16 @@ public class AssertionRunEventResolver
&& AssertionResultType.SUCCESS.equals(
runEvent.getResult().getType()))
.count()));
result.setErrored(
Math.toIntExact(
runEvents.stream()
.filter(
runEvent ->
AssertionRunStatus.COMPLETE.equals(runEvent.getStatus())
&& runEvent.getResult() != null
&& AssertionResultType.ERROR.equals(
runEvent.getResult().getType()))
.count()));
result.setRunEvents(runEvents);
return result;
} catch (RemoteInvocationException e) {

View File

@ -2,6 +2,8 @@ package com.linkedin.datahub.graphql.types.assertion;
import static com.linkedin.metadata.Constants.GLOBAL_TAGS_ASPECT_NAME;
import com.linkedin.assertion.AssertionAction;
import com.linkedin.assertion.AssertionActions;
import com.linkedin.assertion.AssertionInfo;
import com.linkedin.common.DataPlatformInstance;
import com.linkedin.common.GlobalTags;
@ -10,24 +12,40 @@ import com.linkedin.common.urn.Urn;
import com.linkedin.data.DataMap;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.Assertion;
import com.linkedin.datahub.graphql.generated.AssertionActionType;
import com.linkedin.datahub.graphql.generated.AssertionSource;
import com.linkedin.datahub.graphql.generated.AssertionSourceType;
import com.linkedin.datahub.graphql.generated.AssertionStdAggregation;
import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
import com.linkedin.datahub.graphql.generated.AssertionStdParameter;
import com.linkedin.datahub.graphql.generated.AssertionStdParameterType;
import com.linkedin.datahub.graphql.generated.AssertionStdParameters;
import com.linkedin.datahub.graphql.generated.AssertionType;
import com.linkedin.datahub.graphql.generated.AuditStamp;
import com.linkedin.datahub.graphql.generated.DataPlatform;
import com.linkedin.datahub.graphql.generated.DatasetAssertionInfo;
import com.linkedin.datahub.graphql.generated.DatasetAssertionScope;
import com.linkedin.datahub.graphql.generated.DateInterval;
import com.linkedin.datahub.graphql.generated.EntityType;
import com.linkedin.datahub.graphql.generated.FieldAssertionInfo;
import com.linkedin.datahub.graphql.generated.FixedIntervalSchedule;
import com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo;
import com.linkedin.datahub.graphql.generated.SchemaAssertionCompatibility;
import com.linkedin.datahub.graphql.generated.SchemaAssertionField;
import com.linkedin.datahub.graphql.generated.SchemaAssertionInfo;
import com.linkedin.datahub.graphql.generated.SchemaFieldRef;
import com.linkedin.datahub.graphql.generated.SqlAssertionInfo;
import com.linkedin.datahub.graphql.generated.VolumeAssertionInfo;
import com.linkedin.datahub.graphql.types.common.mappers.DataPlatformInstanceAspectMapper;
import com.linkedin.datahub.graphql.types.common.mappers.StringMapMapper;
import com.linkedin.datahub.graphql.types.dataset.mappers.SchemaFieldMapper;
import com.linkedin.datahub.graphql.types.dataset.mappers.SchemaMetadataMapper;
import com.linkedin.datahub.graphql.types.tag.mappers.GlobalTagsMapper;
import com.linkedin.entity.EntityResponse;
import com.linkedin.entity.EnvelopedAspect;
import com.linkedin.entity.EnvelopedAspectMap;
import com.linkedin.metadata.Constants;
import com.linkedin.schema.SchemaField;
import java.util.Collections;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
@ -48,6 +66,14 @@ public class AssertionMapper {
result.setInfo(
mapAssertionInfo(context, new AssertionInfo(envelopedAssertionInfo.getValue().data())));
}
final EnvelopedAspect envelopedAssertionActions =
aspects.get(Constants.ASSERTION_ACTIONS_ASPECT_NAME);
if (envelopedAssertionActions != null) {
result.setActions(
mapAssertionActions(new AssertionActions(envelopedAssertionActions.getValue().data())));
}
final EnvelopedAspect envelopedPlatformInstance =
aspects.get(Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME);
if (envelopedPlatformInstance != null) {
@ -83,20 +109,93 @@ public class AssertionMapper {
return result;
}
private static com.linkedin.datahub.graphql.generated.AssertionInfo mapAssertionInfo(
public static com.linkedin.datahub.graphql.generated.AssertionInfo mapAssertionInfo(
@Nullable QueryContext context, final AssertionInfo gmsAssertionInfo) {
final com.linkedin.datahub.graphql.generated.AssertionInfo assertionInfo =
new com.linkedin.datahub.graphql.generated.AssertionInfo();
assertionInfo.setType(AssertionType.valueOf(gmsAssertionInfo.getType().name()));
if (gmsAssertionInfo.hasLastUpdated()) {
assertionInfo.setLastUpdated(
new AuditStamp(
gmsAssertionInfo.getLastUpdated().getTime(),
gmsAssertionInfo.getLastUpdated().getActor().toString()));
}
if (gmsAssertionInfo.hasDatasetAssertion()) {
DatasetAssertionInfo datasetAssertion =
mapDatasetAssertionInfo(context, gmsAssertionInfo.getDatasetAssertion());
assertionInfo.setDatasetAssertion(datasetAssertion);
}
assertionInfo.setDescription(gmsAssertionInfo.getDescription());
// Description
if (gmsAssertionInfo.hasDescription()) {
assertionInfo.setDescription(gmsAssertionInfo.getDescription());
}
// FRESHNESS Assertions
if (gmsAssertionInfo.hasFreshnessAssertion()) {
FreshnessAssertionInfo freshnessAssertionInfo =
FreshnessAssertionMapper.mapFreshnessAssertionInfo(
context, gmsAssertionInfo.getFreshnessAssertion());
assertionInfo.setFreshnessAssertion(freshnessAssertionInfo);
}
// VOLUME Assertions
if (gmsAssertionInfo.hasVolumeAssertion()) {
VolumeAssertionInfo volumeAssertionInfo =
VolumeAssertionMapper.mapVolumeAssertionInfo(
context, gmsAssertionInfo.getVolumeAssertion());
assertionInfo.setVolumeAssertion(volumeAssertionInfo);
}
// SQL Assertions
if (gmsAssertionInfo.hasSqlAssertion()) {
SqlAssertionInfo sqlAssertionInfo =
SqlAssertionMapper.mapSqlAssertionInfo(gmsAssertionInfo.getSqlAssertion());
assertionInfo.setSqlAssertion(sqlAssertionInfo);
}
// FIELD Assertions
if (gmsAssertionInfo.hasFieldAssertion()) {
FieldAssertionInfo fieldAssertionInfo =
FieldAssertionMapper.mapFieldAssertionInfo(context, gmsAssertionInfo.getFieldAssertion());
assertionInfo.setFieldAssertion(fieldAssertionInfo);
}
// SCHEMA Assertions
if (gmsAssertionInfo.hasSchemaAssertion()) {
SchemaAssertionInfo schemaAssertionInfo =
mapSchemaAssertionInfo(context, gmsAssertionInfo.getSchemaAssertion());
assertionInfo.setSchemaAssertion(schemaAssertionInfo);
}
// Source Type
if (gmsAssertionInfo.hasSource()) {
assertionInfo.setSource(mapSource(gmsAssertionInfo.getSource()));
}
return assertionInfo;
}
private static com.linkedin.datahub.graphql.generated.AssertionActions mapAssertionActions(
final AssertionActions gmsAssertionActions) {
final com.linkedin.datahub.graphql.generated.AssertionActions result =
new com.linkedin.datahub.graphql.generated.AssertionActions();
if (gmsAssertionActions.hasOnFailure()) {
result.setOnFailure(
gmsAssertionActions.getOnFailure().stream()
.map(AssertionMapper::mapAssertionAction)
.collect(Collectors.toList()));
}
if (gmsAssertionActions.hasOnSuccess()) {
result.setOnSuccess(
gmsAssertionActions.getOnSuccess().stream()
.map(AssertionMapper::mapAssertionAction)
.collect(Collectors.toList()));
}
return result;
}
private static com.linkedin.datahub.graphql.generated.AssertionAction mapAssertionAction(
final AssertionAction gmsAssertionAction) {
final com.linkedin.datahub.graphql.generated.AssertionAction result =
new com.linkedin.datahub.graphql.generated.AssertionAction();
result.setType(AssertionActionType.valueOf(gmsAssertionAction.getType().toString()));
return result;
}
private static DatasetAssertionInfo mapDatasetAssertionInfo(
@Nullable QueryContext context,
final com.linkedin.assertion.DatasetAssertionInfo gmsDatasetAssertion) {
@ -152,7 +251,7 @@ public class AssertionMapper {
return new SchemaFieldRef(schemaFieldUrn.toString(), schemaFieldUrn.getEntityKey().get(1));
}
private static AssertionStdParameters mapParameters(
protected static AssertionStdParameters mapParameters(
final com.linkedin.assertion.AssertionStdParameters params) {
final AssertionStdParameters result = new AssertionStdParameters();
if (params.hasValue()) {
@ -175,5 +274,61 @@ public class AssertionMapper {
return result;
}
private AssertionMapper() {}
protected static FixedIntervalSchedule mapFixedIntervalSchedule(
com.linkedin.assertion.FixedIntervalSchedule gmsFixedIntervalSchedule) {
FixedIntervalSchedule fixedIntervalSchedule = new FixedIntervalSchedule();
fixedIntervalSchedule.setUnit(DateInterval.valueOf(gmsFixedIntervalSchedule.getUnit().name()));
fixedIntervalSchedule.setMultiple(gmsFixedIntervalSchedule.getMultiple());
return fixedIntervalSchedule;
}
private static AssertionSource mapSource(final com.linkedin.assertion.AssertionSource gmsSource) {
AssertionSource result = new AssertionSource();
result.setType(AssertionSourceType.valueOf(gmsSource.getType().toString()));
if (gmsSource.hasCreated()) {
result.setCreated(
new AuditStamp(
gmsSource.getCreated().getTime(), gmsSource.getCreated().getActor().toString()));
}
return result;
}
protected static com.linkedin.datahub.graphql.generated.SchemaFieldSpec mapSchemaFieldSpec(
final com.linkedin.schema.SchemaFieldSpec gmsField) {
final com.linkedin.datahub.graphql.generated.SchemaFieldSpec result =
new com.linkedin.datahub.graphql.generated.SchemaFieldSpec();
result.setPath(gmsField.getPath());
result.setType(gmsField.getType());
result.setNativeType(gmsField.getNativeType());
return result;
}
private static SchemaAssertionInfo mapSchemaAssertionInfo(
@Nullable final QueryContext context,
final com.linkedin.assertion.SchemaAssertionInfo gmsSchemaAssertionInfo) {
SchemaAssertionInfo result = new SchemaAssertionInfo();
result.setCompatibility(
SchemaAssertionCompatibility.valueOf(gmsSchemaAssertionInfo.getCompatibility().name()));
result.setEntityUrn(gmsSchemaAssertionInfo.getEntity().toString());
result.setSchema(
SchemaMetadataMapper.INSTANCE.apply(
context, gmsSchemaAssertionInfo.getSchema(), gmsSchemaAssertionInfo.getEntity(), 0L));
result.setFields(
gmsSchemaAssertionInfo.getSchema().getFields().stream()
.map(AssertionMapper::mapSchemaField)
.collect(Collectors.toList()));
return result;
}
private static SchemaAssertionField mapSchemaField(final SchemaField gmsField) {
SchemaAssertionField result = new SchemaAssertionField();
result.setPath(gmsField.getFieldPath());
result.setType(new SchemaFieldMapper().mapSchemaFieldDataType(gmsField.getType()));
if (gmsField.hasNativeDataType()) {
result.setNativeType(gmsField.getNativeDataType());
}
return result;
}
protected AssertionMapper() {}
}

View File

@ -28,8 +28,8 @@ public class AssertionType
Constants.ASSERTION_KEY_ASPECT_NAME,
Constants.ASSERTION_INFO_ASPECT_NAME,
Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME,
Constants.GLOBAL_TAGS_ASPECT_NAME);
Constants.GLOBAL_TAGS_ASPECT_NAME,
Constants.ASSERTION_ACTIONS_ASPECT_NAME);
private final EntityClient _entityClient;
public AssertionType(final EntityClient entityClient) {

View File

@ -0,0 +1,92 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.assertion.FieldAssertionInfo;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
import com.linkedin.datahub.graphql.generated.FieldAssertionType;
import com.linkedin.datahub.graphql.generated.FieldMetricType;
import com.linkedin.datahub.graphql.generated.FieldTransformType;
import com.linkedin.datahub.graphql.generated.FieldValuesFailThresholdType;
import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper;
import javax.annotation.Nullable;
public class FieldAssertionMapper extends AssertionMapper {
public static com.linkedin.datahub.graphql.generated.FieldAssertionInfo mapFieldAssertionInfo(
@Nullable final QueryContext context, final FieldAssertionInfo gmsFieldAssertionInfo) {
final com.linkedin.datahub.graphql.generated.FieldAssertionInfo result =
new com.linkedin.datahub.graphql.generated.FieldAssertionInfo();
result.setEntityUrn(gmsFieldAssertionInfo.getEntity().toString());
result.setType(FieldAssertionType.valueOf(gmsFieldAssertionInfo.getType().name()));
if (gmsFieldAssertionInfo.hasFilter()) {
result.setFilter(DatasetFilterMapper.map(context, gmsFieldAssertionInfo.getFilter()));
}
if (gmsFieldAssertionInfo.hasFieldValuesAssertion()) {
result.setFieldValuesAssertion(
mapFieldValuesAssertion(gmsFieldAssertionInfo.getFieldValuesAssertion()));
}
if (gmsFieldAssertionInfo.hasFieldMetricAssertion()) {
result.setFieldMetricAssertion(
mapFieldMetricAssertion(gmsFieldAssertionInfo.getFieldMetricAssertion()));
}
return result;
}
private static com.linkedin.datahub.graphql.generated.FieldValuesAssertion
mapFieldValuesAssertion(
final com.linkedin.assertion.FieldValuesAssertion gmsFieldValuesAssertion) {
final com.linkedin.datahub.graphql.generated.FieldValuesAssertion result =
new com.linkedin.datahub.graphql.generated.FieldValuesAssertion();
result.setField(mapSchemaFieldSpec(gmsFieldValuesAssertion.getField()));
result.setOperator(AssertionStdOperator.valueOf(gmsFieldValuesAssertion.getOperator().name()));
result.setFailThreshold(
mapFieldValuesFailThreshold(gmsFieldValuesAssertion.getFailThreshold()));
result.setExcludeNulls(gmsFieldValuesAssertion.isExcludeNulls());
if (gmsFieldValuesAssertion.hasTransform()) {
result.setTransform(mapFieldTransform(gmsFieldValuesAssertion.getTransform()));
}
if (gmsFieldValuesAssertion.hasParameters()) {
result.setParameters(mapParameters(gmsFieldValuesAssertion.getParameters()));
}
return result;
}
private static com.linkedin.datahub.graphql.generated.FieldMetricAssertion
mapFieldMetricAssertion(
final com.linkedin.assertion.FieldMetricAssertion gmsFieldMetricAssertion) {
final com.linkedin.datahub.graphql.generated.FieldMetricAssertion result =
new com.linkedin.datahub.graphql.generated.FieldMetricAssertion();
result.setField(mapSchemaFieldSpec(gmsFieldMetricAssertion.getField()));
result.setMetric(FieldMetricType.valueOf(gmsFieldMetricAssertion.getMetric().name()));
result.setOperator(AssertionStdOperator.valueOf(gmsFieldMetricAssertion.getOperator().name()));
if (gmsFieldMetricAssertion.hasParameters()) {
result.setParameters(mapParameters(gmsFieldMetricAssertion.getParameters()));
}
return result;
}
private static com.linkedin.datahub.graphql.generated.FieldTransform mapFieldTransform(
final com.linkedin.assertion.FieldTransform gmsFieldTransform) {
final com.linkedin.datahub.graphql.generated.FieldTransform result =
new com.linkedin.datahub.graphql.generated.FieldTransform();
result.setType(FieldTransformType.valueOf(gmsFieldTransform.getType().name()));
return result;
}
private static com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold
mapFieldValuesFailThreshold(
final com.linkedin.assertion.FieldValuesFailThreshold gmsFieldValuesFailThreshold) {
final com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold result =
new com.linkedin.datahub.graphql.generated.FieldValuesFailThreshold();
result.setType(
FieldValuesFailThresholdType.valueOf(gmsFieldValuesFailThreshold.getType().name()));
result.setValue(gmsFieldValuesFailThreshold.getValue());
return result;
}
private FieldAssertionMapper() {}
}

View File

@ -0,0 +1,59 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.data.template.GetMode;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo;
import com.linkedin.datahub.graphql.generated.FreshnessAssertionSchedule;
import com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType;
import com.linkedin.datahub.graphql.generated.FreshnessAssertionType;
import com.linkedin.datahub.graphql.generated.FreshnessCronSchedule;
import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper;
import javax.annotation.Nullable;
public class FreshnessAssertionMapper extends AssertionMapper {
public static FreshnessAssertionInfo mapFreshnessAssertionInfo(
@Nullable final QueryContext context,
final com.linkedin.assertion.FreshnessAssertionInfo gmsFreshnessAssertionInfo) {
FreshnessAssertionInfo freshnessAssertionInfo = new FreshnessAssertionInfo();
freshnessAssertionInfo.setEntityUrn(gmsFreshnessAssertionInfo.getEntity().toString());
freshnessAssertionInfo.setType(
FreshnessAssertionType.valueOf(gmsFreshnessAssertionInfo.getType().name()));
if (gmsFreshnessAssertionInfo.hasSchedule()) {
freshnessAssertionInfo.setSchedule(
mapFreshnessAssertionSchedule(gmsFreshnessAssertionInfo.getSchedule()));
}
if (gmsFreshnessAssertionInfo.hasFilter()) {
freshnessAssertionInfo.setFilter(
DatasetFilterMapper.map(context, gmsFreshnessAssertionInfo.getFilter()));
}
return freshnessAssertionInfo;
}
private static FreshnessCronSchedule mapFreshnessCronSchedule(
final com.linkedin.assertion.FreshnessCronSchedule gmsCronSchedule) {
FreshnessCronSchedule cronSchedule = new FreshnessCronSchedule();
cronSchedule.setCron(gmsCronSchedule.getCron());
cronSchedule.setTimezone(gmsCronSchedule.getTimezone());
cronSchedule.setWindowStartOffsetMs(gmsCronSchedule.getWindowStartOffsetMs(GetMode.NULL));
return cronSchedule;
}
private static FreshnessAssertionSchedule mapFreshnessAssertionSchedule(
final com.linkedin.assertion.FreshnessAssertionSchedule gmsFreshnessAssertionSchedule) {
FreshnessAssertionSchedule freshnessAssertionSchedule = new FreshnessAssertionSchedule();
freshnessAssertionSchedule.setType(
FreshnessAssertionScheduleType.valueOf(gmsFreshnessAssertionSchedule.getType().name()));
if (gmsFreshnessAssertionSchedule.hasCron()) {
freshnessAssertionSchedule.setCron(
mapFreshnessCronSchedule(gmsFreshnessAssertionSchedule.getCron()));
}
if (gmsFreshnessAssertionSchedule.hasFixedInterval()) {
freshnessAssertionSchedule.setFixedInterval(
mapFixedIntervalSchedule(gmsFreshnessAssertionSchedule.getFixedInterval()));
}
return freshnessAssertionSchedule;
}
private FreshnessAssertionMapper() {}
}

View File

@ -0,0 +1,27 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.assertion.SqlAssertionInfo;
import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
import com.linkedin.datahub.graphql.generated.AssertionValueChangeType;
import com.linkedin.datahub.graphql.generated.SqlAssertionType;
public class SqlAssertionMapper extends AssertionMapper {
public static com.linkedin.datahub.graphql.generated.SqlAssertionInfo mapSqlAssertionInfo(
final SqlAssertionInfo gmsSqlAssertionInfo) {
final com.linkedin.datahub.graphql.generated.SqlAssertionInfo result =
new com.linkedin.datahub.graphql.generated.SqlAssertionInfo();
result.setEntityUrn(gmsSqlAssertionInfo.getEntity().toString());
result.setType(SqlAssertionType.valueOf(gmsSqlAssertionInfo.getType().name()));
result.setStatement(gmsSqlAssertionInfo.getStatement());
result.setOperator(AssertionStdOperator.valueOf(gmsSqlAssertionInfo.getOperator().name()));
result.setParameters(mapParameters(gmsSqlAssertionInfo.getParameters()));
if (gmsSqlAssertionInfo.hasChangeType()) {
result.setChangeType(
AssertionValueChangeType.valueOf(gmsSqlAssertionInfo.getChangeType().name()));
}
return result;
}
private SqlAssertionMapper() {}
}

View File

@ -0,0 +1,115 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.assertion.VolumeAssertionInfo;
import com.linkedin.datahub.graphql.QueryContext;
import com.linkedin.datahub.graphql.generated.AssertionStdOperator;
import com.linkedin.datahub.graphql.generated.AssertionValueChangeType;
import com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformerType;
import com.linkedin.datahub.graphql.generated.VolumeAssertionType;
import com.linkedin.datahub.graphql.types.dataset.mappers.DatasetFilterMapper;
import javax.annotation.Nullable;
public class VolumeAssertionMapper extends AssertionMapper {
public static com.linkedin.datahub.graphql.generated.VolumeAssertionInfo mapVolumeAssertionInfo(
@Nullable final QueryContext context, final VolumeAssertionInfo gmsVolumeAssertionInfo) {
final com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result =
new com.linkedin.datahub.graphql.generated.VolumeAssertionInfo();
result.setEntityUrn(gmsVolumeAssertionInfo.getEntity().toString());
result.setType(VolumeAssertionType.valueOf(gmsVolumeAssertionInfo.getType().name()));
if (gmsVolumeAssertionInfo.hasFilter()) {
result.setFilter(DatasetFilterMapper.map(context, gmsVolumeAssertionInfo.getFilter()));
}
if (gmsVolumeAssertionInfo.hasRowCountTotal()) {
result.setRowCountTotal(mapRowCountTotal(gmsVolumeAssertionInfo.getRowCountTotal()));
}
if (gmsVolumeAssertionInfo.hasRowCountChange()) {
result.setRowCountChange(mapRowCountChange(gmsVolumeAssertionInfo.getRowCountChange()));
}
if (gmsVolumeAssertionInfo.hasIncrementingSegmentRowCountTotal()) {
result.setIncrementingSegmentRowCountTotal(
mapIncrementingSegmentRowCountTotal(
gmsVolumeAssertionInfo.getIncrementingSegmentRowCountTotal()));
}
if (gmsVolumeAssertionInfo.hasIncrementingSegmentRowCountChange()) {
result.setIncrementingSegmentRowCountChange(
mapIncrementingSegmentRowCountChange(
gmsVolumeAssertionInfo.getIncrementingSegmentRowCountChange()));
}
return result;
}
private static com.linkedin.datahub.graphql.generated.RowCountTotal mapRowCountTotal(
final com.linkedin.assertion.RowCountTotal gmsRowCountTotal) {
final com.linkedin.datahub.graphql.generated.RowCountTotal result =
new com.linkedin.datahub.graphql.generated.RowCountTotal();
result.setOperator(AssertionStdOperator.valueOf(gmsRowCountTotal.getOperator().name()));
result.setParameters(mapParameters(gmsRowCountTotal.getParameters()));
return result;
}
private static com.linkedin.datahub.graphql.generated.RowCountChange mapRowCountChange(
final com.linkedin.assertion.RowCountChange gmsRowCountChange) {
final com.linkedin.datahub.graphql.generated.RowCountChange result =
new com.linkedin.datahub.graphql.generated.RowCountChange();
result.setOperator(AssertionStdOperator.valueOf(gmsRowCountChange.getOperator().name()));
result.setParameters(mapParameters(gmsRowCountChange.getParameters()));
result.setType(AssertionValueChangeType.valueOf(gmsRowCountChange.getType().name()));
return result;
}
private static com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal
mapIncrementingSegmentRowCountTotal(
final com.linkedin.assertion.IncrementingSegmentRowCountTotal
gmsIncrementingSegmentRowCountTotal) {
final com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal result =
new com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountTotal();
result.setOperator(
AssertionStdOperator.valueOf(gmsIncrementingSegmentRowCountTotal.getOperator().name()));
result.setParameters(mapParameters(gmsIncrementingSegmentRowCountTotal.getParameters()));
result.setSegment(mapIncrementingSegmentSpec(gmsIncrementingSegmentRowCountTotal.getSegment()));
return result;
}
private static com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange
mapIncrementingSegmentRowCountChange(
final com.linkedin.assertion.IncrementingSegmentRowCountChange
gmsIncrementingSegmentRowCountChange) {
final com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange result =
new com.linkedin.datahub.graphql.generated.IncrementingSegmentRowCountChange();
result.setOperator(
AssertionStdOperator.valueOf(gmsIncrementingSegmentRowCountChange.getOperator().name()));
result.setParameters(mapParameters(gmsIncrementingSegmentRowCountChange.getParameters()));
result.setSegment(
mapIncrementingSegmentSpec(gmsIncrementingSegmentRowCountChange.getSegment()));
result.setType(
AssertionValueChangeType.valueOf(gmsIncrementingSegmentRowCountChange.getType().name()));
return result;
}
private static com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec
mapIncrementingSegmentSpec(final com.linkedin.assertion.IncrementingSegmentSpec gmsSegment) {
final com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec result =
new com.linkedin.datahub.graphql.generated.IncrementingSegmentSpec();
result.setField(mapSchemaFieldSpec(gmsSegment.getField()));
if (gmsSegment.hasTransformer()) {
result.setTransformer(mapIncrementingSegmentFieldTransformer(gmsSegment.getTransformer()));
}
return result;
}
private static com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer
mapIncrementingSegmentFieldTransformer(
final com.linkedin.assertion.IncrementingSegmentFieldTransformer gmsTransformer) {
final com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer result =
new com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformer();
result.setType(
IncrementingSegmentFieldTransformerType.valueOf(gmsTransformer.getType().name()));
if (gmsTransformer.hasNativeType()) {
result.setNativeType(gmsTransformer.getNativeType());
}
return result;
}
private VolumeAssertionMapper() {}
}

View File

@ -51,7 +51,7 @@ public class SchemaFieldMapper {
return result;
}
private SchemaFieldDataType mapSchemaFieldDataType(
public SchemaFieldDataType mapSchemaFieldDataType(
@Nonnull final com.linkedin.schema.SchemaFieldDataType dataTypeUnion) {
final com.linkedin.schema.SchemaFieldDataType.Type type = dataTypeUnion.getType();
if (type.isBytesType()) {

View File

@ -0,0 +1,896 @@
"""
Defines a schema field, each with a specified path and type.
"""
type SchemaAssertionField {
"""
The standard V1 path of the field within the schema.
"""
path: String!
"""
The standard type of the field.
"""
type: SchemaFieldDataType!
"""
Optional: The specific native or standard type of the field.
"""
nativeType: String
}
"""
Defines the required compatibility level for the schema assertion to pass.
"""
enum SchemaAssertionCompatibility {
"""
The schema must be exactly the same as the expected schema.
"""
EXACT_MATCH
"""
The schema must be a superset of the expected schema.
"""
SUPERSET
"""
The schema must be a subset of the expected schema.
"""
SUBSET
}
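As a rough illustration only, the three compatibility levels amount to simple set relationships between the actual and expected schemas. The helper below is hypothetical (it compares field paths only and ignores field types); it is not DataHub's evaluation logic.

import java.util.Set;

/** Hypothetical sketch: schema compatibility over field paths only (ignores field types). */
final class SchemaCompatibilitySketch {
  static boolean compatible(Set<String> actualFields, Set<String> expectedFields, String level) {
    switch (level) {
      case "EXACT_MATCH": return actualFields.equals(expectedFields);
      case "SUPERSET":    return actualFields.containsAll(expectedFields); // extra actual fields allowed
      case "SUBSET":      return expectedFields.containsAll(actualFields); // missing actual fields allowed
      default:            throw new IllegalArgumentException("Unknown level: " + level);
    }
  }

  public static void main(String[] args) {
    // The actual schema has one extra column, so SUPERSET passes while EXACT_MATCH would not.
    System.out.println(compatible(Set.of("id", "name", "created_at"), Set.of("id", "name"), "SUPERSET"));
  }
}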
"""
The source of an assertion
"""
enum AssertionSourceType {
"""
The assertion was defined natively on DataHub by a user.
"""
NATIVE
"""
The assertion was defined and is managed externally to DataHub.
"""
EXTERNAL
"""
The assertion was inferred, e.g. from offline AI / ML models.
"""
INFERRED
}
"""
The type of a Freshness assertion
"""
enum FreshnessAssertionType {
"""
An assertion defined against a Dataset Change Operation - insert, update, delete, etc
"""
DATASET_CHANGE
"""
An assertion defined against a Data Job run
"""
DATA_JOB_RUN
}
extend type AssertionInfo {
"""
Information about a Freshness Assertion
"""
freshnessAssertion: FreshnessAssertionInfo
"""
Information about a Volume Assertion
"""
volumeAssertion: VolumeAssertionInfo
"""
Information about a SQL Assertion
"""
sqlAssertion: SqlAssertionInfo
"""
Information about a Field Assertion
"""
fieldAssertion: FieldAssertionInfo
"""
Schema assertion, e.g. defining the expected structure for an asset.
"""
schemaAssertion: SchemaAssertionInfo
"""
The source or origin of the Assertion definition.
"""
source: AssertionSource
"""
The time that the status last changed and the actor who changed it
"""
lastUpdated: AuditStamp
}
extend type Assertion {
"""
The actions associated with the Assertion
"""
actions: AssertionActions
}
"""
Some actions associated with an assertion
"""
type AssertionActions {
"""
Actions to be executed on successful assertion run.
"""
onSuccess: [AssertionAction!]!
"""
Actions to be executed on failed assertion run.
"""
onFailure: [AssertionAction!]!
}
"""
An action associated with an assertion
"""
type AssertionAction {
"""
The type of the action
"""
type: AssertionActionType!
}
"""
The type of the Action
"""
enum AssertionActionType {
"""
Raise an incident.
"""
RAISE_INCIDENT
"""
Resolve open incidents related to the assertion.
"""
RESOLVE_INCIDENT
}
"""
Information about a Freshness assertion.
"""
type FreshnessAssertionInfo {
"""
The urn of the entity that the Freshness assertion is related to
"""
entityUrn: String!
"""
The type of the Freshness Assertion
"""
type: FreshnessAssertionType!
"""
Produce FAIL Assertion Result if the asset is not updated on the cadence and within the time range described by the schedule.
"""
schedule: FreshnessAssertionSchedule!
"""
A filter applied when querying an external Dataset or Table
"""
filter: DatasetFilter
}
"""
Attributes defining a single Freshness schedule.
"""
type FreshnessAssertionSchedule {
"""
The type of schedule
"""
type: FreshnessAssertionScheduleType!
"""
A cron schedule. This is populated if the type is CRON.
"""
cron: FreshnessCronSchedule
"""
A fixed interval schedule. This is populated if the type is FIXED_INTERVAL.
"""
fixedInterval: FixedIntervalSchedule
}
"""
The type of a Freshness assertion schedule
"""
enum FreshnessAssertionScheduleType {
"""
A schedule based on a cron expression representing the expected event times.
"""
CRON
"""
A schedule based on a recurring fixed interval which is used to compute the expected operation window, e.g. "every 24 hours".
"""
FIXED_INTERVAL
}
"""
A cron-formatted schedule
"""
type FreshnessCronSchedule {
"""
A cron-formatted execution interval, as a cron string, e.g. 1 * * * *
"""
cron: String!
"""
Timezone in which the cron interval applies, e.g. America/Los_Angeles
"""
timezone: String!
"""
An optional offset in milliseconds to SUBTRACT from the timestamp generated by the cron schedule
to generate the lower bound of the "Freshness window", i.e. the window of time in which an event must have occurred in order for the Freshness assertion
to be considered passing.
If left empty, the start of the Freshness window will be the _end_ of the previously evaluated Freshness window.
"""
windowStartOffsetMs: Long
}
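For intuition on how windowStartOffsetMs shapes the freshness window, here is a minimal sketch using plain java.time arithmetic. The helper and its name are hypothetical, not the server's evaluation code.

import java.time.Duration;
import java.time.Instant;

/** Hypothetical sketch: derive the freshness window start for one cron evaluation. */
final class FreshnessWindowSketch {
  /**
   * If an offset is provided, the window starts offsetMs before the cron-scheduled
   * evaluation time; otherwise it starts at the end of the previous window.
   */
  static Instant windowStart(Instant cronEvaluationTime, Long windowStartOffsetMs, Instant previousWindowEnd) {
    if (windowStartOffsetMs != null) {
      return cronEvaluationTime.minus(Duration.ofMillis(windowStartOffsetMs));
    }
    return previousWindowEnd;
  }

  public static void main(String[] args) {
    Instant evaluation = Instant.parse("2024-06-01T08:00:00Z");
    // A 6-hour lookback: an update must have landed between 02:00 and 08:00 UTC to pass.
    System.out.println(windowStart(evaluation, Duration.ofHours(6).toMillis(), null));
  }
}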
"""
A fixed interval schedule.
"""
type FixedIntervalSchedule {
"""
Interval unit such as minute/hour/day etc.
"""
unit: DateInterval!
"""
How many units. Defaults to 1.
"""
multiple: Int!
}
"""
The source of an Assertion
"""
type AssertionSource {
"""
The source type
"""
type: AssertionSourceType!
"""
The time at which the assertion was initially created and the actor who created it
"""
created: AuditStamp
}
"""
Information about the field to use in an assertion
"""
type SchemaFieldSpec {
"""
The field path
"""
path: String!
"""
The DataHub standard schema field type.
"""
type: String!
"""
The native field type
"""
nativeType: String!
}
"""
An enum to represent a type of change in an assertion value, metric, or measurement.
"""
enum AssertionValueChangeType {
"""
A change that is defined in absolute terms.
"""
ABSOLUTE
"""
A change that is defined in relative terms using percentage change
from the original value.
"""
PERCENTAGE
}
"""
A type of volume (row count) assertion
"""
enum VolumeAssertionType {
"""
A volume assertion that is evaluated against the total row count of a dataset.
"""
ROW_COUNT_TOTAL
"""
A volume assertion that is evaluated against an incremental row count of a dataset,
or a row count change.
"""
ROW_COUNT_CHANGE
"""
A volume assertion that checks the latest "segment" in a table based on an incrementing
column to check whether its row count falls into a particular range.
This can be used to monitor the row count of an incrementing date-partition column segment.
"""
INCREMENTING_SEGMENT_ROW_COUNT_TOTAL
"""
A volume assertion that compares the row counts in neighboring "segments" or "partitions"
of an incrementing column. This can be used to track changes between subsequent date partitions
in a table, for example.
"""
INCREMENTING_SEGMENT_ROW_COUNT_CHANGE
}
"""
Attributes defining a ROW_COUNT_TOTAL volume assertion.
"""
type RowCountTotal {
"""
The operator you'd like to apply.
Note that only numeric operators are valid inputs:
GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
BETWEEN.
"""
operator: AssertionStdOperator!
"""
The parameters you'd like to provide as input to the operator.
Note that only numeric parameter types are valid inputs: NUMBER.
"""
parameters: AssertionStdParameters!
}
"""
Attributes defining a ROW_COUNT_CHANGE volume assertion.
"""
type RowCountChange {
"""
The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage.
"""
type: AssertionValueChangeType!
"""
The operator you'd like to apply.
Note that only numeric operators are valid inputs:
GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
BETWEEN.
"""
operator: AssertionStdOperator!
"""
The parameters you'd like to provide as input to the operator.
Note that only numeric parameter types are valid inputs: NUMBER.
"""
parameters: AssertionStdParameters!
}
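To make the ABSOLUTE vs PERCENTAGE distinction concrete, here is a minimal sketch of how a row-count delta might be compared under each change type, assuming a simple BETWEEN check. The helper is hypothetical and simplified; it is not the DataHub evaluator.

/** Hypothetical sketch: evaluate a row count change under a BETWEEN operator. */
final class RowCountChangeSketch {
  enum ChangeType { ABSOLUTE, PERCENTAGE }

  static boolean passes(long previousRows, long currentRows, ChangeType type, double min, double max) {
    double change = currentRows - previousRows;
    if (type == ChangeType.PERCENTAGE) {
      // Express the change relative to the previous row count, as a percentage.
      change = 100.0 * change / previousRows;
    }
    return change >= min && change <= max;
  }

  public static void main(String[] args) {
    // 1,000 -> 1,050 rows: +50 rows absolute, +5% relative.
    System.out.println(passes(1_000, 1_050, ChangeType.ABSOLUTE, 0, 100));   // true
    System.out.println(passes(1_000, 1_050, ChangeType.PERCENTAGE, 0, 2));   // false
  }
}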
"""
Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_TOTAL volume assertion.
"""
type IncrementingSegmentRowCountTotal {
"""
A specification of how the 'segment' can be derived using a column and an optional transformer function.
"""
segment: IncrementingSegmentSpec!
"""
The operator you'd like to apply.
Note that only numeric operators are valid inputs:
GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
BETWEEN.
"""
operator: AssertionStdOperator!
"""
The parameters you'd like to provide as input to the operator.
Note that only numeric parameter types are valid inputs: NUMBER.
"""
parameters: AssertionStdParameters!
}
"""
Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_CHANGE volume assertion.
"""
type IncrementingSegmentRowCountChange {
"""
A specification of how the 'segment' can be derived using a column and an optional transformer function.
"""
segment: IncrementingSegmentSpec!
"""
The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage.
"""
type: AssertionValueChangeType!
"""
The operator you'd like to apply to the row count value
Note that only numeric operators are valid inputs:
GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
BETWEEN.
"""
operator: AssertionStdOperator!
"""
The parameters you'd like to provide as input to the operator.
Note that only numeric parameter types are valid inputs: NUMBER.
"""
parameters: AssertionStdParameters!
}
"""
Core attributes required to identify an incrementing segment in a table. This type is mainly useful
for tables that constantly increase with new rows being added on a particular cadence (e.g. fact or event tables).
An incrementing segment represents a logical chunk of data which is INSERTED
into a dataset on a regular interval, along with the presence of a constantly-incrementing column
value such as an event time, date partition, or last modified column.
An incrementing segment is principally identified by 2 key attributes combined:
1. A field or column that represents the incrementing value. New rows that are inserted will be identified using this column.
Note that the value of this column may not by itself represent the "bucket" or the "segment" in which the row falls.
2. [Optional] A transformer function that may be applied to the selected column value in order
to obtain the final "segment identifier" or "bucket identifier". Rows that have the same value after applying the transformation
will be grouped into the same segment, and the final value (e.g. row count) will be computed per segment.
"""
type IncrementingSegmentSpec {
"""
The field to use to generate segments. It must be constantly incrementing as new rows are inserted.
"""
field: SchemaFieldSpec!
"""
Optional transformer function to apply to the field in order to obtain the final segment or bucket identifier.
If not provided, no transformer will be applied to the field (identity function).
"""
transformer: IncrementingSegmentFieldTransformer
}
"""
The definition of the transformer function that should be applied to a given field / column value in a dataset
in order to determine the segment or bucket that it belongs to, which in turn is used to evaluate
volume assertions.
"""
type IncrementingSegmentFieldTransformer {
"""
The 'standard' transformer type. Note that not all source systems will support all transformers.
"""
type: IncrementingSegmentFieldTransformerType!
"""
The 'native' transformer type, useful as a back door if a custom transformer is required.
This field is required if the type is NATIVE.
"""
nativeType: String
}
"""
The 'standard' transformer type. Note that not all source systems will support all transformers.
"""
enum IncrementingSegmentFieldTransformerType {
"""
Rounds a timestamp (in milliseconds) down to the start of the minute.
"""
TIMESTAMP_MS_TO_MINUTE
"""
Rounds a timestamp (in milliseconds) down to the nearest hour.
"""
TIMESTAMP_MS_TO_HOUR
"""
Rounds a timestamp (in milliseconds) down to the start of the day.
"""
TIMESTAMP_MS_TO_DATE
"""
Rounds a timestamp (in milliseconds) down to the start of the month
"""
TIMESTAMP_MS_TO_MONTH
"""
Rounds a timestamp (in milliseconds) down to the start of the year
"""
TIMESTAMP_MS_TO_YEAR
"""
Rounds a numeric value down to the nearest integer.
"""
FLOOR
"""
Rounds a numeric value up to the nearest integer.
"""
CEILING
"""
A backdoor to provide a native transformer type specific to a given source system like
Snowflake, Redshift, BQ, etc.
"""
NATIVE
}
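A small sketch of how a transformer such as TIMESTAMP_MS_TO_DATE would bucket rows of an incrementing event-time column into daily segments. It uses plain java.time arithmetic and a hypothetical helper name; it is only an illustration of segment-identifier derivation, not DataHub's implementation.

import java.time.Instant;
import java.time.temporal.ChronoUnit;

/** Hypothetical sketch: derive a segment identifier from an epoch-millis column value. */
final class SegmentIdSketch {
  // TIMESTAMP_MS_TO_DATE: round the timestamp down to the start of its UTC day.
  static long timestampMsToDate(long epochMillis) {
    return Instant.ofEpochMilli(epochMillis)
        .truncatedTo(ChronoUnit.DAYS)
        .toEpochMilli();
  }

  public static void main(String[] args) {
    // Two rows from the same calendar day map to the same segment identifier.
    System.out.println(timestampMsToDate(1717221600000L)); // 2024-06-01T06:00Z
    System.out.println(timestampMsToDate(1717264800000L)); // 2024-06-01T18:00Z
  }
}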
"""
A definition of a Volume (row count) assertion.
"""
type VolumeAssertionInfo {
"""
The entity targeted by this Volume check.
"""
entityUrn: String!
"""
The type of the volume assertion being monitored.
"""
type: VolumeAssertionType!
"""
Produce FAILURE Assertion Result if the row count of the asset does not meet specific requirements.
Required if type is 'ROW_COUNT_TOTAL'.
"""
rowCountTotal: RowCountTotal
"""
Produce FAILURE Assertion Result if the row count delta of the asset does not meet specific requirements.
Required if type is 'ROW_COUNT_CHANGE'.
"""
rowCountChange: RowCountChange
"""
Produce FAILURE Assertion Result if the latest incrementing segment row count total of the asset
does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_TOTAL'.
"""
incrementingSegmentRowCountTotal: IncrementingSegmentRowCountTotal
"""
Produce FAILURE Assertion Result if the incrementing segment row count delta of the asset
does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_CHANGE'.
"""
incrementingSegmentRowCountChange: IncrementingSegmentRowCountChange
"""
A definition of the specific filters that should be applied, when performing monitoring.
If not provided, there is no filter, and the full table is under consideration.
"""
filter: DatasetFilter
}
"""
The type of the SQL assertion being monitored.
"""
enum SqlAssertionType {
"""
A SQL Metric Assertion, e.g. one based on a numeric value returned by an arbitrary SQL query.
"""
METRIC
"""
A SQL assertion that is evaluated against the CHANGE in a metric over time.
"""
METRIC_CHANGE
}
"""
Attributes defining a SQL Assertion
"""
type SqlAssertionInfo {
"""
The type of the SQL assertion being monitored.
"""
type: SqlAssertionType!
"""
The entity targeted by this SQL check.
"""
entityUrn: String!
"""
The SQL statement to be executed when evaluating the assertion.
"""
statement: String!
"""
The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage.
Required if the type is METRIC_CHANGE.
"""
changeType: AssertionValueChangeType
"""
The operator you'd like to apply to the result of the SQL query.
"""
operator: AssertionStdOperator!
"""
The parameters you'd like to provide as input to the operator.
"""
parameters: AssertionStdParameters!
}
"""
The type of a Field assertion
"""
enum FieldAssertionType {
"""
An assertion used to validate the values contained within a field / column given a set of rows.
"""
FIELD_VALUES
"""
An assertion used to validate the value of a common field / column metric (e.g. aggregation)
such as null count + percentage, min, max, median, and more.
"""
FIELD_METRIC
}
"""
The type of the Field Transform
"""
enum FieldTransformType {
"""
Obtain the length of a string field / column (applicable to string types)
"""
LENGTH
}
"""
The type of failure threshold.
"""
enum FieldValuesFailThresholdType {
"""
The maximum number of column values (i.e. rows) that are allowed
to fail the defined expectations before the assertion officially fails.
"""
COUNT
"""
The maximum percentage of rows that are allowed
to fail the defined column expectations before the assertion officially fails.
"""
PERCENTAGE
}
"""
A standard metric that can be derived from the set of values
for a specific field / column of a dataset / table.
"""
enum FieldMetricType {
"""
The number of unique values found in the column value set
"""
UNIQUE_COUNT
"""
The percentage of unique values to total rows for the dataset
"""
UNIQUE_PERCENTAGE
"""
The number of null values found in the column value set
"""
NULL_COUNT
"""
The percentage of null values to total rows for the dataset
"""
NULL_PERCENTAGE
"""
The minimum value in the column set (applies to numeric columns)
"""
MIN
"""
The maximum value in the column set (applies to numeric columns)
"""
MAX
"""
The mean value found in the column set (applies to numeric columns)
"""
MEAN
"""
The median value found in the column set (applies to numeric columns)
"""
MEDIAN
"""
The standard deviation of values found in the column set (applies to numeric columns)
"""
STDDEV
"""
The number of negative values found in the value set (applies to numeric columns)
"""
NEGATIVE_COUNT
"""
The percentage of negative values to total rows for the dataset (applies to numeric columns)
"""
NEGATIVE_PERCENTAGE
"""
The number of zero values found in the value set (applies to numeric columns)
"""
ZERO_COUNT
"""
The percentage of zero values to total rows for the dataset (applies to numeric columns)
"""
ZERO_PERCENTAGE
"""
The minimum length found in the column set (applies to string columns)
"""
MIN_LENGTH
"""
The maximum length found in the column set (applies to string columns)
"""
MAX_LENGTH
"""
The number of empty string values found in the value set (applies to string columns).
Note: This is a completely different metric from NULL_COUNT!
"""
EMPTY_COUNT
"""
The percentage of empty string values to total rows for the dataset (applies to string columns).
Note: This is a completely different metric from NULL_PERCENTAGE!
"""
EMPTY_PERCENTAGE
}
"""
A definition of a Field (Column) assertion.
"""
type FieldAssertionInfo {
"""
The type of the field assertion being monitored.
"""
type: FieldAssertionType!
"""
The entity targeted by this Field check.
"""
entityUrn: String!
"""
The definition of an assertion that validates individual values of a field / column for a set of rows.
"""
fieldValuesAssertion: FieldValuesAssertion
"""
The definition of an assertion that validates a common metric obtained about a field / column for a set of rows.
"""
fieldMetricAssertion: FieldMetricAssertion
"""
A definition of the specific filters that should be applied, when performing monitoring.
If not provided, there is no filter, and the full table is under consideration.
"""
filter: DatasetFilter
}
"""
A definition of a Field Values assertion.
"""
type FieldValuesAssertion {
"""
The field under evaluation.
"""
field: SchemaFieldSpec!
"""
An optional transform to apply to field values before evaluating the operator.
"""
transform: FieldTransform
"""
The predicate to evaluate against a single value of the field.
Depending on the operator, parameters may be required
"""
operator: AssertionStdOperator!
"""
Standard parameters required for the assertion.
"""
parameters: AssertionStdParameters
"""
Additional customization about when the assertion should be officially considered failing.
"""
failThreshold: FieldValuesFailThreshold!
"""
Whether to ignore or allow nulls when running the values assertion.
"""
excludeNulls: Boolean!
}
"""
Definition of a transform applied to the values of a column / field.
"""
type FieldTransform {
"""
The type of the field transform.
"""
type: FieldTransformType!
}
type FieldValuesFailThreshold {
"""
The type of failure threshold.
"""
type: FieldValuesFailThresholdType!
"""
The value of the threshold, either representing a count or percentage.
"""
value: Long!
}
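For intuition, a minimal sketch of how a fail threshold might be applied to the rows that violate a field values assertion, covering both COUNT and PERCENTAGE thresholds. The helper is hypothetical and not the product's evaluation logic.

/** Hypothetical sketch: decide whether a field values assertion fails, given its fail threshold. */
final class FailThresholdSketch {
  enum ThresholdType { COUNT, PERCENTAGE }

  static boolean assertionFails(long violatingRows, long evaluatedRows, ThresholdType type, long thresholdValue) {
    if (type == ThresholdType.COUNT) {
      return violatingRows > thresholdValue;               // more than N violating rows => fail
    }
    double violatingPct = 100.0 * violatingRows / evaluatedRows;
    return violatingPct > thresholdValue;                  // more than N% violating rows => fail
  }

  public static void main(String[] args) {
    System.out.println(assertionFails(12, 10_000, ThresholdType.COUNT, 10));      // true
    System.out.println(assertionFails(12, 10_000, ThresholdType.PERCENTAGE, 1));  // false (0.12%)
  }
}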
"""
A definition of a Field Metric assertion.
"""
type FieldMetricAssertion {
"""
The field under evaluation
"""
field: SchemaFieldSpec!
"""
The specific metric to assert against.
"""
metric: FieldMetricType!
"""
The predicate to evaluate against the metric for the field / column.
"""
operator: AssertionStdOperator!
"""
Standard parameters required for the assertion.
"""
parameters: AssertionStdParameters
}
"""
Information about a Schema assertion
"""
type SchemaAssertionInfo {
"""
The entity targeted by this schema assertion.
"""
entityUrn: String!
"""
The fields of the expected schema, as defined by the schema assertion.
"""
fields: [SchemaAssertionField!]!
"""
A definition of the expected structure for the asset
Deprecated! Use the simpler 'fields' instead.
"""
schema: SchemaMetadata
"""
The compatibility level required for the assertion to pass.
"""
compatibility: SchemaAssertionCompatibility!
}

View File

@ -7508,6 +7508,11 @@ type BatchSpec {
The result type of an assertion, success or failure.
"""
enum AssertionResultType {
"""
The assertion has not yet been fully evaluated.
"""
INIT
"""
The assertion succeeded.
"""
@ -7517,6 +7522,11 @@ enum AssertionResultType {
The assertion failed.
"""
FAILURE
"""
The assertion errored.
"""
ERROR
}
"""
@ -7678,6 +7688,16 @@ enum AssertionStdOperator {
"""
NOT_IN
"""
Value being asserted is true.
"""
IS_TRUE
"""
Value being asserted is false.
"""
IS_FALSE
"""
Other
"""
@ -7824,6 +7844,11 @@ type AssertionRunEventsResult {
"""
succeeded: Int!
"""
The number of errored run events
"""
errored: Int!
"""
The run events themselves
"""

View File

@ -136,6 +136,36 @@ enum IncidentState {
A specific type of incident
"""
enum IncidentType {
"""
A Freshness Assertion has failed, triggering the incident.
Raised on assets where assertions are configured to generate incidents.
"""
FRESHNESS
"""
A Volume Assertion has failed, triggering the incident.
Raised on assets where assertions are configured to generate incidents.
"""
VOLUME
"""
A Field Assertion has failed, triggering the incident.
Raised on assets where assertions are configured to generate incidents.
"""
FIELD
"""
A SQL Assertion has failed, triggering the incident.
Raised on assets where assertions are configured to generate incidents.
"""
SQL
"""
A Schema Assertion has failed, triggering the incident.
Raised on assets where assertions are configured to generate incidents.
"""
DATA_SCHEMA
"""
An operational incident, e.g. failure to materialize a dataset, or failure to execute a task / pipeline.
"""
@ -174,6 +204,11 @@ enum IncidentSourceType {
The incident was created manually, from either the API or the UI.
"""
MANUAL
"""
An assertion has failed, triggering the incident.
"""
ASSERTION_FAILURE
}
"""

View File

@ -97,6 +97,7 @@ public class AssertionRunEventResolverTest {
assertEquals(result.getTotal(), 1);
assertEquals(result.getFailed(), 0);
assertEquals(result.getSucceeded(), 1);
assertEquals(result.getErrored(), 0);
com.linkedin.datahub.graphql.generated.AssertionRunEvent graphqlRunEvent =
resolver.get(mockEnv).get().getRunEvents().get(0);

View File

@ -0,0 +1,346 @@
package com.linkedin.datahub.graphql.types.assertion;
import static org.testng.Assert.assertEquals;
import com.google.common.collect.ImmutableList;
import com.linkedin.assertion.AssertionInfo;
import com.linkedin.assertion.AssertionSource;
import com.linkedin.assertion.AssertionStdAggregation;
import com.linkedin.assertion.AssertionStdOperator;
import com.linkedin.assertion.AssertionStdParameter;
import com.linkedin.assertion.AssertionStdParameterType;
import com.linkedin.assertion.AssertionStdParameters;
import com.linkedin.assertion.AssertionType;
import com.linkedin.assertion.DatasetAssertionInfo;
import com.linkedin.assertion.DatasetAssertionScope;
import com.linkedin.assertion.FreshnessAssertionInfo;
import com.linkedin.assertion.FreshnessAssertionSchedule;
import com.linkedin.assertion.FreshnessAssertionScheduleType;
import com.linkedin.assertion.FreshnessAssertionType;
import com.linkedin.assertion.FreshnessCronSchedule;
import com.linkedin.assertion.SchemaAssertionCompatibility;
import com.linkedin.assertion.SchemaAssertionInfo;
import com.linkedin.common.GlobalTags;
import com.linkedin.common.TagAssociationArray;
import com.linkedin.common.UrnArray;
import com.linkedin.common.urn.TagUrn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.data.DataMap;
import com.linkedin.data.template.StringMap;
import com.linkedin.datahub.graphql.generated.Assertion;
import com.linkedin.datahub.graphql.generated.FixedIntervalSchedule;
import com.linkedin.entity.Aspect;
import com.linkedin.entity.EntityResponse;
import com.linkedin.entity.EnvelopedAspect;
import com.linkedin.entity.EnvelopedAspectMap;
import com.linkedin.metadata.Constants;
import com.linkedin.schema.MySqlDDL;
import com.linkedin.schema.SchemaField;
import com.linkedin.schema.SchemaFieldArray;
import com.linkedin.schema.SchemaFieldDataType;
import com.linkedin.schema.SchemaMetadata;
import com.linkedin.schema.StringType;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.testng.Assert;
import org.testng.annotations.Test;
public class AssertionMapperTest {
@Test
public void testMapDatasetAssertion() {
// Case 1: Without nullable fields
AssertionInfo input = createFreshnessAssertionInfoWithoutNullableFields();
EntityResponse datasetAssertionEntityResponse = createAssertionInfoEntityResponse(input);
Assertion output = AssertionMapper.map(null, datasetAssertionEntityResponse);
verifyAssertionInfo(input, output);
// Case 2: With nullable fields
input = createFreshnessAssertionInfoWithNullableFields();
EntityResponse datasetAssertionEntityResponseWithNullables =
createAssertionInfoEntityResponse(input);
output = AssertionMapper.map(null, datasetAssertionEntityResponseWithNullables);
verifyAssertionInfo(input, output);
}
@Test
public void testMapTags() throws Exception {
HashMap<String, EnvelopedAspect> aspects = new HashMap<>();
AssertionInfo info = createFreshnessAssertionInfoWithoutNullableFields();
EnvelopedAspect envelopedTagsAspect = new EnvelopedAspect();
GlobalTags tags = new GlobalTags();
tags.setTags(
new TagAssociationArray(
Collections.singletonList(
new com.linkedin.common.TagAssociation()
.setTag(TagUrn.createFromString("urn:li:tag:test")))));
envelopedTagsAspect.setValue(new Aspect(tags.data()));
aspects.put(Constants.ASSERTION_INFO_ASPECT_NAME, createEnvelopedAspect(info.data()));
aspects.put(Constants.GLOBAL_TAGS_ASPECT_NAME, createEnvelopedAspect(tags.data()));
EntityResponse response = createEntityResponse(aspects);
Assertion assertion = AssertionMapper.map(null, response);
assertEquals(assertion.getTags().getTags().size(), 1);
assertEquals(
assertion.getTags().getTags().get(0).getTag().getUrn().toString(), "urn:li:tag:test");
}
@Test
public void testMapFreshnessAssertion() {
// Case 1: Without nullable fields
AssertionInfo inputInfo = createFreshnessAssertionInfoWithoutNullableFields();
EntityResponse freshnessAssertionEntityResponse = createAssertionInfoEntityResponse(inputInfo);
Assertion output = AssertionMapper.map(null, freshnessAssertionEntityResponse);
verifyAssertionInfo(inputInfo, output);
// Case 2: With nullable fields
inputInfo = createDatasetAssertionInfoWithNullableFields();
EntityResponse freshnessAssertionEntityResponseWithNullables =
createAssertionInfoEntityResponse(inputInfo);
output = AssertionMapper.map(null, freshnessAssertionEntityResponseWithNullables);
verifyAssertionInfo(inputInfo, output);
}
@Test
public void testMapDataSchemaAssertion() {
AssertionInfo input = createSchemaAssertion();
EntityResponse schemaAssertionEntityResponse = createAssertionInfoEntityResponse(input);
Assertion output = AssertionMapper.map(null, schemaAssertionEntityResponse);
verifyAssertionInfo(input, output);
}
private void verifyAssertionInfo(AssertionInfo input, Assertion output) {
Assert.assertNotNull(output);
Assert.assertNotNull(output.getInfo());
Assert.assertEquals(
output.getInfo().getType().toString(), input.getType().toString());
if (input.hasDatasetAssertion()) {
verifyDatasetAssertion(input.getDatasetAssertion(), output.getInfo().getDatasetAssertion());
}
if (input.hasFreshnessAssertion()) {
verifyFreshnessAssertion(
input.getFreshnessAssertion(), output.getInfo().getFreshnessAssertion());
}
if (input.hasSchemaAssertion()) {
verifySchemaAssertion(input.getSchemaAssertion(), output.getInfo().getSchemaAssertion());
}
if (input.hasSource()) {
verifySource(input.getSource(), output.getInfo().getSource());
}
}
private void verifyDatasetAssertion(
DatasetAssertionInfo input,
com.linkedin.datahub.graphql.generated.DatasetAssertionInfo output) {
Assert.assertEquals(output.getOperator().toString(), input.getOperator().toString());
Assert.assertEquals(output.getScope().toString(), input.getScope().toString());
Assert.assertEquals(output.getDatasetUrn(), input.getDataset().toString());
if (input.hasAggregation()) {
Assert.assertEquals(output.getAggregation().toString(), input.getAggregation().toString());
}
if (input.hasNativeType()) {
Assert.assertEquals(output.getNativeType(), input.getNativeType().toString());
}
if (input.hasLogic()) {
Assert.assertEquals(output.getLogic(), input.getLogic());
}
if (input.hasFields()) {
Assert.assertTrue(
input.getFields().stream()
.allMatch(
field ->
output.getFields().stream()
.anyMatch(outField -> field.toString().equals(outField.getUrn()))));
}
}
private void verifyFreshnessAssertion(
FreshnessAssertionInfo input,
com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo output) {
Assert.assertEquals(output.getType().toString(), input.getType().toString());
Assert.assertEquals(output.getEntityUrn(), input.getEntity().toString());
if (input.hasSchedule()) {
verifyFreshnessSchedule(input.getSchedule(), output.getSchedule());
}
}
private void verifySchemaAssertion(
SchemaAssertionInfo input,
com.linkedin.datahub.graphql.generated.SchemaAssertionInfo output) {
Assert.assertEquals(output.getEntityUrn(), input.getEntity().toString());
Assert.assertEquals(output.getCompatibility().toString(), input.getCompatibility().toString());
Assert.assertEquals(
output.getSchema().getFields().size(), input.getSchema().getFields().size());
}
private void verifyCronSchedule(
FreshnessCronSchedule input,
com.linkedin.datahub.graphql.generated.FreshnessCronSchedule output) {
Assert.assertEquals(output.getCron(), input.getCron());
Assert.assertEquals(output.getTimezone(), input.getTimezone());
if (input.hasWindowStartOffsetMs()) {
Assert.assertEquals(output.getWindowStartOffsetMs(), input.getWindowStartOffsetMs());
}
}
private void verifyFreshnessSchedule(
FreshnessAssertionSchedule input,
com.linkedin.datahub.graphql.generated.FreshnessAssertionSchedule output) {
Assert.assertEquals(output.getType().toString(), input.getType().toString());
if (input.hasCron()) {
verifyCronSchedule(input.getCron(), output.getCron());
}
if (input.hasFixedInterval()) {
verifyFixedIntervalSchedule(input.getFixedInterval(), output.getFixedInterval());
}
}
private void verifyFixedIntervalSchedule(
com.linkedin.assertion.FixedIntervalSchedule input, FixedIntervalSchedule output) {
Assert.assertEquals(output.getMultiple(), (int) input.getMultiple());
Assert.assertEquals(output.getUnit().toString(), input.getUnit().toString());
}
private void verifySource(
AssertionSource input, com.linkedin.datahub.graphql.generated.AssertionSource output) {
Assert.assertEquals(output.getType().toString(), input.getType().toString());
}
private EntityResponse createAssertionInfoEntityResponse(final AssertionInfo info) {
HashMap<String, EnvelopedAspect> aspects = new HashMap<>();
aspects.put(Constants.ASSERTION_INFO_ASPECT_NAME, createEnvelopedAspect(info.data()));
return createEntityResponse(aspects);
}
private EntityResponse createEntityResponse(Map<String, EnvelopedAspect> aspects) {
EntityResponse entityResponse = new EntityResponse();
entityResponse.setUrn(UrnUtils.getUrn("urn:li:assertion:1"));
entityResponse.setAspects(new EnvelopedAspectMap(new HashMap<>()));
aspects.forEach(
(aspectName, envelopedAspect) -> {
entityResponse.getAspects().put(aspectName, envelopedAspect);
});
return entityResponse;
}
private EnvelopedAspect createEnvelopedAspect(DataMap dataMap) {
EnvelopedAspect envelopedAspect = new EnvelopedAspect();
envelopedAspect.setValue(new Aspect(dataMap));
return envelopedAspect;
}
private AssertionInfo createDatasetAssertionInfoWithoutNullableFields() {
AssertionInfo info = new AssertionInfo();
info.setType(com.linkedin.assertion.AssertionType.DATASET);
DatasetAssertionInfo datasetAssertionInfo = new DatasetAssertionInfo();
datasetAssertionInfo.setDataset(UrnUtils.getUrn("urn:li:dataset:1"));
datasetAssertionInfo.setScope(DatasetAssertionScope.DATASET_COLUMN);
datasetAssertionInfo.setOperator(AssertionStdOperator.GREATER_THAN);
info.setDatasetAssertion(datasetAssertionInfo);
return info;
}
private AssertionInfo createDatasetAssertionInfoWithNullableFields() {
AssertionInfo infoWithoutNullables = createDatasetAssertionInfoWithoutNullableFields();
DatasetAssertionInfo baseInfo = infoWithoutNullables.getDatasetAssertion();
baseInfo.setFields(
new UrnArray(
Arrays.asList(
UrnUtils.getUrn(
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,name,PROD),field)"))));
baseInfo.setAggregation(AssertionStdAggregation.SUM);
baseInfo.setParameters(createAssertionStdParameters());
baseInfo.setNativeType("native_type");
baseInfo.setNativeParameters(new StringMap(Collections.singletonMap("key", "value")));
baseInfo.setLogic("sample_logic");
infoWithoutNullables.setSource(
new AssertionSource().setType(com.linkedin.assertion.AssertionSourceType.INFERRED));
return infoWithoutNullables;
}
private AssertionInfo createFreshnessAssertionInfoWithoutNullableFields() {
AssertionInfo info = new AssertionInfo();
info.setType(AssertionType.FRESHNESS);
FreshnessAssertionInfo freshnessAssertionInfo = new FreshnessAssertionInfo();
freshnessAssertionInfo.setEntity(
UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,name,PROD)"));
freshnessAssertionInfo.setType(FreshnessAssertionType.DATASET_CHANGE);
info.setFreshnessAssertion(freshnessAssertionInfo);
return info;
}
private AssertionInfo createFreshnessAssertionInfoWithNullableFields() {
AssertionInfo infoWithoutNullables = createFreshnessAssertionInfoWithoutNullableFields();
FreshnessAssertionInfo baseInfo = infoWithoutNullables.getFreshnessAssertion();
baseInfo.setSchedule(createFreshnessAssertionSchedule());
infoWithoutNullables.setSource(
new AssertionSource().setType(com.linkedin.assertion.AssertionSourceType.INFERRED));
return infoWithoutNullables;
}
private AssertionInfo createSchemaAssertion() {
AssertionInfo info = new AssertionInfo();
info.setType(AssertionType.DATA_SCHEMA);
SchemaAssertionInfo schemaAssertionInfo = new SchemaAssertionInfo();
schemaAssertionInfo.setEntity(UrnUtils.getUrn("urn:li:dataset:1"));
schemaAssertionInfo.setCompatibility(SchemaAssertionCompatibility.SUPERSET);
schemaAssertionInfo.setSchema(
new SchemaMetadata()
.setCluster("Test")
.setHash("Test")
.setPlatformSchema(SchemaMetadata.PlatformSchema.create(new MySqlDDL()))
.setFields(
new SchemaFieldArray(
ImmutableList.of(
new SchemaField()
.setType(
new SchemaFieldDataType()
.setType(SchemaFieldDataType.Type.create(new StringType())))
.setNullable(false)
.setNativeDataType("string")
.setFieldPath("test")))));
return info;
}
private AssertionStdParameters createAssertionStdParameters() {
AssertionStdParameters parameters = new AssertionStdParameters();
parameters.setValue(createAssertionStdParameter());
parameters.setMinValue(createAssertionStdParameter());
parameters.setMaxValue(createAssertionStdParameter());
return parameters;
}
private AssertionStdParameter createAssertionStdParameter() {
AssertionStdParameter parameter = new AssertionStdParameter();
parameter.setType(AssertionStdParameterType.NUMBER);
parameter.setValue("100");
return parameter;
}
private FreshnessAssertionSchedule createFreshnessAssertionSchedule() {
FreshnessAssertionSchedule schedule = new FreshnessAssertionSchedule();
schedule.setType(FreshnessAssertionScheduleType.CRON);
schedule.setCron(createCronSchedule());
return schedule;
}
private FreshnessCronSchedule createCronSchedule() {
FreshnessCronSchedule cronSchedule = new FreshnessCronSchedule();
cronSchedule.setCron("0 0 * * *");
cronSchedule.setTimezone("UTC");
return cronSchedule;
}
}

View File

@ -7,6 +7,10 @@ import com.datahub.authentication.Authentication;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.linkedin.assertion.AssertionAction;
import com.linkedin.assertion.AssertionActionArray;
import com.linkedin.assertion.AssertionActionType;
import com.linkedin.assertion.AssertionActions;
import com.linkedin.assertion.AssertionInfo;
import com.linkedin.assertion.AssertionType;
import com.linkedin.common.DataPlatformInstance;
@ -48,6 +52,17 @@ public class AssertionTypeTest {
new DataPlatformInstance()
.setPlatform(new DataPlatformUrn("snowflake"))
.setInstance(null, SetMode.IGNORE_NULL);
// Acryl SaaS Only
private static final AssertionActions TEST_ASSERTION_ACTIONS =
new AssertionActions()
.setOnSuccess(
new AssertionActionArray(
ImmutableList.of(
new AssertionAction().setType(AssertionActionType.RAISE_INCIDENT))))
.setOnFailure(
new AssertionActionArray(
ImmutableList.of(
new AssertionAction().setType(AssertionActionType.RESOLVE_INCIDENT))));
private static final String TEST_ASSERTION_URN_2 = "urn:li:assertion:guid-2";
@ -69,6 +84,9 @@ public class AssertionTypeTest {
assertion1Aspects.put(
Constants.ASSERTION_INFO_ASPECT_NAME,
new EnvelopedAspect().setValue(new Aspect(TEST_ASSERTION_INFO.data())));
assertion1Aspects.put(
Constants.ASSERTION_ACTIONS_ASPECT_NAME,
new EnvelopedAspect().setValue(new Aspect(TEST_ASSERTION_ACTIONS.data())));
Mockito.when(
client.batchGetV2(
any(),
@ -112,6 +130,12 @@ public class AssertionTypeTest {
assertEquals(assertion.getInfo().getType().toString(), AssertionType.DATASET.toString());
assertEquals(assertion.getInfo().getDatasetAssertion(), null);
assertEquals(assertion.getPlatform().getUrn(), "urn:li:dataPlatform:snowflake");
assertEquals(
assertion.getActions().getOnSuccess().get(0).getType(),
com.linkedin.datahub.graphql.generated.AssertionActionType.RAISE_INCIDENT);
assertEquals(
assertion.getActions().getOnFailure().get(0).getType(),
com.linkedin.datahub.graphql.generated.AssertionActionType.RESOLVE_INCIDENT);
// Assert second element is null.
assertNull(result.get(1));

View File

@ -0,0 +1,100 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.assertion.AssertionStdOperator;
import com.linkedin.assertion.FieldAssertionInfo;
import com.linkedin.assertion.FieldAssertionType;
import com.linkedin.assertion.FieldMetricAssertion;
import com.linkedin.assertion.FieldMetricType;
import com.linkedin.assertion.FieldTransform;
import com.linkedin.assertion.FieldTransformType;
import com.linkedin.assertion.FieldValuesAssertion;
import com.linkedin.assertion.FieldValuesFailThreshold;
import com.linkedin.assertion.FieldValuesFailThresholdType;
import com.linkedin.common.urn.Urn;
import com.linkedin.dataset.DatasetFilter;
import com.linkedin.dataset.DatasetFilterType;
import com.linkedin.schema.SchemaFieldSpec;
import org.testng.Assert;
import org.testng.annotations.Test;
public class FieldAssertionMapperTest {
@Test
public void testMapFieldValuesAssertionInfo() throws Exception {
FieldAssertionInfo fieldAssertionInfo =
new FieldAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setFilter(
new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;"))
.setType(FieldAssertionType.FIELD_VALUES)
.setFieldValuesAssertion(
new FieldValuesAssertion()
.setExcludeNulls(true)
.setFailThreshold(
new FieldValuesFailThreshold()
.setType(FieldValuesFailThresholdType.PERCENTAGE)
.setValue(5L))
.setField(
new SchemaFieldSpec()
.setPath("path")
.setType("STRING")
.setNativeType("VARCHAR"))
.setOperator(AssertionStdOperator.IS_TRUE)
.setTransform(new FieldTransform().setType(FieldTransformType.LENGTH)));
com.linkedin.datahub.graphql.generated.FieldAssertionInfo result =
FieldAssertionMapper.mapFieldAssertionInfo(null, fieldAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(), com.linkedin.datahub.graphql.generated.FieldAssertionType.FIELD_VALUES);
Assert.assertEquals(
result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL);
Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;");
Assert.assertEquals(result.getFieldValuesAssertion().getField().getPath(), "path");
Assert.assertEquals(result.getFieldValuesAssertion().getField().getType(), "STRING");
Assert.assertEquals(result.getFieldValuesAssertion().getField().getNativeType(), "VARCHAR");
Assert.assertEquals(
result.getFieldValuesAssertion().getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.IS_TRUE);
Assert.assertEquals(
result.getFieldValuesAssertion().getTransform().getType(),
com.linkedin.datahub.graphql.generated.FieldTransformType.LENGTH);
Assert.assertEquals(result.getFieldValuesAssertion().getExcludeNulls(), true);
Assert.assertEquals(
result.getFieldValuesAssertion().getFailThreshold().getType(),
com.linkedin.datahub.graphql.generated.FieldValuesFailThresholdType.PERCENTAGE);
Assert.assertEquals(
result.getFieldValuesAssertion().getFailThreshold().getValue(), Long.valueOf(5L));
}
@Test
public void testMapFieldMetricAssertionInfo() throws Exception {
FieldAssertionInfo fieldAssertionInfo =
new FieldAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(FieldAssertionType.FIELD_METRIC)
.setFieldMetricAssertion(
new FieldMetricAssertion()
.setField(
new SchemaFieldSpec()
.setPath("path")
.setType("STRING")
.setNativeType("VARCHAR"))
.setOperator(AssertionStdOperator.IS_TRUE)
.setMetric(FieldMetricType.MEDIAN));
com.linkedin.datahub.graphql.generated.FieldAssertionInfo result =
FieldAssertionMapper.mapFieldAssertionInfo(null, fieldAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(), com.linkedin.datahub.graphql.generated.FieldAssertionType.FIELD_METRIC);
Assert.assertEquals(result.getFieldMetricAssertion().getField().getPath(), "path");
Assert.assertEquals(result.getFieldMetricAssertion().getField().getType(), "STRING");
Assert.assertEquals(result.getFieldMetricAssertion().getField().getNativeType(), "VARCHAR");
Assert.assertEquals(
result.getFieldMetricAssertion().getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.IS_TRUE);
Assert.assertEquals(
result.getFieldMetricAssertion().getMetric(),
com.linkedin.datahub.graphql.generated.FieldMetricType.MEDIAN);
}
}

View File

@ -0,0 +1,82 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.assertion.FixedIntervalSchedule;
import com.linkedin.assertion.FreshnessAssertionInfo;
import com.linkedin.assertion.FreshnessAssertionSchedule;
import com.linkedin.assertion.FreshnessAssertionScheduleType;
import com.linkedin.assertion.FreshnessAssertionType;
import com.linkedin.assertion.FreshnessCronSchedule;
import com.linkedin.common.urn.Urn;
import com.linkedin.dataset.DatasetFilter;
import com.linkedin.dataset.DatasetFilterType;
import com.linkedin.timeseries.CalendarInterval;
import org.testng.Assert;
import org.testng.annotations.Test;
public class FreshnessAssertionMapperTest {
@Test
public void testMapCronFreshnessAssertionInfo() throws Exception {
FreshnessAssertionInfo freshnessAssertionInfo =
new FreshnessAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(FreshnessAssertionType.DATASET_CHANGE)
.setFilter(
new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;"))
.setSchedule(
new FreshnessAssertionSchedule()
.setType(FreshnessAssertionScheduleType.CRON)
.setCron(
new FreshnessCronSchedule()
.setCron("0 0 0 * * ? *")
.setTimezone("America/Los_Angeles")
.setWindowStartOffsetMs(10L)));
com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo result =
FreshnessAssertionMapper.mapFreshnessAssertionInfo(null, freshnessAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(),
com.linkedin.datahub.graphql.generated.FreshnessAssertionType.DATASET_CHANGE);
Assert.assertEquals(
result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL);
Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;");
Assert.assertEquals(
result.getSchedule().getType(),
com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType.CRON);
Assert.assertEquals(result.getSchedule().getCron().getCron(), "0 0 0 * * ? *");
Assert.assertEquals(result.getSchedule().getCron().getTimezone(), "America/Los_Angeles");
Assert.assertEquals(result.getSchedule().getCron().getWindowStartOffsetMs(), Long.valueOf(10L));
}
@Test
public void testMapFixedIntervalFreshnessAssertionInfo() throws Exception {
FreshnessAssertionInfo freshnessAssertionInfo =
new FreshnessAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(FreshnessAssertionType.DATASET_CHANGE)
.setFilter(
new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;"))
.setSchedule(
new FreshnessAssertionSchedule()
.setType(FreshnessAssertionScheduleType.FIXED_INTERVAL)
.setFixedInterval(
new FixedIntervalSchedule().setUnit(CalendarInterval.DAY).setMultiple(10)));
com.linkedin.datahub.graphql.generated.FreshnessAssertionInfo result =
FreshnessAssertionMapper.mapFreshnessAssertionInfo(null, freshnessAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(),
com.linkedin.datahub.graphql.generated.FreshnessAssertionType.DATASET_CHANGE);
Assert.assertEquals(
result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL);
Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;");
Assert.assertEquals(
result.getSchedule().getType(),
com.linkedin.datahub.graphql.generated.FreshnessAssertionScheduleType.FIXED_INTERVAL);
Assert.assertEquals(
result.getSchedule().getFixedInterval().getUnit(),
com.linkedin.datahub.graphql.generated.DateInterval.DAY);
Assert.assertEquals(result.getSchedule().getFixedInterval().getMultiple(), 10);
}
}

View File

@ -0,0 +1,78 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.assertion.AssertionStdOperator;
import com.linkedin.assertion.AssertionStdParameter;
import com.linkedin.assertion.AssertionStdParameterType;
import com.linkedin.assertion.AssertionStdParameters;
import com.linkedin.assertion.AssertionValueChangeType;
import com.linkedin.assertion.SqlAssertionInfo;
import com.linkedin.assertion.SqlAssertionType;
import com.linkedin.common.urn.Urn;
import org.testng.Assert;
import org.testng.annotations.Test;
public class SqlAssertionMapperTest {
@Test
public void testMapMetricSqlAssertionInfo() throws Exception {
SqlAssertionInfo sqlAssertionInfo =
new SqlAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(SqlAssertionType.METRIC)
.setStatement("SELECT COUNT(*) FROM foo.bar.baz")
.setOperator(AssertionStdOperator.GREATER_THAN)
.setParameters(
new AssertionStdParameters()
.setValue(
new AssertionStdParameter()
.setType(AssertionStdParameterType.NUMBER)
.setValue(("5"))));
com.linkedin.datahub.graphql.generated.SqlAssertionInfo result =
SqlAssertionMapper.mapSqlAssertionInfo(sqlAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(), com.linkedin.datahub.graphql.generated.SqlAssertionType.METRIC);
Assert.assertEquals(result.getStatement(), "SELECT COUNT(*) FROM foo.bar.baz");
Assert.assertEquals(
result.getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN);
Assert.assertEquals(
result.getParameters().getValue().getType(),
com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER);
Assert.assertEquals(result.getParameters().getValue().getValue(), "5");
}
@Test
public void testMapMetricChangeSqlAssertionInfo() throws Exception {
SqlAssertionInfo sqlAssertionInfo =
new SqlAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(SqlAssertionType.METRIC_CHANGE)
.setStatement("SELECT COUNT(*) FROM foo.bar.baz")
.setChangeType(AssertionValueChangeType.ABSOLUTE)
.setOperator(AssertionStdOperator.GREATER_THAN)
.setParameters(
new AssertionStdParameters()
.setValue(
new AssertionStdParameter()
.setType(AssertionStdParameterType.NUMBER)
.setValue(("5"))));
com.linkedin.datahub.graphql.generated.SqlAssertionInfo result =
SqlAssertionMapper.mapSqlAssertionInfo(sqlAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(), com.linkedin.datahub.graphql.generated.SqlAssertionType.METRIC_CHANGE);
Assert.assertEquals(result.getStatement(), "SELECT COUNT(*) FROM foo.bar.baz");
Assert.assertEquals(
result.getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN);
Assert.assertEquals(
result.getParameters().getValue().getType(),
com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER);
Assert.assertEquals(result.getParameters().getValue().getValue(), "5");
Assert.assertEquals(
result.getChangeType(),
com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE);
}
}

View File

@ -0,0 +1,207 @@
package com.linkedin.datahub.graphql.types.assertion;
import com.linkedin.assertion.AssertionStdOperator;
import com.linkedin.assertion.AssertionStdParameter;
import com.linkedin.assertion.AssertionStdParameterType;
import com.linkedin.assertion.AssertionStdParameters;
import com.linkedin.assertion.AssertionValueChangeType;
import com.linkedin.assertion.IncrementingSegmentFieldTransformer;
import com.linkedin.assertion.IncrementingSegmentFieldTransformerType;
import com.linkedin.assertion.IncrementingSegmentRowCountChange;
import com.linkedin.assertion.IncrementingSegmentRowCountTotal;
import com.linkedin.assertion.RowCountChange;
import com.linkedin.assertion.RowCountTotal;
import com.linkedin.assertion.VolumeAssertionInfo;
import com.linkedin.assertion.VolumeAssertionType;
import com.linkedin.common.urn.Urn;
import com.linkedin.dataset.DatasetFilter;
import com.linkedin.dataset.DatasetFilterType;
import com.linkedin.schema.SchemaFieldSpec;
import org.testng.Assert;
import org.testng.annotations.Test;
public class VolumeAssertionMapperTest {
@Test
public void testMapRowCountTotalVolumeAssertionInfo() throws Exception {
VolumeAssertionInfo volumeAssertionInfo =
new VolumeAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(VolumeAssertionType.ROW_COUNT_TOTAL)
.setFilter(
new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;"))
.setRowCountTotal(
new RowCountTotal()
.setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO)
.setParameters(
new AssertionStdParameters()
.setValue(
new AssertionStdParameter()
.setType(AssertionStdParameterType.NUMBER)
.setValue("10"))));
com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result =
VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(),
com.linkedin.datahub.graphql.generated.VolumeAssertionType.ROW_COUNT_TOTAL);
Assert.assertEquals(
result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL);
Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;");
Assert.assertEquals(
result.getRowCountTotal().getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO);
Assert.assertEquals(
result.getRowCountTotal().getParameters().getValue().getType(),
com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER);
Assert.assertEquals(result.getRowCountTotal().getParameters().getValue().getValue(), "10");
}
@Test
public void testMapRowCountChangeVolumeAssertionInfo() throws Exception {
VolumeAssertionInfo volumeAssertionInfo =
new VolumeAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(VolumeAssertionType.ROW_COUNT_CHANGE)
.setFilter(
new DatasetFilter().setType(DatasetFilterType.SQL).setSql("WHERE value > 5;"))
.setRowCountChange(
new RowCountChange()
.setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO)
.setParameters(
new AssertionStdParameters()
.setValue(
new AssertionStdParameter()
.setType(AssertionStdParameterType.NUMBER)
.setValue("10")))
.setType(AssertionValueChangeType.ABSOLUTE));
com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result =
VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(),
com.linkedin.datahub.graphql.generated.VolumeAssertionType.ROW_COUNT_CHANGE);
Assert.assertEquals(
result.getFilter().getType(), com.linkedin.datahub.graphql.generated.DatasetFilterType.SQL);
Assert.assertEquals(result.getFilter().getSql(), "WHERE value > 5;");
Assert.assertEquals(
result.getRowCountChange().getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO);
Assert.assertEquals(
result.getRowCountChange().getParameters().getValue().getType(),
com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER);
Assert.assertEquals(result.getRowCountChange().getParameters().getValue().getValue(), "10");
Assert.assertEquals(
result.getRowCountChange().getType(),
com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE);
}
@Test
public void testMapIncrementingSegmentRowCountTotalVolumeAssertionInfo() throws Exception {
VolumeAssertionInfo volumeAssertionInfo =
new VolumeAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(VolumeAssertionType.INCREMENTING_SEGMENT_ROW_COUNT_TOTAL)
.setIncrementingSegmentRowCountTotal(
new IncrementingSegmentRowCountTotal()
.setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO)
.setParameters(
new AssertionStdParameters()
.setValue(
new AssertionStdParameter()
.setType(AssertionStdParameterType.NUMBER)
.setValue("10")))
.setSegment(
new com.linkedin.assertion.IncrementingSegmentSpec()
.setField(
new SchemaFieldSpec()
.setPath("path")
.setNativeType("VARCHAR")
.setType("STRING"))
.setTransformer(
new IncrementingSegmentFieldTransformer()
.setType(IncrementingSegmentFieldTransformerType.CEILING)
.setNativeType("CEILING"))));
com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result =
VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(),
com.linkedin.datahub.graphql.generated.VolumeAssertionType
.INCREMENTING_SEGMENT_ROW_COUNT_TOTAL);
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO);
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getParameters().getValue().getType(),
com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER);
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getParameters().getValue().getValue(), "10");
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getSegment().getField().getPath(), "path");
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getSegment().getField().getNativeType(),
"VARCHAR");
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getSegment().getField().getType(), "STRING");
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getSegment().getTransformer().getType(),
com.linkedin.datahub.graphql.generated.IncrementingSegmentFieldTransformerType.CEILING);
Assert.assertEquals(
result.getIncrementingSegmentRowCountTotal().getSegment().getTransformer().getNativeType(),
"CEILING");
}
@Test
public void testMapIncrementingSegmentRowCountChangeVolumeAssertionInfo() throws Exception {
VolumeAssertionInfo volumeAssertionInfo =
new VolumeAssertionInfo()
.setEntity(new Urn("urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)"))
.setType(VolumeAssertionType.INCREMENTING_SEGMENT_ROW_COUNT_CHANGE)
.setIncrementingSegmentRowCountChange(
new IncrementingSegmentRowCountChange()
.setType(AssertionValueChangeType.ABSOLUTE)
.setOperator(AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO)
.setParameters(
new AssertionStdParameters()
.setValue(
new AssertionStdParameter()
.setType(AssertionStdParameterType.NUMBER)
.setValue("10")))
.setSegment(
new com.linkedin.assertion.IncrementingSegmentSpec()
.setField(
new SchemaFieldSpec()
.setPath("path")
.setNativeType("VARCHAR")
.setType("STRING"))));
com.linkedin.datahub.graphql.generated.VolumeAssertionInfo result =
VolumeAssertionMapper.mapVolumeAssertionInfo(null, volumeAssertionInfo);
Assert.assertEquals(result.getEntityUrn(), "urn:li:dataset:(urn:li:dataPlatform:foo,bar,baz)");
Assert.assertEquals(
result.getType(),
com.linkedin.datahub.graphql.generated.VolumeAssertionType
.INCREMENTING_SEGMENT_ROW_COUNT_CHANGE);
Assert.assertEquals(
result.getIncrementingSegmentRowCountChange().getType(),
com.linkedin.datahub.graphql.generated.AssertionValueChangeType.ABSOLUTE);
Assert.assertEquals(
result.getIncrementingSegmentRowCountChange().getOperator(),
com.linkedin.datahub.graphql.generated.AssertionStdOperator.GREATER_THAN_OR_EQUAL_TO);
Assert.assertEquals(
result.getIncrementingSegmentRowCountChange().getParameters().getValue().getType(),
com.linkedin.datahub.graphql.generated.AssertionStdParameterType.NUMBER);
Assert.assertEquals(
result.getIncrementingSegmentRowCountChange().getParameters().getValue().getValue(), "10");
Assert.assertEquals(
result.getIncrementingSegmentRowCountChange().getSegment().getField().getPath(), "path");
Assert.assertEquals(
result.getIncrementingSegmentRowCountChange().getSegment().getField().getNativeType(),
"VARCHAR");
Assert.assertEquals(
result.getIncrementingSegmentRowCountChange().getSegment().getField().getType(), "STRING");
}
}

View File

@ -9,6 +9,7 @@ cat ../../datahub-graphql-core/src/main/resources/app.graphql >> combined.graphq
cat ../../datahub-graphql-core/src/main/resources/auth.graphql >> combined.graphql
cat ../../datahub-graphql-core/src/main/resources/constraints.graphql >> combined.graphql
cat ../../datahub-graphql-core/src/main/resources/entity.graphql >> combined.graphql
cat ../../datahub-graphql-core/src/main/resources/assertions.graphql >> combined.graphql
cat ../../datahub-graphql-core/src/main/resources/ingestion.graphql >> combined.graphql
cat ../../datahub-graphql-core/src/main/resources/recommendation.graphql >> combined.graphql
cat ../../datahub-graphql-core/src/main/resources/search.graphql >> combined.graphql

View File

@ -285,6 +285,7 @@ public class Constants {
public static final String ASSERTION_INFO_ASPECT_NAME = "assertionInfo";
public static final String ASSERTION_RUN_EVENT_ASPECT_NAME = "assertionRunEvent";
public static final String ASSERTION_RUN_EVENT_STATUS_COMPLETE = "COMPLETE";
public static final String ASSERTION_ACTIONS_ASPECT_NAME = "assertionActions";
// Tests
public static final String TEST_ENTITY_NAME = "test";

View File

@ -0,0 +1,76 @@
version: 1
namespace: test-config-id-1
assertions:
# Freshness Assertion
- entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
type: freshness
lookback_interval: "1 hour"
last_modified_field: col_timestamp
schedule:
type: cron
cron: 0 * * * *
meta:
entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
entity_schema:
- col: col_date
native_type: DATE
# Volume Assertion
- type: volume
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
metric: row_count
condition:
type: less_than_or_equal_to
value: 1000
schedule:
type: cron
cron: 0 * * * *
meta:
entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
entity_schema:
- col: col_date
native_type: DATE
# Field Metric Assertion
- type: field
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
field: col_date
metric: null_count
condition:
type: equal_to
value: 0
schedule:
type: cron
cron: 0 * * * *
meta:
entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
entity_schema:
- col: col_date
native_type: DATE
# Field Value Assertion
- type: field
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD)
field: quantity
condition:
type: between
min: 0
max: 10
schedule:
type: on_table_change
meta:
entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT
entity_schema:
- col: quantity
native_type: FLOAT
# Custom SQL Metric Assertion
- type: sql
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD)
statement: select mode(quantity) from test_db.public.purchase_event
condition:
type: equal_to
value: 5
schedule:
type: on_table_change
meta:
entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT
entity_schema:
- col: quantity
native_type: FLOAT

View File

@ -0,0 +1,57 @@
from abc import abstractmethod
from typing import Optional
from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger
from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionInfo
class BaseAssertionProtocol(v1_ConfigModel):
@abstractmethod
def get_id(self) -> str:
pass
@abstractmethod
def get_assertion_info_aspect(
self,
) -> AssertionInfo:
pass
@abstractmethod
def get_assertion_trigger(
self,
) -> Optional[AssertionTrigger]:
pass
class BaseAssertion(v1_ConfigModel):
id_raw: Optional[str] = v1_Field(
default=None,
description="The raw id of the assertion."
"If provided, this is used when creating identifier for this assertion"
"along with assertion type and entity.",
)
id: Optional[str] = v1_Field(
default=None,
description="The id of the assertion."
"If provided, this is used as identifier for this assertion."
"If provided, no other assertion fields are considered to create identifier.",
)
description: Optional[str] = None
# Can contain metadata extracted from datahub. e.g.
# - entity qualified name
# - entity schema
meta: Optional[dict] = None
class BaseEntityAssertion(BaseAssertion):
entity: str = v1_Field(
description="The entity urn that the assertion is associated with"
)
trigger: Optional[AssertionTrigger] = v1_Field(
description="The trigger schedule for assertion", alias="schedule"
)

View File

@ -0,0 +1,41 @@
from typing import List, Optional
from ruamel.yaml import YAML
from typing_extensions import Literal
from datahub.api.entities.assertion.datahub_assertion import DataHubAssertion
from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
class AssertionsConfigSpec(v1_ConfigModel):
"""
Declarative configuration specification for DataHub assertions.
This model is a simpler, Python-native representation used to define assertions.
It can be parsed directly from an equivalent YAML file.
Currently, this is converted into a series of assertion MCPs that can be emitted to DataHub.
In the future, this will invoke the DataHub GraphQL API to upsert assertions.
"""
version: Literal[1]
id: Optional[str] = v1_Field(
default=None,
alias="namespace",
description="Unique identifier of assertions configuration file",
)
assertions: List[DataHubAssertion]
@classmethod
def from_yaml(
cls,
file: str,
) -> "AssertionsConfigSpec":
with open(file) as fp:
yaml = YAML(typ="rt")  # default, if not specified, is 'rt' (round-trip)
orig_dictionary = yaml.load(fp)
parsed_spec = AssertionsConfigSpec.parse_obj(orig_dictionary)
# parsed_spec._original_yaml_dict = orig_dictionary
return parsed_spec
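For illustration, a minimal usage sketch (not part of this diff): the file name is hypothetical, while the model, aspect, and URN helpers are the ones defined in this change.

# Hypothetical usage sketch: load a declarative spec (like the sample YAML above)
# and turn each assertion into its URN and AssertionInfo aspect.
from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec
from datahub.emitter.mce_builder import make_assertion_urn

spec = AssertionsConfigSpec.from_yaml("snowflake_assertions.yml")  # hypothetical path
for assertion in spec.assertions:
    urn = make_assertion_urn(assertion.get_id())
    info = assertion.get_assertion_info_aspect()
    trigger = assertion.get_assertion_trigger()
    print(urn, info.type, trigger)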

View File

@ -0,0 +1,304 @@
import json
from typing import List, Optional, Union
from typing_extensions import Literal, Protocol
from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel
from datahub.metadata.schema_classes import (
AssertionStdOperatorClass,
AssertionStdParameterClass,
AssertionStdParametersClass,
AssertionStdParameterTypeClass,
)
class Operator(Protocol):
"""Specification for an assertion operator.
This class exists only for documentation (it is not used for type checking).
"""
operator: str
def id(self) -> str:
...
def generate_parameters(self) -> AssertionStdParametersClass:
...
def _generate_assertion_std_parameter(
value: Union[str, int, float, list]
) -> AssertionStdParameterClass:
if isinstance(value, str):
return AssertionStdParameterClass(
value=value, type=AssertionStdParameterTypeClass.STRING
)
elif isinstance(value, (int, float)):
return AssertionStdParameterClass(
value=str(value), type=AssertionStdParameterTypeClass.NUMBER
)
elif isinstance(value, list):
return AssertionStdParameterClass(
value=json.dumps(value), type=AssertionStdParameterTypeClass.LIST
)
else:
raise ValueError(
f"Unsupported assertion parameter {value} of type {type(value)}"
)
Param = Union[str, int, float, List[Union[str, float, int]]]
def _generate_assertion_std_parameters(
value: Optional[Param] = None,
min_value: Optional[Param] = None,
max_value: Optional[Param] = None,
) -> AssertionStdParametersClass:
return AssertionStdParametersClass(
value=_generate_assertion_std_parameter(value) if value is not None else None,
minValue=_generate_assertion_std_parameter(min_value) if min_value is not None else None,
maxValue=_generate_assertion_std_parameter(max_value) if max_value is not None else None,
)
class EqualToOperator(v1_ConfigModel):
type: Literal["equal_to"]
value: Union[str, int, float]
operator: str = AssertionStdOperatorClass.EQUAL_TO
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class NotEqualToOperator(v1_ConfigModel):
type: Literal["not_equal_to"]
value: Union[str, int, float]
operator: str = AssertionStdOperatorClass.NOT_EQUAL_TO
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class BetweenOperator(v1_ConfigModel):
type: Literal["between"]
min: Union[int, float]
max: Union[int, float]
operator: str = AssertionStdOperatorClass.BETWEEN
def id(self) -> str:
return f"{self.type}-{self.min}-{self.max}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(
min_value=self.min, max_value=self.max
)
class LessThanOperator(v1_ConfigModel):
type: Literal["less_than"]
value: Union[int, float]
operator: str = AssertionStdOperatorClass.LESS_THAN
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class GreaterThanOperator(v1_ConfigModel):
type: Literal["greater_than"]
value: Union[int, float]
operator: str = AssertionStdOperatorClass.GREATER_THAN
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class LessThanOrEqualToOperator(v1_ConfigModel):
type: Literal["less_than_or_equal_to"]
value: Union[int, float]
operator: str = AssertionStdOperatorClass.LESS_THAN_OR_EQUAL_TO
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class GreaterThanOrEqualToOperator(v1_ConfigModel):
type: Literal["greater_than_or_equal_to"]
value: Union[int, float]
operator: str = AssertionStdOperatorClass.GREATER_THAN_OR_EQUAL_TO
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class InOperator(v1_ConfigModel):
type: Literal["in"]
value: List[Union[str, float, int]]
operator: str = AssertionStdOperatorClass.IN
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class NotInOperator(v1_ConfigModel):
type: Literal["not_in"]
value: List[Union[str, float, int]]
operator: str = AssertionStdOperatorClass.NOT_IN
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class IsNullOperator(v1_ConfigModel):
type: Literal["is_null"]
operator: str = AssertionStdOperatorClass.NULL
def id(self) -> str:
return f"{self.type}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters()
class NotNullOperator(v1_ConfigModel):
type: Literal["is_not_null"]
operator: str = AssertionStdOperatorClass.NOT_NULL
def id(self) -> str:
return f"{self.type}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters()
class IsTrueOperator(v1_ConfigModel):
type: Literal["is_true"]
operator: str = AssertionStdOperatorClass.IS_TRUE
def id(self) -> str:
return f"{self.type}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters()
class IsFalseOperator(v1_ConfigModel):
type: Literal["is_false"]
operator: str = AssertionStdOperatorClass.IS_FALSE
def id(self) -> str:
return f"{self.type}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters()
class ContainsOperator(v1_ConfigModel):
type: Literal["contains"]
value: str
operator: str = AssertionStdOperatorClass.CONTAIN
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class EndsWithOperator(v1_ConfigModel):
type: Literal["ends_with"]
value: str
operator: str = AssertionStdOperatorClass.END_WITH
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class StartsWithOperator(v1_ConfigModel):
type: Literal["starts_with"]
value: str
operator: str = AssertionStdOperatorClass.START_WITH
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
class MatchesRegexOperator(v1_ConfigModel):
type: Literal["matches_regex"]
value: str
operator: str = AssertionStdOperatorClass.REGEX_MATCH
def id(self) -> str:
return f"{self.type}-{self.value}"
def generate_parameters(self) -> AssertionStdParametersClass:
return _generate_assertion_std_parameters(value=self.value)
Operators = Union[
InOperator,
NotInOperator,
EqualToOperator,
NotEqualToOperator,
BetweenOperator,
LessThanOperator,
LessThanOrEqualToOperator,
GreaterThanOperator,
GreaterThanOrEqualToOperator,
IsNullOperator,
NotNullOperator,
IsTrueOperator,
IsFalseOperator,
ContainsOperator,
EndsWithOperator,
StartsWithOperator,
MatchesRegexOperator,
]
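A minimal sketch of how an operator model is used, assuming it runs in the context of this module (the classes above plus the imported AssertionStd* classes); a YAML condition block such as {type: between, min: 0, max: 10} parses into one of these operators.

# Illustrative sketch: an operator knows its AssertionStdOperator value and how to
# render AssertionStdParameters for the aspect.
between = BetweenOperator(type="between", min=0, max=10)
assert between.operator == AssertionStdOperatorClass.BETWEEN
params = between.generate_parameters()
assert params.maxValue.type == AssertionStdParameterTypeClass.NUMBER
assert params.maxValue.value == "10"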

View File

@ -0,0 +1,52 @@
from datetime import timedelta
from typing import Union
import humanfriendly
from typing_extensions import Literal
from datahub.configuration.pydantic_migration_helpers import (
v1_ConfigModel,
v1_Field,
v1_validator,
)
class CronTrigger(v1_ConfigModel):
type: Literal["cron"]
cron: str = v1_Field(
description="The cron expression to use. See https://crontab.guru/ for help."
)
timezone: str = v1_Field(
"UTC",
description="The timezone to use for the cron schedule. Defaults to UTC.",
)
class IntervalTrigger(v1_ConfigModel):
type: Literal["interval"]
interval: timedelta
@v1_validator("interval", pre=True)
def lookback_interval_to_timedelta(cls, v):
if isinstance(v, str):
seconds = humanfriendly.parse_timespan(v)
return timedelta(seconds=seconds)
raise ValueError("Invalid value.")
class EntityChangeTrigger(v1_ConfigModel):
type: Literal["on_table_change"]
class ManualTrigger(v1_ConfigModel):
type: Literal["manual"]
class AssertionTrigger(v1_ConfigModel):
__root__: Union[
CronTrigger, IntervalTrigger, EntityChangeTrigger, ManualTrigger
] = v1_Field(discriminator="type")
@property
def trigger(self):
return self.__root__
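A minimal sketch of parsing the YAML "schedule" block into a trigger, assuming it runs in the context of this module and using the pydantic v1 parse_obj API already used elsewhere in this change.

# Illustrative sketch: the discriminated union picks the trigger class from "type".
cron = AssertionTrigger.parse_obj({"type": "cron", "cron": "0 * * * *", "timezone": "UTC"})
assert isinstance(cron.trigger, CronTrigger)

interval = AssertionTrigger.parse_obj({"type": "interval", "interval": "1 hour"})
assert interval.trigger.interval == timedelta(hours=1)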

View File

@ -0,0 +1,81 @@
from abc import abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Literal
from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec
from datahub.ingestion.api.report import Report
from datahub.utilities.lossy_collections import LossyDict, LossyList
class StrEnum(str, Enum):
pass
class CompileResultArtifactType(StrEnum):
SQL_QUERIES = "SQL_QUERIES"
COMPILE_REPORT = "COMPILE_REPORT"
@dataclass
class CompileResultArtifact(Report):
name: str
type: CompileResultArtifactType
path: Path
description: str
@dataclass
class AssertionCompilationReport(Report):
"""Additional details to debug compilation"""
num_processed: int = 0
num_compile_succeeded: int = 0
num_compile_failed: int = 0  # Likely due to the assertion not being supported on the platform
warnings: LossyDict[str, LossyList[str]] = field(default_factory=LossyDict)
failures: LossyDict[str, LossyList[str]] = field(default_factory=LossyDict)
artifacts: List[Path] = field(default_factory=list)
def report_warning(self, key: str, reason: str) -> None:
warnings = self.warnings.get(key, LossyList())
warnings.append(reason)
self.warnings[key] = warnings
def report_failure(self, key: str, reason: str) -> None:
failures = self.failures.get(key, LossyList())
failures.append(reason)
self.failures[key] = failures
@dataclass
class AssertionCompilationResult:
"""Results of compilation step , along with detailed report object"""
platform: str
status: Literal["success", "failure"]
report: AssertionCompilationReport = field(
default_factory=AssertionCompilationReport
)
artifacts: List[CompileResultArtifact] = field(default_factory=list)
def add_artifact(self, artifact: CompileResultArtifact) -> None:
self.artifacts.append(artifact)
self.report.artifacts.append(artifact.path)
class AssertionCompiler:
@classmethod
@abstractmethod
def create(cls, output_dir: str, extras: Dict[str, str]) -> "AssertionCompiler":
pass
@abstractmethod
def compile(
self, assertion_config_spec: AssertionsConfigSpec
) -> AssertionCompilationResult:
pass
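A hypothetical skeleton of a platform compiler implementing this interface; the class name and platform string are placeholders, not the actual Snowflake compiler registered via ASSERTION_PLATFORMS elsewhere in this change.

# Hypothetical skeleton, run in the context of this module (Path, Dict, and
# AssertionsConfigSpec are imported above).
class DummyCompiler(AssertionCompiler):
    def __init__(self, output_dir: str):
        # Directory where compiled artifacts (e.g. SQL files) would be written.
        self.output_dir = Path(output_dir)

    @classmethod
    def create(cls, output_dir: str, extras: Dict[str, str]) -> "DummyCompiler":
        return cls(output_dir)

    def compile(
        self, assertion_config_spec: AssertionsConfigSpec
    ) -> AssertionCompilationResult:
        result = AssertionCompilationResult(platform="dummy", status="success")
        for _assertion in assertion_config_spec.assertions:
            # A real compiler would translate the assertion into platform SQL here
            # and report failures for unsupported assertion types.
            result.report.num_processed += 1
            result.report.num_compile_succeeded += 1
        return result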

View File

@ -0,0 +1,35 @@
from typing import Optional, Union
from datahub.api.entities.assertion.assertion import BaseAssertionProtocol
from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger
from datahub.api.entities.assertion.field_assertion import FieldAssertion
from datahub.api.entities.assertion.freshness_assertion import FreshnessAssertion
from datahub.api.entities.assertion.sql_assertion import SQLAssertion
from datahub.api.entities.assertion.volume_assertion import VolumeAssertion
from datahub.configuration.pydantic_migration_helpers import v1_Field
from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionInfo
class DataHubAssertion(BaseAssertionProtocol):
__root__: Union[
FreshnessAssertion,
VolumeAssertion,
SQLAssertion,
FieldAssertion,
# TODO: Add SchemaAssertion
] = v1_Field(discriminator="type")
@property
def assertion(self):
return self.__root__.assertion
def get_assertion_info_aspect(
self,
) -> AssertionInfo:
return self.__root__.get_assertion_info_aspect()
def get_id(self) -> str:
return self.__root__.get_id()
def get_assertion_trigger(self) -> Optional[AssertionTrigger]:
return self.__root__.get_assertion_trigger()

View File

@ -0,0 +1,158 @@
from enum import Enum
from typing import Optional, Union
from typing_extensions import Literal
from datahub.api.entities.assertion.assertion import (
BaseAssertionProtocol,
BaseEntityAssertion,
)
from datahub.api.entities.assertion.assertion_operator import Operators
from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger
from datahub.api.entities.assertion.field_metric import FieldMetric
from datahub.api.entities.assertion.filter import DatasetFilter
from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field
from datahub.emitter.mce_builder import datahub_guid
from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
AssertionInfo,
AssertionType,
FieldAssertionInfo,
FieldAssertionType,
)
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldSpec
from datahub.metadata.schema_classes import (
FieldMetricAssertionClass,
FieldTransformClass,
FieldTransformTypeClass,
FieldValuesAssertionClass,
FieldValuesFailThresholdClass,
FieldValuesFailThresholdTypeClass,
)
class FieldValuesFailThreshold(v1_ConfigModel):
type: Literal["count", "percentage"] = v1_Field(default="count")
value: int = v1_Field(default=0)
def to_field_values_failure_threshold(self) -> FieldValuesFailThresholdClass:
return FieldValuesFailThresholdClass(
type=(
FieldValuesFailThresholdTypeClass.COUNT
if self.type == "count"
else FieldValuesFailThresholdTypeClass.PERCENTAGE
),
value=self.value,
)
class FieldTransform(Enum):
LENGTH = "length"
class FieldValuesAssertion(BaseEntityAssertion):
type: Literal["field"]
field: str
field_transform: Optional[FieldTransform] = v1_Field(default=None)
operator: Operators = v1_Field(discriminator="type", alias="condition")
filters: Optional[DatasetFilter] = v1_Field(default=None)
failure_threshold: FieldValuesFailThreshold = v1_Field(
default=FieldValuesFailThreshold()
)
exclude_nulls: bool = v1_Field(default=True)
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.FIELD,
fieldAssertion=FieldAssertionInfo(
type=FieldAssertionType.FIELD_VALUES,
entity=self.entity,
fieldValuesAssertion=FieldValuesAssertionClass(
field=SchemaFieldSpec(
path=self.field,
type="", # Not required
nativeType="", # Not required
),
operator=self.operator.operator,
parameters=self.operator.generate_parameters(),
failThreshold=self.failure_threshold.to_field_values_failure_threshold(),
excludeNulls=self.exclude_nulls,
transform=(
FieldTransformClass(type=FieldTransformTypeClass.LENGTH)
if self.field_transform == FieldTransform.LENGTH
else None
),
),
),
)
def get_id(self) -> str:
guid_dict = {
"entity": self.entity,
"type": self.type,
"field": self.field,
"operator": str(self.operator.operator),
"id_raw": self.id_raw,
}
return self.id or datahub_guid(guid_dict)
class FieldMetricAssertion(BaseEntityAssertion):
type: Literal["field"]
field: str
operator: Operators = v1_Field(discriminator="type", alias="condition")
metric: FieldMetric
filters: Optional[DatasetFilter] = v1_Field(default=None)
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.FIELD,
fieldAssertion=FieldAssertionInfo(
type=FieldAssertionType.FIELD_METRIC,
entity=self.entity,
fieldMetricAssertion=FieldMetricAssertionClass(
field=SchemaFieldSpec(
path=self.field,
type="", # Not required
nativeType="", # Not required
),
metric=self.metric.name,
operator=self.operator.operator,
parameters=self.operator.generate_parameters(),
),
),
)
def get_id(self) -> str:
guid_dict = {
"entity": self.entity,
"type": self.type,
"field": self.field,
"metric": self.metric.value,
"id_raw": self.id_raw,
}
return self.id or datahub_guid(guid_dict)
class FieldAssertion(BaseAssertionProtocol):
__root__: Union[FieldMetricAssertion, FieldValuesAssertion]
@property
def assertion(self):
return self.__root__
def get_id(self) -> str:
return self.__root__.get_id()
def get_assertion_info_aspect(
self,
) -> AssertionInfo:
return self.__root__.get_assertion_info()
def get_assertion_trigger(self) -> Optional[AssertionTrigger]:
return self.__root__.trigger

View File

@ -0,0 +1,21 @@
from enum import Enum
class FieldMetric(Enum):
UNIQUE_COUNT = "unique_count"
UNIQUE_PERCENTAGE = "unique_percentage"
NULL_COUNT = "null_count"
NULL_PERCENTAGE = "null_percentage"
MIN = "min"
MAX = "max"
MEAN = "mean"
MEDIAN = "median"
STDDEV = "stddev"
NEGATIVE_COUNT = "negative_count"
NEGATIVE_PERCENTAGE = "negative_percentage"
ZERO_COUNT = "zero_count"
ZERO_PERCENTAGE = "zero_percentage"
MIN_LENGTH = "min_length"
MAX_LENGTH = "max_length"
EMPTY_COUNT = "empty_count"
EMPTY_PERCENTAGE = "empty_percentage"

View File

@ -0,0 +1,13 @@
from typing_extensions import Literal
from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel
class SqlFilter(v1_ConfigModel):
type: Literal["sql"]
sql: str
DatasetFilter = SqlFilter
# class DatasetFilter(v1_ConfigModel):
# __root__: Union[SqlFilter] = v1_Field(discriminator="type")

View File

@ -0,0 +1,124 @@
from datetime import timedelta
from enum import Enum
from typing import Optional, Union
import humanfriendly
from typing_extensions import Literal
from datahub.api.entities.assertion.assertion import (
BaseAssertionProtocol,
BaseEntityAssertion,
)
from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger
from datahub.api.entities.assertion.filter import DatasetFilter
from datahub.configuration.pydantic_migration_helpers import v1_Field, v1_validator
from datahub.emitter.mce_builder import datahub_guid
from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
AssertionInfo,
AssertionType,
FixedIntervalSchedule,
FreshnessAssertionInfo,
FreshnessAssertionSchedule,
FreshnessAssertionScheduleType,
FreshnessAssertionType,
FreshnessCronSchedule,
)
from datahub.metadata.com.linkedin.pegasus2avro.timeseries import CalendarInterval
class FreshnessSourceType(Enum):
LAST_MODIFIED_COLUMN = "last_modified_column"
class CronFreshnessAssertion(BaseEntityAssertion):
type: Literal["freshness"]
freshness_type: Literal["cron"]
cron: str = v1_Field(
description="The cron expression to use. See https://crontab.guru/ for help."
)
timezone: str = v1_Field(
"UTC",
description="The timezone to use for the cron schedule. Defaults to UTC.",
)
source_type: FreshnessSourceType = v1_Field(
default=FreshnessSourceType.LAST_MODIFIED_COLUMN
)
last_modified_field: str
filters: Optional[DatasetFilter] = v1_Field(default=None)
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.FRESHNESS,
freshnessAssertion=FreshnessAssertionInfo(
type=FreshnessAssertionType.DATASET_CHANGE,
entity=self.entity,
schedule=FreshnessAssertionSchedule(
type=FreshnessAssertionScheduleType.CRON,
cron=FreshnessCronSchedule(cron=self.cron, timezone=self.timezone),
),
),
)
class FixedIntervalFreshnessAssertion(BaseEntityAssertion):
type: Literal["freshness"]
freshness_type: Literal["interval"] = v1_Field(default="interval")
lookback_interval: timedelta
filters: Optional[DatasetFilter] = v1_Field(default=None)
source_type: FreshnessSourceType = v1_Field(
default=FreshnessSourceType.LAST_MODIFIED_COLUMN
)
last_modified_field: str
@v1_validator("lookback_interval", pre=True)
def lookback_interval_to_timedelta(cls, v):
if isinstance(v, str):
seconds = humanfriendly.parse_timespan(v)
return timedelta(seconds=seconds)
raise ValueError("Invalid value.")
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.FRESHNESS,
freshnessAssertion=FreshnessAssertionInfo(
type=FreshnessAssertionType.DATASET_CHANGE,
entity=self.entity,
schedule=FreshnessAssertionSchedule(
type=FreshnessAssertionScheduleType.FIXED_INTERVAL,
fixedInterval=FixedIntervalSchedule(
unit=CalendarInterval.SECOND,
multiple=int(self.lookback_interval.total_seconds()),
),
),
),
)
class FreshnessAssertion(BaseAssertionProtocol):
__root__: Union[FixedIntervalFreshnessAssertion, CronFreshnessAssertion]
@property
def assertion(self):
return self.__root__
def get_id(self) -> str:
guid_dict = {
"entity": self.__root__.entity,
"type": self.__root__.type,
"id_raw": self.__root__.id_raw,
}
return self.__root__.id or datahub_guid(guid_dict)
def get_assertion_info_aspect(
self,
) -> AssertionInfo:
return self.__root__.get_assertion_info()
def get_assertion_trigger(self) -> Optional[AssertionTrigger]:
return self.__root__.trigger
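For illustration, a sketch mirroring the first entry of the sample YAML config above, assuming it runs in the context of this module; note that the cron block is the evaluation trigger, while the aspect's schedule is the fixed lookback interval.

# Illustrative sketch: parse a freshness assertion directly from YAML-shaped data.
freshness = FreshnessAssertion.parse_obj(
    {
        "type": "freshness",
        "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)",
        "lookback_interval": "1 hour",  # coerced to a timedelta by the validator
        "last_modified_field": "col_timestamp",
        "schedule": {"type": "cron", "cron": "0 * * * *"},
    }
)
info = freshness.get_assertion_info_aspect()
assert info.freshnessAssertion.schedule.type == FreshnessAssertionScheduleType.FIXED_INTERVAL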

View File

@ -0,0 +1,91 @@
from typing import Optional, Union
from typing_extensions import Literal
from datahub.api.entities.assertion.assertion import (
BaseAssertionProtocol,
BaseEntityAssertion,
)
from datahub.api.entities.assertion.assertion_operator import Operators
from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger
from datahub.configuration.pydantic_migration_helpers import v1_Field
from datahub.emitter.mce_builder import datahub_guid
from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
AssertionInfo,
AssertionType,
AssertionValueChangeType,
SqlAssertionInfo,
SqlAssertionType,
)
class SqlMetricAssertion(BaseEntityAssertion):
type: Literal["sql"]
statement: str
operator: Operators = v1_Field(discriminator="type", alias="condition")
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.SQL,
sqlAssertion=SqlAssertionInfo(
type=SqlAssertionType.METRIC,
entity=self.entity,
statement=self.statement,
operator=self.operator.operator,
parameters=self.operator.generate_parameters(),
),
)
class SqlMetricChangeAssertion(BaseEntityAssertion):
type: Literal["sql"]
statement: str
change_type: Literal["absolute", "percentage"]
operator: Operators = v1_Field(discriminator="type", alias="condition")
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.SQL,
sqlAssertion=SqlAssertionInfo(
type=SqlAssertionType.METRIC_CHANGE,
entity=self.entity,
statement=self.statement,
changeType=(
AssertionValueChangeType.ABSOLUTE
if self.change_type == "absolute"
else AssertionValueChangeType.PERCENTAGE
),
operator=self.operator.operator,
parameters=self.operator.generate_parameters(),
),
)
class SQLAssertion(BaseAssertionProtocol):
__root__: Union[SqlMetricAssertion, SqlMetricChangeAssertion] = v1_Field()
@property
def assertion(self):
return self.__root__
def get_id(self) -> str:
guid_dict = {
"entity": self.__root__.entity,
"type": self.__root__.type,
"id_raw": self.__root__.id_raw,
}
return self.__root__.id or datahub_guid(guid_dict)
def get_assertion_info_aspect(
self,
) -> AssertionInfo:
return self.__root__.get_assertion_info()
def get_assertion_trigger(self) -> Optional[AssertionTrigger]:
return self.__root__.trigger

View File

@ -0,0 +1,98 @@
from typing import Optional, Union
from typing_extensions import Literal
from datahub.api.entities.assertion.assertion import (
BaseAssertionProtocol,
BaseEntityAssertion,
)
from datahub.api.entities.assertion.assertion_operator import Operators
from datahub.api.entities.assertion.assertion_trigger import AssertionTrigger
from datahub.api.entities.assertion.filter import DatasetFilter
from datahub.configuration.pydantic_migration_helpers import v1_Field
from datahub.emitter.mce_builder import datahub_guid
from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
AssertionInfo,
AssertionType,
AssertionValueChangeType,
RowCountChange,
RowCountTotal,
VolumeAssertionInfo,
VolumeAssertionType,
)
class RowCountTotalVolumeAssertion(BaseEntityAssertion):
type: Literal["volume"]
metric: Literal["row_count"] = v1_Field(default="row_count")
operator: Operators = v1_Field(discriminator="type", alias="condition")
filters: Optional[DatasetFilter] = v1_Field(default=None)
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.VOLUME,
volumeAssertion=VolumeAssertionInfo(
type=VolumeAssertionType.ROW_COUNT_TOTAL,
entity=self.entity,
rowCountTotal=RowCountTotal(
operator=self.operator.operator,
parameters=self.operator.generate_parameters(),
),
),
)
class RowCountChangeVolumeAssertion(BaseEntityAssertion):
type: Literal["volume"]
metric: Literal["row_count"] = v1_Field(default="row_count")
change_type: Literal["absolute", "percentage"]
operator: Operators = v1_Field(discriminator="type", alias="condition")
filters: Optional[DatasetFilter] = v1_Field(default=None)
def get_assertion_info(
self,
) -> AssertionInfo:
return AssertionInfo(
description=self.description,
type=AssertionType.VOLUME,
volumeAssertion=VolumeAssertionInfo(
type=VolumeAssertionType.ROW_COUNT_CHANGE,
entity=self.entity,
rowCountChange=RowCountChange(
type=(
AssertionValueChangeType.ABSOLUTE
if self.change_type == "absolute"
else AssertionValueChangeType.PERCENTAGE
),
operator=self.operator.operator,
parameters=self.operator.generate_parameters(),
),
),
)
class VolumeAssertion(BaseAssertionProtocol):
__root__: Union[RowCountTotalVolumeAssertion, RowCountChangeVolumeAssertion]
@property
def assertion(self):
return self.__root__
def get_id(self) -> str:
guid_dict = {
"entity": self.__root__.entity,
"type": self.__root__.type,
"id_raw": self.__root__.id_raw,
}
return self.__root__.id or datahub_guid(guid_dict)
def get_assertion_info_aspect(
self,
) -> AssertionInfo:
return self.__root__.get_assertion_info()
def get_assertion_trigger(self) -> Optional[AssertionTrigger]:
return self.__root__.trigger

View File

@ -0,0 +1,151 @@
import logging
import os
from pathlib import Path
from typing import Dict, List, Optional
import click
from click_default_group import DefaultGroup
from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec
from datahub.api.entities.assertion.compiler_interface import (
AssertionCompilationResult,
CompileResultArtifact,
CompileResultArtifactType,
)
from datahub.emitter.mce_builder import make_assertion_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import get_default_graph
from datahub.integrations.assertion.registry import ASSERTION_PLATFORMS
from datahub.telemetry import telemetry
from datahub.upgrade import upgrade
logger = logging.getLogger(__name__)
REPORT_FILE_NAME = "compile_report.json"
@click.group(cls=DefaultGroup, default="upsert")
def assertions() -> None:
"""A group of commands to interact with the Assertion entity in DataHub."""
pass
@assertions.command()
@click.option("-f", "--file", required=True, type=click.Path(exists=True))
@upgrade.check_upgrade
@telemetry.with_telemetry()
def upsert(file: str) -> None:
"""Upsert (create or update) a set of assertions in DataHub."""
assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file)
with get_default_graph() as graph:
for assertion_spec in assertions_spec.assertions:
try:
mcp = MetadataChangeProposalWrapper(
entityUrn=make_assertion_urn(assertion_spec.get_id()),
aspect=assertion_spec.get_assertion_info_aspect(),
)
graph.emit_mcp(mcp)
# TODO: Validate uniqueness of assertion ids. Report if duplicates found.
# TODO: Use upsert graphql endpoints here instead of graph.emit_mcp.
click.secho(f"Update succeeded for urn {mcp.entityUrn}.", fg="green")
except Exception as e:
logger.exception(e)
click.secho(
f"Update failed for {mcp.entityUrn}: {e}",
fg="red",
)
@assertions.command()
@click.option("-f", "--file", required=True, type=click.Path(exists=True))
@click.option("-p", "--platform", required=True, type=str)
@click.option("-o", "--output-to", required=False, type=click.Path(exists=True))
@click.option(
"-x",
"--extras",
required=False,
multiple=True,
default=[],
help="Platform-specific extra key-value inputs in form key=value",
)
@upgrade.check_upgrade
@telemetry.with_telemetry()
def compile(
file: str, platform: str, output_to: Optional[str], extras: List[str]
) -> None:
"""Compile a set of assertions for input assertion platform.
Note that this does not run any code or execute any queries on assertion platform
and only creates artifacts specific to assertion platform that can be executed manually.
In future, we may introduce separate command to automatically apply these compiled changes
in assertion platform. Currently, generated result artifacts are stored in target folder
unless another folder is specified using option `--output-to <folder>`.
"""
if platform not in ASSERTION_PLATFORMS:
click.secho(
f"Platform {platform} is not supported.",
fg="red",
)
return
if output_to is None:
output_to = f"{os.getcwd()}/target"
if not os.path.isdir(output_to):
os.mkdir(output_to)
assertions_spec: AssertionsConfigSpec = AssertionsConfigSpec.from_yaml(file)
try:
compiler = ASSERTION_PLATFORMS[platform].create(
output_dir=output_to, extras=extras_list_to_dict(extras)
)
result = compiler.compile(assertions_spec)
write_report_file(output_to, result)
click.secho("Compile report:", bold=True)
click.echo(result.report.as_string())
if result.status == "failure":
click.secho("Failure", fg="yellow", bold=True)
else:
click.secho("Success", fg="green", bold=True)
except Exception as e:
logger.exception(e)
click.secho(
f"Compile failed: {e}",
fg="red",
)
def write_report_file(output_to: str, result: AssertionCompilationResult) -> None:
report_path = Path(output_to) / REPORT_FILE_NAME
with (report_path).open("w") as f:
result.add_artifact(
CompileResultArtifact(
name=REPORT_FILE_NAME,
path=report_path,
type=CompileResultArtifactType.COMPILE_REPORT,
description="Detailed report about compile status",
)
)
f.write(result.report.as_json())
def extras_list_to_dict(extras: List[str]) -> Dict[str, str]:
extra_properties: Dict[str, str] = dict()
for x in extras:
parts = x.split("=")
assert (
len(parts) == 2
), f"Invalid value for extras {x}, should be in format key=value"
extra_properties[parts[0]] = parts[1]
return extra_properties
# TODO: support for
# Immediate:
# 1. delete assertions (from datahub)
# Later:
# 3. execute compiled assertions on assertion platform (Later, requires connection details to platform),
# 4. cleanup assertions from assertion platform (generate artifacts. optionally execute)
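For reference, a minimal hedged sketch of driving the new `compile` command through Click's test runner, similar to how the unit test in this change invokes it via `run_datahub_cmd`; the spec file `assertions.yml` and the `target` folder are hypothetical:

from click.testing import CliRunner

from datahub.cli.specific.assertions_cli import assertions

runner = CliRunner()
# Compile the spec into Snowflake artifacts; nothing is executed against Snowflake or DataHub.
result = runner.invoke(
    assertions,
    [
        "compile",
        "-f", "assertions.yml",
        "-p", "snowflake",
        "-x", "DMF_SCHEMA=test_db.datahub_dmfs",
        "-o", "target",  # must already exist; omit to default to ./target (created if missing)
    ],
)
print(result.output)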

View File

@ -25,6 +25,7 @@ from datahub.cli.get_cli import get
from datahub.cli.ingest_cli import ingest
from datahub.cli.migrate import migrate
from datahub.cli.put_cli import put
from datahub.cli.specific.assertions_cli import assertions
from datahub.cli.specific.datacontract_cli import datacontract
from datahub.cli.specific.dataproduct_cli import dataproduct
from datahub.cli.specific.dataset_cli import dataset
@ -164,6 +165,7 @@ datahub.add_command(dataset)
datahub.add_command(properties)
datahub.add_command(forms)
datahub.add_command(datacontract)
datahub.add_command(assertions)
try:
from datahub.cli.lite_cli import lite

View File

@ -0,0 +1,129 @@
import logging
from datetime import datetime
from typing import Callable, Iterable, List, Optional
from pydantic import BaseModel
from datahub.emitter.mce_builder import (
make_assertion_urn,
make_data_platform_urn,
make_dataplatform_instance_urn,
)
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config
from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery
from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report
from datahub.ingestion.source.snowflake.snowflake_utils import (
SnowflakeCommonMixin,
SnowflakeConnectionMixin,
SnowflakeQueryMixin,
)
from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
AssertionResult,
AssertionResultType,
AssertionRunEvent,
AssertionRunStatus,
)
from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance
from datahub.utilities.time import datetime_to_ts_millis
logger: logging.Logger = logging.getLogger(__name__)
class DataQualityMonitoringResult(BaseModel):
MEASUREMENT_TIME: datetime
METRIC_NAME: str
TABLE_NAME: str
TABLE_SCHEMA: str
TABLE_DATABASE: str
VALUE: int
class SnowflakeAssertionsHandler(
SnowflakeCommonMixin, SnowflakeQueryMixin, SnowflakeConnectionMixin
):
def __init__(
self,
config: SnowflakeV2Config,
report: SnowflakeV2Report,
dataset_urn_builder: Callable[[str], str],
) -> None:
self.config = config
self.report = report
self.logger = logger
self.dataset_urn_builder = dataset_urn_builder
self.connection = None
self._urns_processed: List[str] = []
def get_assertion_workunits(
self, discovered_datasets: List[str]
) -> Iterable[MetadataWorkUnit]:
self.connection = self.create_connection()
if self.connection is None:
return
cur = self.query(
SnowflakeQuery.dmf_assertion_results(
datetime_to_ts_millis(self.config.start_time),
datetime_to_ts_millis(self.config.end_time),
)
)
for db_row in cur:
mcp = self._process_result_row(db_row, discovered_datasets)
if mcp:
yield mcp.as_workunit(is_primary_source=False)
if mcp.entityUrn and mcp.entityUrn not in self._urns_processed:
self._urns_processed.append(mcp.entityUrn)
yield self._gen_platform_instance_wu(mcp.entityUrn)
def _gen_platform_instance_wu(self, urn: str) -> MetadataWorkUnit:
# Construct a MetadataChangeProposalWrapper object for assertion platform
return MetadataChangeProposalWrapper(
entityUrn=urn,
aspect=DataPlatformInstance(
platform=make_data_platform_urn(self.platform),
instance=(
make_dataplatform_instance_urn(
self.platform, self.config.platform_instance
)
if self.config.platform_instance
else None
),
),
).as_workunit(is_primary_source=False)
def _process_result_row(
self, result_row: dict, discovered_datasets: List[str]
) -> Optional[MetadataChangeProposalWrapper]:
try:
result = DataQualityMonitoringResult.parse_obj(result_row)
assertion_guid = result.METRIC_NAME.split("__")[-1].lower()
status = bool(result.VALUE) # 1 if PASS, 0 if FAIL
assertee = self.get_dataset_identifier(
result.TABLE_NAME, result.TABLE_SCHEMA, result.TABLE_DATABASE
)
if assertee in discovered_datasets:
return MetadataChangeProposalWrapper(
entityUrn=make_assertion_urn(assertion_guid),
aspect=AssertionRunEvent(
timestampMillis=datetime_to_ts_millis(result.MEASUREMENT_TIME),
runId=result.MEASUREMENT_TIME.strftime("%Y-%m-%dT%H:%M:%SZ"),
asserteeUrn=self.dataset_urn_builder(assertee),
status=AssertionRunStatus.COMPLETE,
assertionUrn=make_assertion_urn(assertion_guid),
result=AssertionResult(
type=(
AssertionResultType.SUCCESS
if status
else AssertionResultType.FAILURE
)
),
),
)
except Exception as e:
self.report.report_warning("assertion-result-parse-failure", str(e))
return None
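To make the mapping concrete, here is a hedged sketch of how one monitoring-results row (values are hypothetical) is tied back to the assertion that produced it, using the DataQualityMonitoringResult model above:

from datetime import datetime

from datahub.ingestion.source.snowflake.snowflake_assertion import (
    DataQualityMonitoringResult,
)

# Hypothetical row shaped like the output of SnowflakeQuery.dmf_assertion_results(...).
row = {
    "MEASUREMENT_TIME": datetime(2024, 5, 30, 10, 0, 0),
    "METRIC_NAME": "DATAHUB__025CCE4DD4123C0F007908011A9C64D7",
    "TABLE_NAME": "TEST_ASSERTIONS_ALL_TIMES",
    "TABLE_SCHEMA": "PUBLIC",
    "TABLE_DATABASE": "TEST_DB",
    "VALUE": 1,
}
result = DataQualityMonitoringResult.parse_obj(row)
# The compiled DMF is named datahub__<assertion guid>, so the guid is recovered from METRIC_NAME.
assertion_guid = result.METRIC_NAME.split("__")[-1].lower()
# -> "025cce4dd4123c0f007908011a9c64d7", i.e. urn:li:assertion:025cce4dd4123c0f007908011a9c64d7
passed = bool(result.VALUE)  # 1 maps to AssertionResultType.SUCCESS, 0 to FAILURE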

View File

@ -164,6 +164,12 @@ class SnowflakeV2Config(
"username.",
)
include_assertion_results: bool = Field(
default=False,
description="Whether to ingest assertion run results for assertions created using Datahub"
" assertions CLI in snowflake",
)
@validator("convert_urns_to_lowercase")
def validate_convert_urns_to_lowercase(cls, v):
if not v:

View File

@ -1016,3 +1016,26 @@ class SnowflakeQuery:
ORDER BY
h.downstream_table_name
"""
@staticmethod
def dmf_assertion_results(start_time_millis: int, end_time_millis: int) -> str:
pattern = r"datahub\\_\\_%"
escape_pattern = r"\\"
return f"""
SELECT
MEASUREMENT_TIME AS "MEASUREMENT_TIME",
METRIC_NAME AS "METRIC_NAME",
TABLE_NAME AS "TABLE_NAME",
TABLE_SCHEMA AS "TABLE_SCHEMA",
TABLE_DATABASE AS "TABLE_DATABASE",
VALUE::INT AS "VALUE"
FROM
SNOWFLAKE.LOCAL.DATA_QUALITY_MONITORING_RESULTS
WHERE
MEASUREMENT_TIME >= to_timestamp_ltz({start_time_millis}, 3)
AND MEASUREMENT_TIME < to_timestamp_ltz({end_time_millis}, 3)
AND METRIC_NAME ilike '{pattern}' escape '{escape_pattern}'
ORDER BY MEASUREMENT_TIME ASC;
"""

View File

@ -50,6 +50,9 @@ from datahub.ingestion.source.snowflake.constants import (
SnowflakeEdition,
SnowflakeObjectDomain,
)
from datahub.ingestion.source.snowflake.snowflake_assertion import (
SnowflakeAssertionsHandler,
)
from datahub.ingestion.source.snowflake.snowflake_config import (
SnowflakeV2Config,
TagOption,
@ -604,6 +607,11 @@ class SnowflakeV2Source(
) and self.usage_extractor:
yield from self.usage_extractor.get_usage_workunits(discovered_datasets)
if self.config.include_assertion_results:
yield from SnowflakeAssertionsHandler(
self.config, self.report, self.gen_dataset_urn
).get_assertion_workunits(discovered_datasets)
def report_cache_info(self) -> None:
lru_cache_functions: List[Callable] = [
self.data_dictionary.get_tables_for_database,

View File

@ -0,0 +1,61 @@
from functools import lru_cache
from typing import List, Optional, Tuple, TypedDict
from datahub.api.entities.assertion.assertion import BaseEntityAssertion
from datahub.ingestion.graph.client import get_default_graph
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata
from datahub.utilities.urns.urn import Urn
class ColumnDict(TypedDict):
col: str
native_type: str
@lru_cache
def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
with get_default_graph() as graph:
props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties)
if props is not None:
return props.qualifiedName
return None
@lru_cache
def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]:
with get_default_graph() as graph:
schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata)
if schema is not None:
return [
{"col": field.fieldPath, "native_type": field.nativeDataType}
for field in schema.fields
]
return None
def get_entity_name(assertion: BaseEntityAssertion) -> Tuple[str, str, str]:
if assertion.meta and assertion.meta.get("entity_qualified_name"):
parts = assertion.meta["entity_qualified_name"].split(".")
else:
qualified_name = get_qualified_name_from_datahub(assertion.entity)
if qualified_name is not None:
parts = qualified_name.split(".")
else:
urn_id = Urn.create_from_string(assertion.entity).entity_ids[1]
parts = urn_id.split(".")
if len(parts) > 3:
parts = parts[-3:]
assert len(parts) == 3
database = parts[-3]
schema = parts[-2]
table = parts[-1]
return database, schema, table
def get_entity_schema(assertion: BaseEntityAssertion) -> Optional[List[ColumnDict]]:
if assertion.meta and assertion.meta.get("entity_schema"):
return assertion.meta.get("entity_schema")
return get_schema_from_datahub(assertion.entity) or None
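As an illustration of the resolution order above (meta-provided qualified name first, then the qualified name fetched from DataHub, then the dataset URN name), a hedged sketch using a stand-in object rather than a real BaseEntityAssertion:

from types import SimpleNamespace

from datahub.integrations.assertion.common import get_entity_name

# Hypothetical stand-in whose meta carries an explicit qualified name,
# so no round-trip to DataHub is needed.
assertion = SimpleNamespace(
    entity="urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD)",
    meta={"entity_qualified_name": "TEST_DB.PUBLIC.PURCHASE_EVENT"},
)
database, schema, table = get_entity_name(assertion)  # type: ignore[arg-type]
# -> ("TEST_DB", "PUBLIC", "PURCHASE_EVENT")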

View File

@ -0,0 +1,8 @@
from typing import Dict, Type
from datahub.api.entities.assertion.compiler_interface import AssertionCompiler
from datahub.integrations.assertion.snowflake.compiler import SnowflakeAssertionCompiler
ASSERTION_PLATFORMS: Dict[str, Type[AssertionCompiler]] = {
"snowflake": SnowflakeAssertionCompiler
}

View File

@ -0,0 +1,237 @@
import logging
import os
from pathlib import Path
from typing import Dict, Tuple
from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec
from datahub.api.entities.assertion.assertion_operator import LessThanOrEqualToOperator
from datahub.api.entities.assertion.assertion_trigger import (
AssertionTrigger,
CronTrigger,
EntityChangeTrigger,
IntervalTrigger,
)
from datahub.api.entities.assertion.compiler_interface import (
AssertionCompilationResult,
AssertionCompiler,
CompileResultArtifact,
CompileResultArtifactType,
)
from datahub.api.entities.assertion.datahub_assertion import DataHubAssertion
from datahub.api.entities.assertion.field_assertion import FieldValuesAssertion
from datahub.api.entities.assertion.freshness_assertion import (
FixedIntervalFreshnessAssertion,
)
from datahub.emitter.mce_builder import make_assertion_urn
from datahub.integrations.assertion.common import get_entity_name, get_entity_schema
from datahub.integrations.assertion.snowflake.dmf_generator import SnowflakeDMFHandler
from datahub.integrations.assertion.snowflake.field_metric_sql_generator import (
SnowflakeFieldMetricSQLGenerator,
)
from datahub.integrations.assertion.snowflake.field_values_metric_sql_generator import (
SnowflakeFieldValuesMetricSQLGenerator,
)
from datahub.integrations.assertion.snowflake.metric_operator_sql_generator import (
SnowflakeMetricEvalOperatorSQLGenerator,
)
from datahub.integrations.assertion.snowflake.metric_sql_generator import (
SnowflakeMetricSQLGenerator,
)
logger = logging.getLogger(__name__)
DMF_DEFINITIONS_FILE_NAME = "dmf_definitions.sql"
DMF_ASSOCIATIONS_FILE_NAME = "dmf_associations.sql"
DMF_SCHEMA_PROPERTY_KEY = "DMF_SCHEMA"
class SnowflakeAssertionCompiler(AssertionCompiler):
def __init__(self, output_dir: str, extras: Dict[str, str]) -> None:
self.output_dir = Path(output_dir)
self.extras = extras
self.metric_generator = SnowflakeMetricSQLGenerator(
SnowflakeFieldMetricSQLGenerator(), SnowflakeFieldValuesMetricSQLGenerator()
)
self.metric_evaluator = SnowflakeMetricEvalOperatorSQLGenerator()
self.dmf_handler = SnowflakeDMFHandler()
self._entity_schedule_history: Dict[str, AssertionTrigger] = dict()
@classmethod
def create(
cls, output_dir: str, extras: Dict[str, str]
) -> "SnowflakeAssertionCompiler":
assert os.path.exists(
output_dir
), f"Specified location {output_dir} does not exist."
assert os.path.isdir(
output_dir
), f"Specified location {output_dir} is not a folder."
assert any(
x.upper() == DMF_SCHEMA_PROPERTY_KEY for x in extras
), "Must specify value for DMF schema using -x DMF_SCHEMA=<db.schema>"
return SnowflakeAssertionCompiler(output_dir, extras)
def compile(
self, assertion_config_spec: AssertionsConfigSpec
) -> AssertionCompilationResult:
result = AssertionCompilationResult("snowflake", "success")
# TODO: Create/Report permissions sql
dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
with (dmf_definitions_path).open("w") as definitions, (
dmf_associations_path
).open("w") as associations:
for assertion_spec in assertion_config_spec.assertions:
result.report.num_processed += 1
try:
start_line = f"\n-- Start of Assertion {assertion_spec.get_id()}\n"
(dmf_definition, dmf_association) = self.process_assertion(
assertion_spec
)
end_line = f"\n-- End of Assertion {assertion_spec.get_id()}\n"
definitions.write(start_line)
definitions.write(dmf_definition)
definitions.write(end_line)
associations.write(start_line)
associations.write(dmf_association)
associations.write(end_line)
result.report.num_compile_succeeded += 1
except Exception as e:
result.status = "failure"
result.report.report_failure(
assertion_spec.get_id(),
f"Failed to compile assertion of type {assertion_spec.assertion.type} due to error: {e}",
)
result.report.num_compile_failed += 1
if result.report.num_compile_succeeded > 0:
result.add_artifact(
CompileResultArtifact(
name=DMF_DEFINITIONS_FILE_NAME,
path=dmf_definitions_path,
type=CompileResultArtifactType.SQL_QUERIES,
description="SQL file containing DMF create definitions equivalent to Datahub Assertions",
)
)
result.add_artifact(
CompileResultArtifact(
name=DMF_ASSOCIATIONS_FILE_NAME,
path=dmf_associations_path,
type=CompileResultArtifactType.SQL_QUERIES,
description="ALTER TABLE queries to associate DMFs to table to run on configured schedule.",
)
)
return result
def process_assertion(self, assertion: DataHubAssertion) -> Tuple[str, str]:
# TODO: support schema assertion ?
# For freshness assertion, metric is difference in seconds between assertion execution time
# and last time table was updated.
# For field values assertion, metric is number or percentage of rows that do not satisfy
# operator condition.
# For remaining assertions, numeric metric is discernible in assertion definition itself.
metric_definition = self.metric_generator.metric_sql(assertion.assertion)
if isinstance(assertion.assertion, FixedIntervalFreshnessAssertion):
assertion_sql = self.metric_evaluator.operator_sql(
LessThanOrEqualToOperator(
type="less_than_or_equal_to",
value=assertion.assertion.lookback_interval.total_seconds(),
),
metric_definition,
)
elif isinstance(assertion.assertion, FieldValuesAssertion):
assertion_sql = self.metric_evaluator.operator_sql(
LessThanOrEqualToOperator(
type="less_than_or_equal_to",
value=assertion.assertion.failure_threshold.value,
),
metric_definition,
)
else:
assertion_sql = self.metric_evaluator.operator_sql(
assertion.assertion.operator, metric_definition
)
dmf_name = get_dmf_name(assertion)
dmf_schema_name = self.extras[DMF_SCHEMA_PROPERTY_KEY]
args_create_dmf, args_add_dmf = get_dmf_args(assertion)
entity_name = get_entity_name(assertion.assertion)
self._entity_schedule_history.setdefault(
assertion.assertion.entity, assertion.assertion.trigger
)
if (
assertion.assertion.entity in self._entity_schedule_history
and self._entity_schedule_history[assertion.assertion.entity]
!= assertion.assertion.trigger
):
raise ValueError(
"Assertions on same entity must have same schedules as of now."
f" Found different schedules on entity {assertion.assertion.entity} ->"
f" ({self._entity_schedule_history[assertion.assertion.entity].trigger}),"
f" ({assertion.assertion.trigger.trigger})"
)
dmf_schedule = get_dmf_schedule(assertion.assertion.trigger)
dmf_definition = self.dmf_handler.create_dmf(
f"{dmf_schema_name}.{dmf_name}",
args_create_dmf,
assertion.assertion.description
or f"Created via DataHub for assertion {make_assertion_urn(assertion.get_id())} of type {assertion.assertion.type}",
assertion_sql,
)
dmf_association = self.dmf_handler.add_dmf_to_table(
f"{dmf_schema_name}.{dmf_name}",
args_add_dmf,
dmf_schedule,
".".join(entity_name),
)
return dmf_definition, dmf_association
def get_dmf_name(assertion: DataHubAssertion) -> str:
return f"datahub__{assertion.get_id()}"
def get_dmf_args(assertion: DataHubAssertion) -> Tuple[str, str]:
"""Returns Tuple with
- Args used to create DMF
- Args used to add DMF to table"""
# Snowflake does not allow creating a custom data metric
# function without a column argument,
# so we pick any one column from the table's schema.
args_create_dmf = "ARGT TABLE({col_name} {col_type})"
args_add_dmf = "{col_name}"
entity_schema = get_entity_schema(assertion.assertion)
if entity_schema:
for col_dict in entity_schema:
return args_create_dmf.format(
col_name=col_dict["col"], col_type=col_dict["native_type"]
), args_add_dmf.format(col_name=col_dict["col"])
raise ValueError("entity schema not available")
def get_dmf_schedule(trigger: AssertionTrigger) -> str:
if isinstance(trigger.trigger, EntityChangeTrigger):
return "TRIGGER_ON_CHANGES"
elif isinstance(trigger.trigger, CronTrigger):
return f"USING CRON {trigger.trigger.cron} {trigger.trigger.timezone}"
elif isinstance(trigger.trigger, IntervalTrigger):
return f"{trigger.trigger.interval.seconds/60} MIN"
else:
raise ValueError(f"Unsupported trigger type {type(trigger.trigger)}")

View File

@ -0,0 +1,22 @@
class SnowflakeDMFHandler:
def create_dmf(
self, dmf_name: str, dmf_args: str, dmf_comment: str, dmf_sql: str
) -> str:
return f"""
CREATE or REPLACE DATA METRIC FUNCTION
{dmf_name} ({dmf_args})
RETURNS NUMBER
COMMENT = '{dmf_comment}'
AS
$$
{dmf_sql}
$$;
"""
def add_dmf_to_table(
self, dmf_name: str, dmf_col_args: str, dmf_schedule: str, table_identifier: str
) -> str:
return f"""
ALTER TABLE {table_identifier} SET DATA_METRIC_SCHEDULE = '{dmf_schedule}';
ALTER TABLE {table_identifier} ADD DATA METRIC FUNCTION {dmf_name} ON ({dmf_col_args});
"""

View File

@ -0,0 +1,154 @@
from typing import List, Optional
from datahub.api.entities.assertion.field_assertion import FieldMetricAssertion
from datahub.api.entities.assertion.field_metric import FieldMetric
from datahub.integrations.assertion.common import get_entity_name
class SnowflakeFieldMetricSQLGenerator:
def unique_count_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select count(distinct {field_name})
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def unique_percentage_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select count(distinct {field_name})/count(*)
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def null_count_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
where_clause = self._setup_where_clause(
[dataset_filter, f"{field_name} is null"]
)
return f"""select count(*)
from {entity_name} {where_clause}"""
def null_percentage_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select ({self.null_count_sql(field_name, entity_name, dataset_filter)})/count(*)
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def min_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select min({field_name})
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def max_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select max({field_name})
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def mean_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select avg({field_name})
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def median_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select median({field_name})
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def stddev_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select stddev({field_name})
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def negative_count_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
where_clause = self._setup_where_clause([dataset_filter, f"{field_name} < 0"])
return f"""select count(*)
from {entity_name} {where_clause}"""
def negative_percentage_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select ({self.negative_count_sql(field_name, entity_name, dataset_filter)})/count(*)
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def zero_count_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
where_clause = self._setup_where_clause([dataset_filter, f"{field_name} = 0"])
return f"""select count(*)
from {entity_name} {where_clause}"""
def zero_percentage_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select ({self.zero_count_sql(field_name, entity_name, dataset_filter)})/count(*)
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def min_length_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select min(length({field_name}))
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def max_length_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select max(length({field_name}))
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def empty_count_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
where_clause = self._setup_where_clause(
[dataset_filter, f"({field_name} is null or trim({field_name})='')"]
)
return f"""select count(*)
from {entity_name} {where_clause}"""
def empty_percentage_sql(
self, field_name: str, entity_name: str, dataset_filter: Optional[str]
) -> str:
return f"""select ({self.empty_count_sql(field_name, entity_name, dataset_filter)})/count(*)
from {entity_name} {self._setup_where_clause([dataset_filter])}"""
def _setup_where_clause(self, filters: List[Optional[str]]) -> str:
where_clause = " and ".join(f for f in filters if f)
return f"where {where_clause}" if where_clause else ""
def metric_sql(self, assertion: FieldMetricAssertion) -> str:
metric_sql_mapping = {
FieldMetric.UNIQUE_COUNT: self.unique_count_sql,
FieldMetric.UNIQUE_PERCENTAGE: self.unique_percentage_sql,
FieldMetric.NULL_COUNT: self.null_count_sql,
FieldMetric.NULL_PERCENTAGE: self.null_percentage_sql,
FieldMetric.MIN: self.min_sql,
FieldMetric.MAX: self.max_sql,
FieldMetric.MEAN: self.mean_sql,
FieldMetric.MEDIAN: self.median_sql,
FieldMetric.STDDEV: self.stddev_sql,
FieldMetric.NEGATIVE_COUNT: self.negative_count_sql,
FieldMetric.NEGATIVE_PERCENTAGE: self.negative_percentage_sql,
FieldMetric.ZERO_COUNT: self.zero_count_sql,
FieldMetric.ZERO_PERCENTAGE: self.zero_percentage_sql,
FieldMetric.MIN_LENGTH: self.min_length_sql,
FieldMetric.MAX_LENGTH: self.max_length_sql,
FieldMetric.EMPTY_COUNT: self.empty_count_sql,
FieldMetric.EMPTY_PERCENTAGE: self.empty_percentage_sql,
}
entity_name = ".".join(get_entity_name(assertion))
return metric_sql_mapping[assertion.metric](
assertion.field,
entity_name,
(
assertion.filters.sql
if assertion.filters and assertion.filters.sql
else None
),
)
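A hedged example of one of these builders, matching the null-count metric used by the sample field assertion in the golden files:

from datahub.integrations.assertion.snowflake.field_metric_sql_generator import (
    SnowflakeFieldMetricSQLGenerator,
)

generator = SnowflakeFieldMetricSQLGenerator()
sql = generator.null_count_sql(
    field_name="col_date",
    entity_name="TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES",
    dataset_filter=None,
)
# -> select count(*)
#    from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES where col_date is null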

View File

@ -0,0 +1,283 @@
from functools import singledispatchmethod
from typing import List, Optional
from datahub.api.entities.assertion.assertion_operator import (
BetweenOperator,
ContainsOperator,
EndsWithOperator,
EqualToOperator,
GreaterThanOperator,
GreaterThanOrEqualToOperator,
InOperator,
IsFalseOperator,
IsNullOperator,
IsTrueOperator,
LessThanOperator,
LessThanOrEqualToOperator,
MatchesRegexOperator,
NotEqualToOperator,
NotInOperator,
NotNullOperator,
Operators,
StartsWithOperator,
)
from datahub.api.entities.assertion.field_assertion import (
FieldTransform,
FieldValuesAssertion,
)
from datahub.integrations.assertion.common import get_entity_name
class SnowflakeFieldValuesMetricSQLGenerator:
@singledispatchmethod
def values_metric_sql(
self,
operators: Operators,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
"""
Generates SQL that would return boolean value for each table row.
1 if FAIL and 0 if PASS. Note the unusual reversal of 1 and 0.
This is deliberate, as metric represents number of failing rows.
"""
raise ValueError(f"Unsupported values metric operator type {type(operators)} ")
@values_metric_sql.register
def _(
self,
operators: InOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} in {tuple(operators.value)} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: NotInOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} not in {tuple(operators.value)} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: EqualToOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} = {operators.value} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: NotEqualToOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} != {operators.value} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: BetweenOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} between {operators.min} and {operators.max} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: LessThanOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} < {operators.value} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: LessThanOrEqualToOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} <= {operators.value} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: GreaterThanOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} > {operators.value} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: GreaterThanOrEqualToOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} >= {operators.value} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: IsNullOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} is null then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: NotNullOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} is not null then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: IsTrueOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when {transformed_field} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: IsFalseOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when not {transformed_field} then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: ContainsOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when contains({transformed_field},'{operators.value}') then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: StartsWithOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when startswith({transformed_field},'{operators.value}') then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: EndsWithOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when endswith({transformed_field},'{operators.value}') then 0 else 1 end
from {entity_name} {where_clause}"""
@values_metric_sql.register
def _(
self,
operators: MatchesRegexOperator,
entity_name: str,
transformed_field: str,
where_clause: str,
) -> str:
return f"""select case when REGEXP_LIKE({transformed_field},'{operators.value}') then 0 else 1 end
from {entity_name} {where_clause}"""
def _setup_where_clause(self, filters: List[Optional[str]]) -> str:
where_clause = " and ".join(f for f in filters if f)
return f"where {where_clause}" if where_clause else ""
def _setup_field_transform(
self, field: str, transform: Optional[FieldTransform]
) -> str:
if transform is None:
return field
elif transform is FieldTransform.LENGTH:
return f"length({field})"
raise ValueError(f"Unsupported transform type {transform}")
def metric_sql(self, assertion: FieldValuesAssertion) -> str:
"""
Note that the per-row operator is inverted: each row evaluates to 1 when it fails the
expectation, so the resulting metric is the count (or percentage) of invalid rows,
which is then compared against the configured failThreshold.
Args:
assertion (FieldValuesAssertion): the field values assertion to compile.
Returns:
str: SQL computing the number or percentage of failing rows as `metric`.
"""
entity_name = ".".join(get_entity_name(assertion))
dataset_filter = (
assertion.filters.sql
if assertion.filters and assertion.filters.sql
else None
)
where_clause = self._setup_where_clause(
[
dataset_filter,
f"{assertion.field} is not null" if assertion.exclude_nulls else None,
]
)
transformed_field = self._setup_field_transform(
assertion.field, assertion.field_transform
)
# this sql would return boolean value for each table row. 1 if fail and 0 if pass.
sql = self.values_metric_sql(
assertion.operator, entity_name, transformed_field, where_clause
)
# metric would be number of failing rows OR percentage of failing rows.
if assertion.failure_threshold.type == "count":
return f"select sum($1) as metric from ({sql})"
else: # percentage
return f"select sum($1)/count(*) as metric from ({sql})"

View File

@ -0,0 +1,68 @@
from functools import singledispatchmethod
from datahub.api.entities.assertion.assertion_operator import (
BetweenOperator,
EqualToOperator,
GreaterThanOperator,
GreaterThanOrEqualToOperator,
IsFalseOperator,
IsNullOperator,
IsTrueOperator,
LessThanOperator,
LessThanOrEqualToOperator,
NotNullOperator,
Operators,
)
class SnowflakeMetricEvalOperatorSQLGenerator:
@singledispatchmethod
def operator_sql(self, operators: Operators, metric_sql: str) -> str:
"""
Generates Operator SQL that applies operator on `metric`
and returns a numeric boolean value 1 if PASS, 0 if FAIL
"""
raise ValueError(f"Unsupported metric operator type {type(operators)} ")
@operator_sql.register
def _(self, operators: EqualToOperator, metric_sql: str) -> str:
return f"select case when metric={operators.value} then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: BetweenOperator, metric_sql: str) -> str:
return f"select case when metric between {operators.min} and {operators.max} then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: LessThanOperator, metric_sql: str) -> str:
return f"select case when metric < {operators.value} then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: LessThanOrEqualToOperator, metric_sql: str) -> str:
return f"select case when metric <= {operators.value} then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: GreaterThanOperator, metric_sql: str) -> str:
return f"select case when metric > {operators.value} then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: GreaterThanOrEqualToOperator, metric_sql: str) -> str:
return f"select case when metric >= {operators.value} then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: NotNullOperator, metric_sql: str) -> str:
return (
f"select case when metric is not null then 1 else 0 end from ({metric_sql})"
)
@operator_sql.register
def _(self, operators: IsNullOperator, metric_sql: str) -> str:
return f"select case when metric is null then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: IsTrueOperator, metric_sql: str) -> str:
return f"select case when metric then 1 else 0 end from ({metric_sql})"
@operator_sql.register
def _(self, operators: IsFalseOperator, metric_sql: str) -> str:
return f"select case when not metric then 1 else 0 end from ({metric_sql})"

View File

@ -0,0 +1,97 @@
from dataclasses import dataclass
from functools import singledispatchmethod
from datahub.api.entities.assertion.assertion import BaseEntityAssertion
from datahub.api.entities.assertion.field_assertion import (
FieldMetricAssertion,
FieldValuesAssertion,
)
from datahub.api.entities.assertion.freshness_assertion import (
FixedIntervalFreshnessAssertion,
FreshnessSourceType,
)
from datahub.api.entities.assertion.sql_assertion import (
SqlMetricAssertion,
SqlMetricChangeAssertion,
)
from datahub.api.entities.assertion.volume_assertion import (
RowCountChangeVolumeAssertion,
RowCountTotalVolumeAssertion,
)
from datahub.integrations.assertion.common import get_entity_name
from datahub.integrations.assertion.snowflake.field_metric_sql_generator import (
SnowflakeFieldMetricSQLGenerator,
)
from datahub.integrations.assertion.snowflake.field_values_metric_sql_generator import (
SnowflakeFieldValuesMetricSQLGenerator,
)
@dataclass
class SnowflakeMetricSQLGenerator:
field_metric_sql_generator: SnowflakeFieldMetricSQLGenerator
field_values_metric_sql_generator: SnowflakeFieldValuesMetricSQLGenerator
@singledispatchmethod
def metric_sql(
self,
assertion: BaseEntityAssertion,
) -> str:
"""Generates Metric SQL that typically returns a numeric metric"""
raise ValueError(f"Unsupported assertion type {type(assertion)} ")
@metric_sql.register
def _(self, assertion: RowCountChangeVolumeAssertion) -> str:
raise ValueError(f"Unsupported assertion type {type(assertion)} ")
@metric_sql.register
def _(self, assertion: SqlMetricChangeAssertion) -> str:
raise ValueError(f"Unsupported assertion type {type(assertion)} ")
@metric_sql.register
def _(self, assertion: FixedIntervalFreshnessAssertion) -> str:
entity_name = ".".join(get_entity_name(assertion))
if assertion.filters and assertion.filters.sql:
where_clause = f"where {assertion.filters.sql}"
else:
where_clause = ""
if (
assertion.source_type == FreshnessSourceType.LAST_MODIFIED_COLUMN
and assertion.last_modified_field
):
return f"""select timediff(
second,
max({assertion.last_modified_field}::TIMESTAMP_LTZ),
SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME()
) as metric from {entity_name} {where_clause}"""
else:
raise ValueError(
f"Unsupported freshness source type {assertion.source_type} "
)
@metric_sql.register
def _(self, assertion: RowCountTotalVolumeAssertion) -> str:
# Can not use information schema here due to error -
# Data metric function body cannot refer to the non-deterministic function 'CURRENT_DATABASE_MAIN_METASTORE_ID'.
entity_name = ".".join(get_entity_name(assertion))
if assertion.filters and assertion.filters.sql:
where_clause = f"where {assertion.filters.sql}"
else:
where_clause = ""
return f"select count(*) as metric from {entity_name} {where_clause}"
@metric_sql.register
def _(self, assertion: SqlMetricAssertion) -> str:
return f"select $1 as metric from ({assertion.statement})"
@metric_sql.register
def _(self, assertion: FieldMetricAssertion) -> str:
sql = self.field_metric_sql_generator.metric_sql(assertion)
return f"select $1 as metric from ({sql})"
@metric_sql.register
def _(self, assertion: FieldValuesAssertion) -> str:
return self.field_values_metric_sql_generator.metric_sql(assertion)
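To tie the pieces together, a hedged sketch of the freshness path; the assertion is parsed from a dict that mirrors the sample YAML entry in this change, so the field names are assumed to be exactly what the pydantic model accepts:

from datahub.api.entities.assertion.freshness_assertion import (
    FixedIntervalFreshnessAssertion,
)
from datahub.integrations.assertion.snowflake.field_metric_sql_generator import (
    SnowflakeFieldMetricSQLGenerator,
)
from datahub.integrations.assertion.snowflake.field_values_metric_sql_generator import (
    SnowflakeFieldValuesMetricSQLGenerator,
)
from datahub.integrations.assertion.snowflake.metric_sql_generator import (
    SnowflakeMetricSQLGenerator,
)

# Mirrors the freshness entry of the sample assertion spec in this change.
freshness_assertion = FixedIntervalFreshnessAssertion.parse_obj(
    {
        "type": "freshness",
        "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)",
        "lookback_interval": "1 hour",
        "last_modified_field": "col_timestamp",
        "schedule": {"type": "cron", "cron": "0 * * * *"},
        "meta": {"entity_qualified_name": "TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES"},
    }
)
gen = SnowflakeMetricSQLGenerator(
    SnowflakeFieldMetricSQLGenerator(), SnowflakeFieldValuesMetricSQLGenerator()
)
metric_sql = gen.metric_sql(freshness_assertion)
# -> select timediff(
#        second,
#        max(col_timestamp::TIMESTAMP_LTZ),
#        SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME()
#    ) as metric from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
# The compiler wraps this with the lookback check (metric <= 3600 for a "1 hour" interval).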

View File

@ -0,0 +1,76 @@
version: 1
namespace: test-config-id-1
assertions:
# Freshness Assertion
- entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
type: freshness
lookback_interval: "1 hour"
last_modified_field: col_timestamp
schedule:
type: cron
cron: 0 * * * *
meta:
entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
entity_schema:
- col: col_date
native_type: DATE
# Volume Assertion
- type: volume
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
metric: row_count
condition:
type: less_than_or_equal_to
value: 1000
schedule:
type: cron
cron: 0 * * * *
meta:
entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
entity_schema:
- col: col_date
native_type: DATE
# Field Metric Assertion
- type: field
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.test_assertions_all_times,PROD)
field: col_date
metric: null_count
condition:
type: equal_to
value: 0
schedule:
type: cron
cron: 0 * * * *
meta:
entity_qualified_name: TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES
entity_schema:
- col: col_date
native_type: DATE
# Field Value Assertion
- type: field
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD)
field: quantity
condition:
type: between
min: 0
max: 10
schedule:
type: on_table_change
meta:
entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT
entity_schema:
- col: quantity
native_type: FLOAT
# Custom SQL Metric Assertion
- type: sql
entity: urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.public.purchase_event,PROD)
statement: select mode(quantity) from test_db.public.purchase_event
condition:
type: equal_to
value: 5
schedule:
type: on_table_change
meta:
entity_qualified_name: TEST_DB.PUBLIC.PURCHASE_EVENT
entity_schema:
- col: quantity
native_type: FLOAT

View File

@ -0,0 +1,13 @@
from datahub.api.entities.assertion.assertion_config_spec import AssertionsConfigSpec
def test_assertion_config_spec_parses_correct_type(pytestconfig):
config_file = (
pytestconfig.rootpath
/ "tests/unit/api/entities/assertion/test_assertion_config.yml"
)
config_spec = AssertionsConfigSpec.from_yaml(config_file)
assert config_spec.version == 1
assert config_spec.id == "test-config-id-1"
assert len(config_spec.assertions) == 5

View File

@ -0,0 +1,35 @@
-- Start of Assertion 025cce4dd4123c0f007908011a9c64d7
ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC';
ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__025cce4dd4123c0f007908011a9c64d7 ON (col_date);
-- End of Assertion 025cce4dd4123c0f007908011a9c64d7
-- Start of Assertion 5c32eef47bd763fece7d21c7cbf6c659
ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC';
ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__5c32eef47bd763fece7d21c7cbf6c659 ON (col_date);
-- End of Assertion 5c32eef47bd763fece7d21c7cbf6c659
-- Start of Assertion 04be4145bd8de10bed3dfcb0cee57842
ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES SET DATA_METRIC_SCHEDULE = 'USING CRON 0 * * * * UTC';
ALTER TABLE TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__04be4145bd8de10bed3dfcb0cee57842 ON (col_date);
-- End of Assertion 04be4145bd8de10bed3dfcb0cee57842
-- Start of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f
ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT SET DATA_METRIC_SCHEDULE = 'TRIGGER_ON_CHANGES';
ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__b065942d2bca8a4dbe90cc3ec2d9ca9f ON (quantity);
-- End of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f
-- Start of Assertion 170dbd53f28eedbbaba52ebbf189f6b1
ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT SET DATA_METRIC_SCHEDULE = 'TRIGGER_ON_CHANGES';
ALTER TABLE TEST_DB.PUBLIC.PURCHASE_EVENT ADD DATA METRIC FUNCTION test_db.datahub_dmfs.datahub__170dbd53f28eedbbaba52ebbf189f6b1 ON (quantity);
-- End of Assertion 170dbd53f28eedbbaba52ebbf189f6b1

View File

@ -0,0 +1,71 @@
-- Start of Assertion 025cce4dd4123c0f007908011a9c64d7
CREATE or REPLACE DATA METRIC FUNCTION
test_db.datahub_dmfs.datahub__025cce4dd4123c0f007908011a9c64d7 (ARGT TABLE(col_date DATE))
RETURNS NUMBER
COMMENT = 'Created via DataHub for assertion urn:li:assertion:025cce4dd4123c0f007908011a9c64d7 of type freshness'
AS
$$
select case when metric <= 3600 then 1 else 0 end from (select timediff(
second,
max(col_timestamp::TIMESTAMP_LTZ),
SNOWFLAKE.CORE.DATA_METRIC_SCHEDULED_TIME()
) as metric from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES )
$$;
-- End of Assertion 025cce4dd4123c0f007908011a9c64d7
-- Start of Assertion 5c32eef47bd763fece7d21c7cbf6c659
CREATE or REPLACE DATA METRIC FUNCTION
test_db.datahub_dmfs.datahub__5c32eef47bd763fece7d21c7cbf6c659 (ARGT TABLE(col_date DATE))
RETURNS NUMBER
COMMENT = 'Created via DataHub for assertion urn:li:assertion:5c32eef47bd763fece7d21c7cbf6c659 of type volume'
AS
$$
select case when metric <= 1000 then 1 else 0 end from (select count(*) as metric from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES )
$$;
-- End of Assertion 5c32eef47bd763fece7d21c7cbf6c659
-- Start of Assertion 04be4145bd8de10bed3dfcb0cee57842
CREATE or REPLACE DATA METRIC FUNCTION
test_db.datahub_dmfs.datahub__04be4145bd8de10bed3dfcb0cee57842 (ARGT TABLE(col_date DATE))
RETURNS NUMBER
COMMENT = 'Created via DataHub for assertion urn:li:assertion:04be4145bd8de10bed3dfcb0cee57842 of type field'
AS
$$
select case when metric=0 then 1 else 0 end from (select $1 as metric from (select count(*)
from TEST_DB.PUBLIC.TEST_ASSERTIONS_ALL_TIMES where col_date is null))
$$;
-- End of Assertion 04be4145bd8de10bed3dfcb0cee57842
-- Start of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f
CREATE or REPLACE DATA METRIC FUNCTION
test_db.datahub_dmfs.datahub__b065942d2bca8a4dbe90cc3ec2d9ca9f (ARGT TABLE(quantity FLOAT))
RETURNS NUMBER
COMMENT = 'Created via DataHub for assertion urn:li:assertion:b065942d2bca8a4dbe90cc3ec2d9ca9f of type field'
AS
$$
select case when metric <= 0 then 1 else 0 end from (select sum($1) as metric from (select case when quantity between 0 and 10 then 0 else 1 end
from TEST_DB.PUBLIC.PURCHASE_EVENT where quantity is not null))
$$;
-- End of Assertion b065942d2bca8a4dbe90cc3ec2d9ca9f
-- Start of Assertion 170dbd53f28eedbbaba52ebbf189f6b1
CREATE or REPLACE DATA METRIC FUNCTION
test_db.datahub_dmfs.datahub__170dbd53f28eedbbaba52ebbf189f6b1 (ARGT TABLE(quantity FLOAT))
RETURNS NUMBER
COMMENT = 'Created via DataHub for assertion urn:li:assertion:170dbd53f28eedbbaba52ebbf189f6b1 of type sql'
AS
$$
select case when metric=5 then 1 else 0 end from (select $1 as metric from (select mode(quantity) from test_db.public.purchase_event))
$$;
-- End of Assertion 170dbd53f28eedbbaba52ebbf189f6b1

View File

@ -0,0 +1,42 @@
import filecmp
import os
from datahub.integrations.assertion.snowflake.compiler import (
DMF_ASSOCIATIONS_FILE_NAME,
DMF_DEFINITIONS_FILE_NAME,
)
from tests.test_helpers.click_helpers import run_datahub_cmd
def test_compile_assertion_config_spec_for_snowflake(pytestconfig, tmp_path):
config_file = (
pytestconfig.rootpath
/ "tests/unit/api/entities/assertion/test_assertion_config.yml"
).resolve()
golden_file_path = pytestconfig.rootpath / "tests/unit/cli/assertion/"
run_datahub_cmd(
[
"assertions",
"compile",
"-f",
f"{config_file}",
"-p",
"snowflake",
"-x",
"DMF_SCHEMA=test_db.datahub_dmfs",
"-o",
tmp_path,
],
)
output_file_names = [
DMF_DEFINITIONS_FILE_NAME,
DMF_ASSOCIATIONS_FILE_NAME,
]
for file_name in output_file_names:
assert os.path.exists(tmp_path / file_name)
assert filecmp.cmp(
golden_file_path / file_name, tmp_path / file_name
), f"{file_name} is not as expected"

View File

@ -2,6 +2,7 @@ namespace com.linkedin.assertion
import com.linkedin.common.CustomProperties
import com.linkedin.common.ExternalReference
import com.linkedin.common.AuditStamp
/**
* Information about an assertion
@ -66,10 +67,15 @@ record AssertionInfo includes CustomProperties, ExternalReference {
volumeAssertion: optional VolumeAssertionInfo
/**
* A SQL Assertion definition. This field is populated when the type is SQL.
*/
sqlAssertion: optional SqlAssertionInfo
/**
* A Field Assertion definition. This field is populated when the type is FIELD.
*/
fieldAssertion: optional FieldAssertionInfo
/**
* A schema Assertion definition. This field is populated when the type is DATA_SCHEMA
*/
@ -83,6 +89,12 @@ record AssertionInfo includes CustomProperties, ExternalReference {
*/
source: optional AssertionSource
/**
* The time at which the assertion was last updated and the actor who updated it.
* This field is only present for Native assertions updated after this field was introduced.
*/
lastUpdated: optional AuditStamp
/**
* An optional human-readable description of the assertion
*/

View File

@ -33,6 +33,14 @@ record AssertionResultError {
*/
UNSUPPORTED_PLATFORM
/**
* Error while executing a custom SQL assertion
*/
CUSTOM_SQL_ERROR
/**
* Error while executing a field assertion
*/
FIELD_ASSERTION_ERROR
/**
* Unknown error
*/
UNKNOWN_ERROR
@ -42,4 +50,4 @@ record AssertionResultError {
* Additional metadata depending on the type of error
*/
properties: optional map[string, string]
}

View File

@ -1,5 +1,7 @@
namespace com.linkedin.assertion
import com.linkedin.common.AuditStamp
/**
* The source of an assertion
*/
@ -24,4 +26,10 @@ record AssertionSource {
*/
INFERRED
}
/**
* The time at which the assertion was initially created and the author who created it.
* This field is only present for Native assertions created after this field was introduced.
*/
created: optional AuditStamp
}

View File

@ -34,6 +34,16 @@ enum AssertionStdOperator {
*/
EQUAL_TO
/**
* Value being asserted is not equal to value. Requires 'value' parameter.
*/
NOT_EQUAL_TO
/**
* Value being asserted is null. Requires no parameters.
*/
NULL
/**
* Value being asserted is not null. Requires no parameters.
*/
@ -69,6 +79,16 @@ enum AssertionStdOperator {
*/
NOT_IN
/**
* Value being asserted is true. Requires no parameters.
*/
IS_TRUE
/**
* Value being asserted is false. Requires no parameters.
*/
IS_FALSE
/**
* Other
*/

View File

@ -13,10 +13,29 @@ record AssertionStdParameter {
* The type of the parameter
*/
type: enum AssertionStdParameterType {
/**
* A string value
*/
STRING
/**
* A numeric value
*/
NUMBER
/**
* A list of values. When used, value should be formatted as a serialized JSON array.
*/
LIST
/**
* A set of values. When used, value should be formatted as a serialized JSON array.
*/
SET
/**
* A value of unknown type
*/
UNKNOWN
}
}

View File

@ -0,0 +1,57 @@
namespace com.linkedin.assertion
import com.linkedin.common.Urn
import com.linkedin.dataset.DatasetFilter
/**
* Attributes defining a Field Assertion.
**/
record FieldAssertionInfo {
/**
* The type of the field assertion being monitored.
*/
@Searchable = {}
type: enum FieldAssertionType {
/**
* An assertion used to validate the values contained within a field / column given a set of rows.
*/
FIELD_VALUES
/**
* An assertion used to validate the value of a common field / column metric (e.g. aggregation) such as null count + percentage,
* min, max, median, and more.
*/
FIELD_METRIC
}
/**
* The entity targeted by this Field check.
*/
@Searchable = {
"fieldType": "URN"
}
@Relationship = {
"name": "Asserts",
"entityTypes": [ "dataset" ]
}
entity: Urn
/**
* The definition of an assertion that validates individual values of a field / column for a set of rows.
* This type of assertion verifies that each column value meets a particular requirement.
*/
fieldValuesAssertion: optional FieldValuesAssertion
/**
* The definition of an assertion that validates a common metric obtained about a field / column for a set of rows.
* This type of assertion verifies that the value of a high-level metric obtained by aggregating over a column meets
* expectations
*/
fieldMetricAssertion: optional FieldMetricAssertion
/**
* A definition of the specific filters that should be applied, when performing monitoring.
* If not provided, there is no filter, and the full table is under consideration.
* If using DataHub Dataset Profiles as the assertion source type, the value of this field will be ignored.
*/
filter: optional DatasetFilter
}

View File

@ -0,0 +1,39 @@
namespace com.linkedin.assertion
import com.linkedin.schema.SchemaFieldSpec
/**
* Attributes defining a field metric assertion, which asserts an expectation against
* a common metric derived from the set of field / column values, for example:
* max, min, median, null count, null percentage, unique count, unique percentage, and more.
*/
record FieldMetricAssertion {
/**
* The field under evaluation
*/
@Searchable = {
"/path": {
"fieldName": "fieldPath"
}
}
field: SchemaFieldSpec
/**
* The specific metric to assert against. This is the value that
* will be obtained by applying a standard operation, such as an aggregation,
* to the selected field.
*/
metric: FieldMetricType
/**
* The predicate to evaluate against the metric for the field / column.
* Depending on the operator, parameters may be required in order to successfully
* evaluate the assertion against the metric value.
*/
operator: AssertionStdOperator
/**
* Standard parameters required for the assertion. e.g. min_value, max_value, value, columns
*/
parameters: optional AssertionStdParameters
}

View File

@ -0,0 +1,94 @@
namespace com.linkedin.assertion
/**
* A standard metric that can be derived from the set of values
* for a specific field / column of a dataset / table.
*/
enum FieldMetricType {
/**
* The number of unique values found in the column value set
*/
UNIQUE_COUNT
/**
* The percentage of unique values to total rows for the dataset
*/
UNIQUE_PERCENTAGE
/**
* The number of null values found in the column value set
*/
NULL_COUNT
/**
* The percentage of null values to total rows for the dataset
*/
NULL_PERCENTAGE
/**
* The minimum value in the column set (applies to numeric columns)
*/
MIN
/**
* The maximum value in the column set (applies to numeric columns)
*/
MAX
/**
* The mean value found in the column set (applies to numeric columns)
*/
MEAN
/**
* The median value found in the column set (applies to numeric columns)
*/
MEDIAN
/**
* The stddev length found in the column set (applies to numeric columns)
*/
STDDEV
/**
* The number of negative values found in the value set (applies to numeric columns)
*/
NEGATIVE_COUNT
/**
* The percentage of negative values to total rows for the dataset (applies to numeric columns)
*/
NEGATIVE_PERCENTAGE
/**
* The number of zero values found in the value set (applies to numeric columns)
*/
ZERO_COUNT
/**
* The percentage of zero values to total rows for the dataset (applies to numeric columns)
*/
ZERO_PERCENTAGE
/**
* The minimum length found in the column set (applies to string columns)
*/
MIN_LENGTH
/**
* The maximum length found in the column set (applies to string columns)
*/
MAX_LENGTH
/**
* The number of empty string values found in the value set (applies to string columns).
* Note: This is a completely different metric from NULL_COUNT!
*/
EMPTY_COUNT
/**
* The percentage of empty string values to total rows for the dataset (applies to string columns)
* Note: This is a completely different metric from NULL_PERCENTAGE!
*/
EMPTY_PERCENTAGE
}

View File

@ -0,0 +1,21 @@
namespace com.linkedin.assertion
/**
* Definition of a transform applied to the values of a column / field.
* Note that the applicability of a field transform ultimately depends on the native type
* of the field / column.
*
* The model has a single field to permit future extension.
*/
record FieldTransform {
/**
* The type of the field transform, e.g. the transformation
* function / operator to apply.
*/
type: enum FieldTransformType {
/**
* Obtain the length of a string field / column (applicable to string types)
*/
LENGTH
}
}

View File

@ -0,0 +1,83 @@
namespace com.linkedin.assertion
import com.linkedin.schema.SchemaFieldSpec
/**
* Attributes defining a field values assertion, which asserts that the values for a field / column
* of a dataset / table match a set of expectations.
*
* In other words, this type of assertion acts as a semantic constraint applied to the values of a specific field / column.
*
* TODO: We should display the "failed row count" to the user if the column fails the verification rules.
* TODO: Determine whether we need an "operator" that can be applied to the field.
*/
record FieldValuesAssertion {
/**
* The field under evaluation
*/
@Searchable = {
"/path": {
"fieldName": "fieldPath"
}
}
field: SchemaFieldSpec
/**
* An optional transform to apply to field values
* before evaluating the operator.
*
* If none is applied, the field value will be compared as is.
*/
transform: optional FieldTransform
/**
* The predicate to evaluate against a single value of the field.
* Depending on the operator, parameters may be required in order to successfully
* evaluate the assertion against the field value.
*/
operator: AssertionStdOperator
/**
* Standard parameters required for the assertion. e.g. min_value, max_value, value, columns
*/
parameters: optional AssertionStdParameters
/**
* Additional customization about when the assertion
* should be officially considered failing.
*/
failThreshold: record FieldValuesFailThreshold {
/**
* The type of failure threshold. Either based on the number
* of column values (rows) that fail the expectations, or the percentage
* of the total rows under consideration.
*/
type: enum FieldValuesFailThresholdType {
/*
* The maximum number of column values (i.e. rows) that are allowed
* to fail the defined expectations before the assertion officially fails.
*/
COUNT
/*
* The maximum percentage of rows that are allowed
* to fail the defined column expectations before the assertion officially fails.
*/
PERCENTAGE
} = "COUNT"
/**
* By default this is 0, meaning that ALL column values (i.e. rows) must
* meet the defined expectations.
*/
value: long = 0
}
/**
* Whether to exclude null values when running the values assertion (i.e. evaluate
* only non-null values). This applies to operators OTHER than the IS_NULL operator.
*
* Defaults to true, meaning null values are skipped rather than counted as failures.
*/
excludeNulls: boolean = true
}
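
On failThreshold semantics: with the default COUNT type and value 0, a single violating row fails the assertion; with PERCENTAGE and value 5, a scan over 1,000,000 rows tolerates up to 50,000 violating rows. The sketch below ties this together with the FieldTransform defined earlier, asserting that LENGTH(email) falls between 3 and 254 while skipping nulls. As before, the generated class names, the column, and the bounds are assumptions made for illustration.

# A minimal sketch of a values assertion using the LENGTH transform defined above.
from datahub.metadata.schema_classes import (
    AssertionStdOperatorClass,
    AssertionStdParameterClass,
    AssertionStdParametersClass,
    FieldTransformClass,
    FieldTransformTypeClass,
    FieldValuesAssertionClass,
    FieldValuesFailThresholdClass,
    FieldValuesFailThresholdTypeClass,
    SchemaFieldSpecClass,
)

values_assertion = FieldValuesAssertionClass(
    field=SchemaFieldSpecClass(path="email", type="string", nativeType="VARCHAR(255)"),
    # Evaluate the operator against LENGTH(email) rather than the raw string value.
    transform=FieldTransformClass(type=FieldTransformTypeClass.LENGTH),
    operator=AssertionStdOperatorClass.BETWEEN,
    parameters=AssertionStdParametersClass(
        minValue=AssertionStdParameterClass(value="3", type="NUMBER"),
        maxValue=AssertionStdParameterClass(value="254", type="NUMBER"),
    ),
    # Tolerate up to 5% of evaluated rows violating the expectation before failing.
    failThreshold=FieldValuesFailThresholdClass(
        type=FieldValuesFailThresholdTypeClass.PERCENTAGE, value=5
    ),
    # Nulls are skipped by default; set excludeNulls=False to evaluate them like other values.
    excludeNulls=True,
)

This payload would then be placed on a FieldAssertionInfo of type FIELD_VALUES and emitted exactly as in the FIELD_METRIC sketch earlier.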

View File

@ -4,11 +4,13 @@ import com.linkedin.schema.SchemaFieldSpec
/**
* Lightweight spec used for referencing a particular schema field.
**/
* Lightweight spec used for referencing a particular schema field that is used to compute
* a freshness signal or operation.
* TODO: Since this is now leveraged across assertions & metrics / operations, we should consider moving this to a common package.
*/
record FreshnessFieldSpec includes SchemaFieldSpec {
/**
* The type of the field being used to verify the Freshness Assertion.
* The type of the field being used to verify the Freshness of the asset.
*/
kind: optional FreshnessFieldKind
}

View File

@ -25,5 +25,36 @@ record SchemaAssertionInfo {
* Note that many of the fields of this model, especially those related to metadata (tags, terms)
* will go unused in this context.
*/
schema: SchemaMetadata
// @Relationship = {
// "/foreignKeys/*/foreignFields/*": null,
// "/foreignKeys/*/foreignDataset": null,
// "/fields/*/globalTags/tags/*/tag": null,
// "/fields/*/glossaryTerms/terms/*/urn": null
// }
// @Searchable = {
// "/fields/*/fieldPath": null,
// "/fields/*/description": null,
// "/fields/*/label": null,
// "/fields/*/globalTags/tags/*/tag": null,
// "/fields/*/glossaryTerms/terms/*/urn": null
// }
schema: SchemaMetadata
/**
* The required compatibility level for the schema assertion to pass.
*/
compatibility: optional enum SchemaAssertionCompatibility {
/**
* The actual schema must be exactly the same as the expected schema
*/
EXACT_MATCH,
/**
* The actual schema must be a superset of the expected schema
*/
SUPERSET,
/**
* The actual schema must be a subset of the expected schema
*/
SUBSET
} = "EXACT_MATCH"
}
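
To make the three compatibility levels concrete, here is a purely illustrative Python sketch (not DataHub's actual evaluator) that compares only the sets of field paths; a real schema comparison would also consider types and nested fields.

# Purely illustrative: compare expected vs. actual field paths under each compatibility level.
def schema_assertion_passes(expected: set, actual: set, compatibility: str) -> bool:
    if compatibility == "EXACT_MATCH":
        return expected == actual
    if compatibility == "SUPERSET":
        # The actual schema may add columns, but must contain every expected one.
        return expected <= actual
    if compatibility == "SUBSET":
        # The actual schema may omit columns, but must not add unexpected ones.
        return actual <= expected
    raise ValueError(f"Unknown compatibility level: {compatibility}")

expected_paths = {"id", "name"}
actual_paths = {"id", "name", "created_at"}

assert schema_assertion_passes(expected_paths, actual_paths, "SUPERSET")      # extra column allowed
assert not schema_assertion_passes(expected_paths, actual_paths, "EXACT_MATCH")  # extra column fails
assert not schema_assertion_passes(expected_paths, actual_paths, "SUBSET")       # extra column fails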

View File

@ -8,7 +8,7 @@ import com.linkedin.dataset.DatasetFilter
*/
record VolumeAssertionInfo {
/**
* The type of the freshness assertion being monitored.
* The type of the volume assertion being monitored.
*/
@Searchable = {}
type: enum VolumeAssertionType {

View File

@ -12,5 +12,9 @@ record DataQualityContract {
* The assertion representing the Data Quality contract.
* E.g. a table or column-level assertion.
*/
@Relationship = {
"name": "IncludesDataQualityAssertion",
"entityTypes": [ "assertion" ]
}
assertion: Urn
}

View File

@ -9,5 +9,9 @@ record SchemaContract {
/**
* The assertion representing the schema contract.
*/
@Relationship = {
"name": "IncludesSchemaAssertion",
"entityTypes": [ "assertion" ]
}
assertion: Urn
}

View File

@ -22,6 +22,11 @@ record IncidentSource {
* Manually created incident, via UI or API.
*/
MANUAL
/**
* An assertion has failed, triggering the incident.
*/
ASSERTION_FAILURE
}
/**

View File

@ -4,6 +4,36 @@ namespace com.linkedin.incident
* A type of asset incident
*/
enum IncidentType {
/**
* A Freshness Assertion has failed, triggering the incident.
* Raised on entities where assertions are configured to generate incidents.
*/
FRESHNESS
/**
* A Volume Assertion has failed, triggering the incident.
* Raised on entities where assertions are configured to generate incidents.
*/
VOLUME
/**
* A Field Assertion has failed, triggering the incident.
* Raised on entities where assertions are configured to generate incidents.
*/
FIELD
/**
* A raw SQL-statement based assertion has failed, triggering the incident.
* Raised on entities where assertions are configured to generate incidents.
*/
SQL
/**
* A Data Schema assertion has failed, triggering the incident.
* Raised on entities where assertions are configured to generate incidents.
*/
DATA_SCHEMA
/**
* A misc. operational incident, e.g. failure to materialize a dataset.
*/

View File

@ -68,7 +68,7 @@ import org.springframework.context.annotation.Import;
EntityRegistryFactory.class,
DataHubTokenServiceFactory.class,
GitVersionFactory.class,
SiblingGraphServiceFactory.class
SiblingGraphServiceFactory.class,
})
public class GraphQLEngineFactory {
@Autowired