datahub/metadata-ingestion/examples/library/data_quality_mcpw_rest.py
ksrinath 4c24f386a6
feat(model): data quality model (#3787)
Co-authored-by: Ravindra Lanka <rlanka@acryl.io>
Co-authored-by: Mayuri N <mayuri.nehate@gslab.com>
2022-02-02 22:56:08 -08:00

135 lines
4.1 KiB
Python

import json
import time

import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
    AssertionInfo,
    AssertionResult,
    AssertionScope,
    AssertionStdOperator,
    AssertionType,
    BatchAssertionResult,
    DatasetColumnAssertion,
    DatasetColumnStdAggFunc,
)
from datahub.metadata.com.linkedin.pegasus2avro.events.metadata import ChangeType
from datahub.metadata.schema_classes import PartitionSpecClass
def datasetUrn(tbl: str) -> str:
    """Return the dataset URN for table ``tbl`` on the "postgres" platform.

    Args:
        tbl: Table name passed through to ``builder.make_dataset_urn``.

    Returns:
        Whatever ``builder.make_dataset_urn`` produces for that platform/name.
    """
    platform = "postgres"
    urn = builder.make_dataset_urn(platform, tbl)
    return urn
def fldUrn(tbl: str, fld: str) -> str:
    """Return the schemaField URN for column ``fld`` of table ``tbl``.

    Args:
        tbl: Table name; resolved to a dataset URN via ``datasetUrn``.
        fld: Field (column) path inside that dataset.

    Returns:
        A string of the form ``urn:li:schemaField:(<dataset urn>,<fld>)``.
    """
    # No whitespace after the comma: the parts of a tuple URN are separated by
    # a bare ",", so a space here would become part of the field path.
    return f"urn:li:schemaField:({datasetUrn(tbl)},{fld})"
def assertionUrn(info: AssertionInfo) -> str:
    """Return the assertion URN derived from the content of ``info``.

    The id is ``builder.datahub_guid`` applied to ``info.to_obj()``, and is
    then wrapped by ``builder.make_assertion_urn``.

    Args:
        info: The assertion definition to derive the URN from.
    """
    return builder.make_assertion_urn(builder.datahub_guid(info.to_obj()))
def emitAssertionResult(
    assertionResult: AssertionResult, datasetUrn: str
) -> None:
    """Emit ``assertionResult`` as the "assertionResult" aspect of a dataset.

    Args:
        assertionResult: The result aspect to publish.
        datasetUrn: URN of the dataset entity the aspect is attached to.
            Note: inside this function the name shadows the module-level
            ``datasetUrn`` helper.

    The proposal is sent through the module-level ``emitter``, which must
    already be assigned when this function is called.
    """
    # Wrap the result in an UPSERT proposal against the dataset entity and
    # send it straight away (assertionResult is a timeseries aspect).
    emitter.emit_mcp(
        MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeType.UPSERT,
            entityUrn=datasetUrn,
            aspectName="assertionResult",
            aspect=assertionResult,
        )
    )
# Type of the assertion: column-scoped, LESS_THAN operator, IDENTITY
# aggregation, natively named "column_value_is_less_than".
_maxVal_assertion_type = AssertionType(
    scope=AssertionScope.DATASET_COLUMN,
    datasetColumnAssertion=DatasetColumnAssertion(
        stdOperator=AssertionStdOperator.LESS_THAN,
        nativeOperator="column_value_is_less_than",
        stdAggFunc=DatasetColumnStdAggFunc.IDENTITY,
    ),
)

# The assertion itself, targeting column col1 of fooTable with the threshold
# supplied through the "max_value" parameter.
assertion_maxVal = AssertionInfo(
    datasetFields=[fldUrn("fooTable", "col1")],
    datasets=[datasetUrn("fooTable")],
    assertionType=_maxVal_assertion_type,
    assertionParameters={"max_value": "99"},
    customProperties={"suite_name": "demo_suite"},
)

# Wrap the definition in an UPSERT proposal for the "assertionInfo" aspect of
# the assertion entity; the entity URN is derived from the definition.
assertion_maxVal_mcp = MetadataChangeProposalWrapper(
    entityType="assertion",
    changeType=ChangeType.UPSERT,
    entityUrn=assertionUrn(assertion_maxVal),
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
)

# REST emitter pointed at the GMS endpoint; emitAssertionResult also uses it.
emitter = DatahubRestEmitter("http://localhost:8080")

# Publish the assertion definition.
emitter.emit_mcp(assertion_maxVal_mcp)
# Emit one AssertionResult per evaluated batch: two partitions, then the full
# table. Each result points back at the assertion via its URN and is attached
# to the fooTable dataset.
#
# The partition identifier is serialized with json.dumps so the stored string
# is valid JSON; str() on a list of dicts would store the Python repr
# (single-quoted keys/values), which JSON parsers reject.

# Result for the batch of partition 1 (country=IN): assertion passed.
assertionResult_maxVal_batch_partition1 = AssertionResult(
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("fooTable"),
    partitionSpec=PartitionSpecClass(partition=json.dumps([{"country": "IN"}])),
    nativeEvaluatorRunId="uuid1",
    batchAssertionResult=BatchAssertionResult(
        success=True,
        externalUrl="http://example.com/uuid1",
        actualAggValue=90,
    ),
)
emitAssertionResult(
    assertionResult_maxVal_batch_partition1,
    datasetUrn("fooTable"),
)

# Result for the batch of partition 2 (country=US): assertion failed.
assertionResult_maxVal_batch_partition2 = AssertionResult(
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("fooTable"),
    partitionSpec=PartitionSpecClass(partition=json.dumps([{"country": "US"}])),
    nativeEvaluatorRunId="uuid1",
    batchAssertionResult=BatchAssertionResult(
        success=False,
        externalUrl="http://example.com/uuid1",
        actualAggValue=101,
    ),
)
emitAssertionResult(
    assertionResult_maxVal_batch_partition2,
    datasetUrn("fooTable"),
)

# Result for the full-table batch: no partitionSpec is supplied.
assertionResult_maxVal_batch_fulltable = AssertionResult(
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("fooTable"),
    nativeEvaluatorRunId="uuid1",
    batchAssertionResult=BatchAssertionResult(
        success=True,
        externalUrl="http://example.com/uuid1",
        actualAggValue=93,
    ),
)
emitAssertionResult(
    assertionResult_maxVal_batch_fulltable,
    datasetUrn("fooTable"),
)