mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-03 04:10:43 +00:00
feat(model): data quality model (#3787)
Co-authored-by: Ravindra Lanka <rlanka@acryl.io> Co-authored-by: Mayuri N <mayuri.nehate@gslab.com>
This commit is contained in:
parent
ded16809da
commit
4c24f386a6
134
metadata-ingestion/examples/library/data_quality_mcpw_rest.py
Normal file
134
metadata-ingestion/examples/library/data_quality_mcpw_rest.py
Normal file
@ -0,0 +1,134 @@
|
||||
import time
|
||||
|
||||
import datahub.emitter.mce_builder as builder
|
||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||
from datahub.emitter.rest_emitter import DatahubRestEmitter
|
||||
from datahub.metadata.com.linkedin.pegasus2avro.assertion import (
|
||||
AssertionInfo,
|
||||
AssertionResult,
|
||||
AssertionScope,
|
||||
AssertionStdOperator,
|
||||
AssertionType,
|
||||
BatchAssertionResult,
|
||||
DatasetColumnAssertion,
|
||||
DatasetColumnStdAggFunc,
|
||||
)
|
||||
from datahub.metadata.com.linkedin.pegasus2avro.events.metadata import ChangeType
|
||||
from datahub.metadata.schema_classes import PartitionSpecClass
|
||||
|
||||
|
||||
def datasetUrn(tbl: str) -> str:
    """Return the dataset URN for table ``tbl`` on the "postgres" platform.

    Delegates to ``builder.make_dataset_urn``; any argument that helper
    defaults (not visible here) is left at its default.
    """
    platform = "postgres"
    return builder.make_dataset_urn(platform, tbl)
|
||||
|
||||
|
||||
def fldUrn(tbl: str, fld: str) -> str:
    """Return the schemaField URN for column ``fld`` of table ``tbl``.

    The result has the form ``urn:li:schemaField:(<dataset urn>,<field>)``,
    where the dataset URN is whatever ``datasetUrn(tbl)`` returns.
    """
    # No whitespace after the comma: text between the comma and the closing
    # parenthesis is the field component, so a leading space would make the
    # URN name " <fld>" instead of "<fld>" and deviate from the canonical
    # schemaField URN layout.
    return f"urn:li:schemaField:({datasetUrn(tbl)},{fld})"
|
||||
|
||||
|
||||
def assertionUrn(info: AssertionInfo) -> str:
    """Return the assertion URN derived from the content of ``info``.

    The assertion id is ``builder.datahub_guid`` applied to ``info.to_obj()``,
    and that id is turned into a URN by ``builder.make_assertion_urn``.
    """
    return builder.make_assertion_urn(builder.datahub_guid(info.to_obj()))
|
||||
|
||||
|
||||
def emitAssertionResult(
    assertionResult: AssertionResult, datasetUrn: str
) -> None:
    """Emit ``assertionResult`` as the "assertionResult" aspect of a dataset.

    Args:
        assertionResult: The result aspect to attach.
        datasetUrn: URN of the dataset entity that receives the aspect.
            Note that inside this function the parameter hides the
            module-level ``datasetUrn`` helper.

    The proposal is sent through the module-level ``emitter``, which is
    resolved when this function is called and must exist by then.
    """
    # Wrap the result in an UPSERT change proposal aimed at the dataset.
    result_proposal = MetadataChangeProposalWrapper(
        entityUrn=datasetUrn,
        entityType="dataset",
        aspectName="assertionResult",
        aspect=assertionResult,
        changeType=ChangeType.UPSERT,
    )

    # Send the assertion result (described by the example's author as a
    # timeseries aspect).
    emitter.emit_mcp(result_proposal)
|
||||
|
||||
|
||||
# Column-level check used by the assertion below: LESS_THAN operator with
# the IDENTITY aggregation, as reported by the native evaluator under the
# name "column_value_is_less_than".
_maxVal_column_check = DatasetColumnAssertion(
    stdOperator=AssertionStdOperator.LESS_THAN,
    nativeOperator="column_value_is_less_than",
    stdAggFunc=DatasetColumnStdAggFunc.IDENTITY,
)

# The assertion definition itself: scoped to column "col1" of "fooTable",
# with max_value "99" and a "demo_suite" suite name as custom property.
assertion_maxVal = AssertionInfo(
    datasets=[datasetUrn("fooTable")],
    datasetFields=[fldUrn("fooTable", "col1")],
    assertionType=AssertionType(
        scope=AssertionScope.DATASET_COLUMN,
        datasetColumnAssertion=_maxVal_column_check,
    ),
    assertionParameters={"max_value": "99"},
    customProperties={"suite_name": "demo_suite"},
)

# Change proposal that upserts the definition as the "assertionInfo" aspect
# of an assertion entity whose URN is derived from the definition's content.
assertion_maxVal_mcp = MetadataChangeProposalWrapper(
    entityUrn=assertionUrn(assertion_maxVal),
    entityType="assertion",
    aspectName="assertionInfo",
    aspect=assertion_maxVal,
    changeType=ChangeType.UPSERT,
)

# REST emitter for the GMS API at localhost:8080. emitAssertionResult reads
# this module-level name, so it has to be bound before any result is emitted.
emitter = DatahubRestEmitter("http://localhost:8080")

# Send the assertion entity definition.
emitter.emit_mcp(assertion_maxVal_mcp)
|
||||
|
||||
# Result for the first partition batch (country "IN"): the evaluation
# succeeded with an actual aggregate value of 90.
_partition1_outcome = BatchAssertionResult(
    success=True,
    externalUrl="http://example.com/uuid1",
    actualAggValue=90,
)
assertionResult_maxVal_batch_partition1 = AssertionResult(
    # Current wall-clock time in milliseconds.
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("fooTable"),
    # The partition is identified by the str() rendering of this list.
    partitionSpec=PartitionSpecClass(partition=str([{"country": "IN"}])),
    nativeEvaluatorRunId="uuid1",
    batchAssertionResult=_partition1_outcome,
)

# Attach the result to the "fooTable" dataset.
emitAssertionResult(
    assertionResult=assertionResult_maxVal_batch_partition1,
    datasetUrn=datasetUrn("fooTable"),
)
|
||||
|
||||
# Result for the second partition batch (country "US"): the evaluation
# failed with an actual aggregate value of 101.
_partition2_outcome = BatchAssertionResult(
    success=False,
    externalUrl="http://example.com/uuid1",
    actualAggValue=101,
)
assertionResult_maxVal_batch_partition2 = AssertionResult(
    # Current wall-clock time in milliseconds.
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("fooTable"),
    # The partition is identified by the str() rendering of this list.
    partitionSpec=PartitionSpecClass(partition=str([{"country": "US"}])),
    nativeEvaluatorRunId="uuid1",
    batchAssertionResult=_partition2_outcome,
)

# Attach the result to the "fooTable" dataset.
emitAssertionResult(
    assertionResult=assertionResult_maxVal_batch_partition2,
    datasetUrn=datasetUrn("fooTable"),
)
|
||||
|
||||
# Result for the full-table batch: no partitionSpec is passed here, and the
# evaluation succeeded with an actual aggregate value of 93.
_fulltable_outcome = BatchAssertionResult(
    success=True,
    externalUrl="http://example.com/uuid1",
    actualAggValue=93,
)
assertionResult_maxVal_batch_fulltable = AssertionResult(
    # Current wall-clock time in milliseconds.
    timestampMillis=int(time.time() * 1000),
    assertionUrn=assertionUrn(assertion_maxVal),
    asserteeUrn=datasetUrn("fooTable"),
    nativeEvaluatorRunId="uuid1",
    batchAssertionResult=_fulltable_outcome,
)

# Attach the result to the "fooTable" dataset.
emitAssertionResult(
    assertionResult=assertionResult_maxVal_batch_fulltable,
    datasetUrn=datasetUrn("fooTable"),
)
|
||||
@ -1,14 +1,17 @@
|
||||
"""Convenience functions for creating MCEs"""
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from enum import Enum
|
||||
from hashlib import md5
|
||||
from typing import Any, List, Optional, Type, TypeVar, Union, cast, get_type_hints
|
||||
|
||||
import typing_inspect
|
||||
from avrogen.dict_wrapper import DictWrapper
|
||||
|
||||
from datahub.configuration.source_common import DEFAULT_ENV as DEFAULT_ENV_CONFIGURATION
|
||||
from datahub.emitter.serialization_helper import pre_json_transform
|
||||
from datahub.metadata.com.linkedin.pegasus2avro.common import GlossaryTerms
|
||||
from datahub.metadata.schema_classes import (
|
||||
AuditStampClass,
|
||||
@ -82,6 +85,18 @@ def dataset_urn_to_key(dataset_urn: str) -> Optional[DatasetKeyClass]:
|
||||
return None
|
||||
|
||||
|
||||
def datahub_guid(obj: dict) -> str:
    """Return an MD5 hex digest computed from the content of ``obj``.

    ``obj`` is passed through ``pre_json_transform`` and serialized as JSON
    with sorted keys and compact ``","`` / ``":"`` separators. The UTF-8
    bytes of that text are hashed with MD5 and the hex digest is returned.
    """
    canonical_json = json.dumps(
        pre_json_transform(obj),
        sort_keys=True,
        separators=(",", ":"),
    )
    return md5(canonical_json.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def make_assertion_urn(assertion_id: str) -> str:
    """Return the assertion URN ``urn:li:assertion:<assertion_id>``."""
    urn_prefix = "urn:li:assertion:"
    return f"{urn_prefix}{assertion_id}"
|
||||
|
||||
|
||||
def make_user_urn(username: str) -> str:
    """Return the corp-user URN ``urn:li:corpuser:<username>``."""
    urn_prefix = "urn:li:corpuser:"
    return f"{urn_prefix}{username}"
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1117,7 +1117,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"rowCount\": 10, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"10001\", \"10002\", \"10003\", \"10004\", \"10005\", \"10006\", \"10007\", \"10008\", \"10009\", \"10010\"]}, {\"fieldPath\": \"birth_date\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1952-04-19\", \"max\": \"1964-06-02\", \"sampleValues\": [\"1953-09-02\", \"1964-06-02\", \"1959-12-03\", \"1954-05-01\", \"1955-01-21\", \"1953-04-20\", \"1957-05-23\", \"1958-02-19\", \"1952-04-19\", \"1963-06-01\"]}, {\"fieldPath\": \"first_name\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Georgi\", \"Bezalel\", \"Parto\", \"Chirstian\", \"Kyoichi\", \"Anneke\", \"Tzvetan\", \"Saniya\", \"Sumant\", \"Duangkaew\"]}, {\"fieldPath\": \"last_name\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Facello\", \"Simmel\", \"Bamford\", \"Koblick\", \"Maliniak\", \"Preusig\", \"Zielinski\", \"Kalloufi\", \"Peac\", \"Piveteau\"]}, {\"fieldPath\": \"gender\", \"uniqueCount\": 2, \"uniqueProportion\": 0.2, \"nullCount\": 0, \"nullProportion\": 0.0, \"distinctValueFrequencies\": [{\"value\": \"M\", \"frequency\": 5}, {\"value\": \"F\", \"frequency\": 5}], \"sampleValues\": [\"M\", \"F\", \"M\", \"M\", \"M\", \"F\", \"F\", \"M\", \"F\", \"F\"]}, {\"fieldPath\": \"hire_date\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"1994-09-15\", \"sampleValues\": [\"1986-06-26\", \"1985-11-21\", \"1986-08-28\", \"1986-12-01\", \"1989-09-12\", \"1989-06-02\", \"1989-02-10\", \"1994-09-15\", \"1985-02-18\", \"1989-08-24\"]}]}",
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 10, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"10001\", \"10002\", \"10003\", \"10004\", \"10005\", \"10006\", \"10007\", \"10008\", \"10009\", \"10010\"]}, {\"fieldPath\": \"birth_date\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1952-04-19\", \"max\": \"1964-06-02\", \"sampleValues\": [\"1953-09-02\", \"1964-06-02\", \"1959-12-03\", \"1954-05-01\", \"1955-01-21\", \"1953-04-20\", \"1957-05-23\", \"1958-02-19\", \"1952-04-19\", \"1963-06-01\"]}, {\"fieldPath\": \"first_name\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Georgi\", \"Bezalel\", \"Parto\", \"Chirstian\", \"Kyoichi\", \"Anneke\", \"Tzvetan\", \"Saniya\", \"Sumant\", \"Duangkaew\"]}, {\"fieldPath\": \"last_name\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Facello\", \"Simmel\", \"Bamford\", \"Koblick\", \"Maliniak\", \"Preusig\", \"Zielinski\", \"Kalloufi\", \"Peac\", \"Piveteau\"]}, {\"fieldPath\": \"gender\", \"uniqueCount\": 2, \"uniqueProportion\": 0.2, \"nullCount\": 0, \"nullProportion\": 0.0, \"distinctValueFrequencies\": [{\"value\": \"M\", \"frequency\": 5}, {\"value\": \"F\", \"frequency\": 5}], \"sampleValues\": [\"M\", \"F\", \"M\", \"M\", \"M\", \"F\", \"F\", \"M\", \"F\", \"F\"]}, {\"fieldPath\": \"hire_date\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"1994-09-15\", \"sampleValues\": [\"1986-06-26\", \"1985-11-21\", \"1986-08-28\", \"1986-12-01\", \"1989-09-12\", \"1989-06-02\", \"1989-02-10\", \"1994-09-15\", \"1985-02-18\", 
\"1989-08-24\"]}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -1136,7 +1136,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"rowCount\": 112, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 0.08928571428571429, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"10001\", \"max\": \"10010\", \"mean\": \"10005.3125\", \"median\": \"10005.0\", \"stdev\": \"2.834889609688869\", \"distinctValueFrequencies\": [{\"value\": \"10001\", \"frequency\": 17}, {\"value\": \"10002\", \"frequency\": 6}, {\"value\": \"10003\", \"frequency\": 7}, {\"value\": \"10004\", \"frequency\": 16}, {\"value\": \"10005\", \"frequency\": 13}, {\"value\": \"10006\", \"frequency\": 12}, {\"value\": \"10007\", \"frequency\": 14}, {\"value\": \"10008\", \"frequency\": 3}, {\"value\": \"10009\", \"frequency\": 18}, {\"value\": \"10010\", \"frequency\": 6}], \"sampleValues\": [\"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10002\", \"10002\", \"10002\"]}, {\"fieldPath\": \"salary\", \"uniqueCount\": 111, \"uniqueProportion\": 0.9910714285714286, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"40000\", \"max\": \"94692\", \"mean\": \"68303.11607142857\", \"median\": \"69544.0\", \"stdev\": \"15505.291475014095\", \"sampleValues\": [\"60117\", \"62102\", \"66074\", \"66596\", \"66961\", \"71046\", \"74333\", \"75286\", \"75994\", \"76884\", \"80013\", \"81025\", \"81097\", \"84917\", \"85112\", \"85097\", \"88958\", \"65909\", \"65909\", \"67534\"]}, {\"fieldPath\": \"from_date\", \"uniqueCount\": 106, \"uniqueProportion\": 0.9464285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"2002-06-22\", \"sampleValues\": [\"1986-06-26\", \"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", 
\"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"1996-08-03\", \"1997-08-03\", \"1998-08-03\"]}, {\"fieldPath\": \"to_date\", \"uniqueCount\": 99, \"uniqueProportion\": 0.8839285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1986-02-18\", \"max\": \"9999-01-01\", \"sampleValues\": [\"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"9999-01-01\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\"]}]}",
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 112, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 0.08928571428571429, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"10001\", \"max\": \"10010\", \"mean\": \"10005.3125\", \"median\": \"10005.0\", \"stdev\": \"2.834889609688869\", \"distinctValueFrequencies\": [{\"value\": \"10001\", \"frequency\": 17}, {\"value\": \"10002\", \"frequency\": 6}, {\"value\": \"10003\", \"frequency\": 7}, {\"value\": \"10004\", \"frequency\": 16}, {\"value\": \"10005\", \"frequency\": 13}, {\"value\": \"10006\", \"frequency\": 12}, {\"value\": \"10007\", \"frequency\": 14}, {\"value\": \"10008\", \"frequency\": 3}, {\"value\": \"10009\", \"frequency\": 18}, {\"value\": \"10010\", \"frequency\": 6}], \"sampleValues\": [\"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10002\", \"10002\", \"10002\"]}, {\"fieldPath\": \"salary\", \"uniqueCount\": 111, \"uniqueProportion\": 0.9910714285714286, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"40000\", \"max\": \"94692\", \"mean\": \"68303.11607142857\", \"median\": \"69544.0\", \"stdev\": \"15505.291475014095\", \"sampleValues\": [\"60117\", \"62102\", \"66074\", \"66596\", \"66961\", \"71046\", \"74333\", \"75286\", \"75994\", \"76884\", \"80013\", \"81025\", \"81097\", \"84917\", \"85112\", \"85097\", \"88958\", \"65909\", \"65909\", \"67534\"]}, {\"fieldPath\": \"from_date\", \"uniqueCount\": 106, \"uniqueProportion\": 0.9464285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"2002-06-22\", \"sampleValues\": [\"1986-06-26\", \"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", 
\"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"1996-08-03\", \"1997-08-03\", \"1998-08-03\"]}, {\"fieldPath\": \"to_date\", \"uniqueCount\": 99, \"uniqueProportion\": 0.8839285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1986-02-18\", \"max\": \"9999-01-01\", \"sampleValues\": [\"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"9999-01-01\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\"]}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -1155,7 +1155,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"rowCount\": 5, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\", \"4\", \"5\"]}, {\"fieldPath\": \"company\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Company A\", \"Company B\", \"Company C\", \"Company D\", \"Company E\"]}, {\"fieldPath\": \"last_name\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Axen\", \"Bedecs\", \"Donnell\", \"Gratacos Solsona\", \"Lee\"]}, {\"fieldPath\": \"first_name\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Anna\", \"Antonio\", \"Christina\", \"Martin\", \"Thomas\"]}, {\"fieldPath\": \"email_address\", \"uniqueCount\": 0, \"nullCount\": 5, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"priority\", \"uniqueCount\": 3, \"uniqueProportion\": 0.75, \"nullCount\": 1, \"nullProportion\": 0.2, \"min\": \"3.8\", \"max\": \"4.9\", \"mean\": \"4.175000011920929\", \"median\": \"4.0\", \"distinctValueFrequencies\": [{\"value\": \"3.8\", \"frequency\": 1}, {\"value\": \"4.0\", \"frequency\": 2}, {\"value\": \"4.9\", \"frequency\": 1}], \"sampleValues\": [\"4.0\", \"4.9\", \"4.0\", \"3.8\"]}]}",
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 5, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\", \"4\", \"5\"]}, {\"fieldPath\": \"company\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Company A\", \"Company B\", \"Company C\", \"Company D\", \"Company E\"]}, {\"fieldPath\": \"last_name\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Axen\", \"Bedecs\", \"Donnell\", \"Gratacos Solsona\", \"Lee\"]}, {\"fieldPath\": \"first_name\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Anna\", \"Antonio\", \"Christina\", \"Martin\", \"Thomas\"]}, {\"fieldPath\": \"email_address\", \"uniqueCount\": 0, \"nullCount\": 5, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"priority\", \"uniqueCount\": 3, \"uniqueProportion\": 0.75, \"nullCount\": 1, \"nullProportion\": 0.2, \"min\": \"3.8\", \"max\": \"4.9\", \"mean\": \"4.175000011920929\", \"median\": \"4.0\", \"distinctValueFrequencies\": [{\"value\": \"3.8\", \"frequency\": 1}, {\"value\": \"4.0\", \"frequency\": 2}, {\"value\": \"4.9\", \"frequency\": 1}], \"sampleValues\": [\"4.0\", \"4.9\", \"4.0\", \"3.8\"]}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -1174,7 +1174,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"rowCount\": 0, \"columnCount\": 3, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}, {\"fieldPath\": \"description\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}, {\"fieldPath\": \"customer_id\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}]}",
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 0, \"columnCount\": 3, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}, {\"fieldPath\": \"description\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}, {\"fieldPath\": \"customer_id\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -1193,7 +1193,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"rowCount\": 0, \"columnCount\": 1, \"fieldProfiles\": [{\"fieldPath\": \"dummy\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}]}",
|
||||
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 0, \"columnCount\": 1, \"fieldProfiles\": [{\"fieldPath\": \"dummy\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "operation",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:test-name\", \"operationType\": \"INSERT\"}",
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:test-name\", \"operationType\": \"INSERT\"}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -26,7 +26,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "operation",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:real_shirshanka\", \"operationType\": \"INSERT\"}",
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:real_shirshanka\", \"operationType\": \"INSERT\"}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -45,7 +45,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "operation",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:test-name\", \"operationType\": \"DELETE\"}",
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:test-name\", \"operationType\": \"DELETE\"}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -64,7 +64,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "operation",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:real_shirshanka\", \"operationType\": \"DELETE\"}",
|
||||
"value": "{\"timestampMillis\": 1631664000000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"lastUpdatedTimestamp\": 1631664000000, \"actor\": \"urn:li:corpuser:real_shirshanka\", \"operationType\": \"DELETE\"}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -83,7 +83,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetUsageStatistics",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1631577600000, \"eventGranularity\": {\"unit\": \"DAY\", \"multiple\": 1}, \"uniqueUserCount\": 1, \"totalSqlQueries\": 1, \"topSqlQueries\": [\"select userid from users\"], \"userCounts\": [{\"user\": \"urn:li:corpuser:test-name\", \"count\": 1, \"userEmail\": \"test-name@acryl.io\"}], \"fieldCounts\": []}",
|
||||
"value": "{\"timestampMillis\": 1631577600000, \"eventGranularity\": {\"unit\": \"DAY\", \"multiple\": 1}, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"uniqueUserCount\": 1, \"totalSqlQueries\": 1, \"topSqlQueries\": [\"select userid from users\"], \"userCounts\": [{\"user\": \"urn:li:corpuser:test-name\", \"count\": 1, \"userEmail\": \"test-name@acryl.io\"}], \"fieldCounts\": []}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -102,7 +102,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetUsageStatistics",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1631577600000, \"eventGranularity\": {\"unit\": \"DAY\", \"multiple\": 1}, \"uniqueUserCount\": 1, \"totalSqlQueries\": 1, \"topSqlQueries\": [\"select catid from category\"], \"userCounts\": [{\"user\": \"urn:li:corpuser:real_shirshanka\", \"count\": 1, \"userEmail\": \"real_shirshanka@acryl.io\"}], \"fieldCounts\": []}",
|
||||
"value": "{\"timestampMillis\": 1631577600000, \"eventGranularity\": {\"unit\": \"DAY\", \"multiple\": 1}, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"uniqueUserCount\": 1, \"totalSqlQueries\": 1, \"topSqlQueries\": [\"select catid from category\"], \"userCounts\": [{\"user\": \"urn:li:corpuser:real_shirshanka\", \"count\": 1, \"userEmail\": \"real_shirshanka@acryl.io\"}], \"fieldCounts\": []}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetUsageStatistics",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1634169600000, \"eventGranularity\": {\"unit\": \"DAY\", \"multiple\": 1}, \"uniqueUserCount\": 1, \"totalSqlQueries\": 2, \"topSqlQueries\": [\"select * from testcatalog.testschema.testtable limit 100\"], \"userCounts\": [{\"user\": \"urn:li:corpuser:test-name\", \"count\": 2, \"userEmail\": \"test-name@acryl.io\"}], \"fieldCounts\": [{\"fieldPath\": \"column1\", \"count\": 2}, {\"fieldPath\": \"column2\", \"count\": 2}]}",
|
||||
"value": "{\"timestampMillis\": 1634169600000, \"eventGranularity\": {\"unit\": \"DAY\", \"multiple\": 1}, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"uniqueUserCount\": 1, \"totalSqlQueries\": 2, \"topSqlQueries\": [\"select * from testcatalog.testschema.testtable limit 100\"], \"userCounts\": [{\"user\": \"urn:li:corpuser:test-name\", \"count\": 2, \"userEmail\": \"test-name@acryl.io\"}], \"fieldCounts\": [{\"fieldPath\": \"column1\", \"count\": 2}, {\"fieldPath\": \"column2\", \"count\": 2}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
|
||||
@ -531,7 +531,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 3, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"tags\", \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"genre_ids\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}]}",
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 3, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"tags\", \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"genre_ids\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -550,7 +550,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"distinctValueFrequencies\": [{\"value\": \"2021-09-27\", \"frequency\": 2}], \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"distinctValueFrequencies\": [{\"value\": \"2021-09-27\", \"frequency\": 2}], \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -569,7 +569,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}",
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 2, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
@ -588,7 +588,7 @@
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "datasetProfile",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
|
||||
"value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}",
|
||||
"contentType": "application/json"
|
||||
},
|
||||
"systemMetadata": {
|
||||
|
||||
@ -19,6 +19,9 @@ public class MappingsBuilder {
|
||||
public static final String EVENT_FIELD = "event";
|
||||
public static final String SYSTEM_METADATA_FIELD = "systemMetadata";
|
||||
public static final String IS_EXPLODED_FIELD = "isExploded";
|
||||
public static final String PARTITION_SPEC = "partitionSpec";
|
||||
public static final String PARTITION_SPEC_PARTITION = "partition";
|
||||
public static final String PARTITION_SPEC_TIME_PARTITION = "timePartition";
|
||||
|
||||
private MappingsBuilder() {
|
||||
}
|
||||
@ -36,6 +39,9 @@ public class MappingsBuilder {
|
||||
mappings.put(TIMESTAMP_FIELD, ImmutableMap.of("type", "date"));
|
||||
mappings.put(TIMESTAMP_MILLIS_FIELD, ImmutableMap.of("type", "date"));
|
||||
mappings.put(EVENT_GRANULARITY, ImmutableMap.of("type", "keyword"));
|
||||
mappings.put(PARTITION_SPEC, ImmutableMap.of("properties",
|
||||
ImmutableMap.of(PARTITION_SPEC_PARTITION, ImmutableMap.of("type", "keyword"), PARTITION_SPEC_TIME_PARTITION,
|
||||
ImmutableMap.of("type", "keyword"))));
|
||||
mappings.put(EVENT_FIELD, ImmutableMap.of("type", "object", "enabled", false));
|
||||
mappings.put(SYSTEM_METADATA_FIELD, ImmutableMap.of("type", "object", "enabled", false));
|
||||
mappings.put(IS_EXPLODED_FIELD, ImmutableMap.of("type", "boolean"));
|
||||
|
||||
@ -12,6 +12,7 @@ import com.linkedin.metadata.models.TimeseriesFieldSpec;
|
||||
import com.linkedin.metadata.models.registry.EntityRegistry;
|
||||
import com.linkedin.metadata.query.filter.Filter;
|
||||
import com.linkedin.metadata.search.utils.ESUtils;
|
||||
import com.linkedin.metadata.timeseries.elastic.indexbuilder.MappingsBuilder;
|
||||
import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
|
||||
import com.linkedin.timeseries.AggregationSpec;
|
||||
import com.linkedin.timeseries.GenericTable;
|
||||
@ -190,6 +191,16 @@ public class ESAggregatedStatsDAO {
|
||||
return timeseriesFieldCollectionSpec.getPegasusSchema().getType();
|
||||
}
|
||||
} else if (memberParts.length == 2) {
|
||||
// Check if partitionSpec
|
||||
if (memberParts[0].equals(MappingsBuilder.PARTITION_SPEC)) {
|
||||
if (memberParts[1].equals(MappingsBuilder.PARTITION_SPEC_PARTITION) || memberParts[1].equals(
|
||||
MappingsBuilder.PARTITION_SPEC_TIME_PARTITION)) {
|
||||
return DataSchema.Type.STRING;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Unknown partitionSpec member" + memberParts[1]);
|
||||
}
|
||||
}
|
||||
|
||||
// This is either a collection key/stat.
|
||||
TimeseriesFieldCollectionSpec timeseriesFieldCollectionSpec =
|
||||
aspectSpec.getTimeseriesFieldCollectionSpecMap().get(memberParts[0]);
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.linkedin.metadata.timeseries.transformer;
|
||||
|
||||
import com.datahub.util.RecordUtils;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
@ -7,10 +8,10 @@ import com.fasterxml.jackson.databind.node.ArrayNode;
|
||||
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
|
||||
import com.fasterxml.jackson.databind.node.ObjectNode;
|
||||
import com.linkedin.common.urn.Urn;
|
||||
import com.linkedin.data.DataMap;
|
||||
import com.linkedin.data.schema.ArrayDataSchema;
|
||||
import com.linkedin.data.schema.DataSchema;
|
||||
import com.linkedin.data.template.RecordTemplate;
|
||||
import com.datahub.util.RecordUtils;
|
||||
import com.linkedin.metadata.extractor.FieldExtractor;
|
||||
import com.linkedin.metadata.models.AspectSpec;
|
||||
import com.linkedin.metadata.models.TimeseriesFieldCollectionSpec;
|
||||
@ -87,6 +88,30 @@ public class TimeseriesAspectTransformer {
|
||||
throw new IllegalArgumentException("Failed to convert eventGranulairty to Json string!", e);
|
||||
}
|
||||
}
|
||||
// PartitionSpec handling
|
||||
DataMap partitionSpec = (DataMap) timeseriesAspect.data().get(MappingsBuilder.PARTITION_SPEC);
|
||||
if (partitionSpec != null) {
|
||||
Object partition = partitionSpec.get(MappingsBuilder.PARTITION_SPEC_PARTITION);
|
||||
Object timePartition = partitionSpec.get(MappingsBuilder.PARTITION_SPEC_TIME_PARTITION);
|
||||
if (partition != null && timePartition != null) {
|
||||
throw new IllegalArgumentException("Both partition and timePartition cannot be specified in partitionSpec!");
|
||||
} else if (partition != null) {
|
||||
ObjectNode partitionDoc = JsonNodeFactory.instance.objectNode();
|
||||
partitionDoc.put(MappingsBuilder.PARTITION_SPEC_PARTITION, partition.toString());
|
||||
document.set(MappingsBuilder.PARTITION_SPEC, partitionDoc);
|
||||
} else if (timePartition != null) {
|
||||
ObjectNode timePartitionDoc = JsonNodeFactory.instance.objectNode();
|
||||
try {
|
||||
timePartitionDoc.put(MappingsBuilder.PARTITION_SPEC_TIME_PARTITION,
|
||||
OBJECT_MAPPER.writeValueAsString(timePartition));
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new IllegalArgumentException("Failed to convert timePartition to Json string!", e);
|
||||
}
|
||||
document.set(MappingsBuilder.PARTITION_SPEC, timePartitionDoc);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Both partition and timePartition cannot be null in partitionSpec.");
|
||||
}
|
||||
}
|
||||
String messageId = (String) timeseriesAspect.data().get(MappingsBuilder.MESSAGE_ID_FIELD);
|
||||
if (messageId != null) {
|
||||
document.put(MappingsBuilder.MESSAGE_ID_FIELD, messageId);
|
||||
@ -198,6 +223,10 @@ public class TimeseriesAspectTransformer {
|
||||
if (messageId != null) {
|
||||
docId += messageId.toString();
|
||||
}
|
||||
JsonNode partitionSpec = document.get(MappingsBuilder.PARTITION_SPEC);
|
||||
if (partitionSpec != null) {
|
||||
docId += partitionSpec.toString();
|
||||
}
|
||||
|
||||
return DigestUtils.md5Hex(docId);
|
||||
}
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.linkedin.metadata.timeseries.elastic;
|
||||
|
||||
import com.datahub.test.BatchType;
|
||||
import com.datahub.test.ComplexNestedRecord;
|
||||
import com.datahub.test.TestEntityComponentProfile;
|
||||
import com.datahub.test.TestEntityComponentProfileArray;
|
||||
import com.datahub.test.TestEntityProfile;
|
||||
@ -11,6 +13,8 @@ import com.linkedin.common.urn.TestEntityUrn;
|
||||
import com.linkedin.common.urn.Urn;
|
||||
import com.linkedin.data.template.StringArray;
|
||||
import com.linkedin.data.template.StringArrayArray;
|
||||
import com.linkedin.data.template.StringMap;
|
||||
import com.linkedin.data.template.StringMapArray;
|
||||
import com.linkedin.metadata.aspect.EnvelopedAspect;
|
||||
import com.linkedin.metadata.models.AspectSpec;
|
||||
import com.linkedin.metadata.models.DataSchemaFactory;
|
||||
@ -155,6 +159,15 @@ public class ElasticSearchTimeseriesAspectServiceTest {
|
||||
componentProfile2.setKey("col2");
|
||||
componentProfile2.setStat(stat + 2);
|
||||
testEntityProfile.setComponentProfiles(new TestEntityComponentProfileArray(componentProfile1, componentProfile2));
|
||||
|
||||
StringMap stringMap1 = new StringMap();
|
||||
stringMap1.put("p_key1", "p_val1");
|
||||
StringMap stringMap2 = new StringMap();
|
||||
stringMap2.put("p_key2", "p_val2");
|
||||
ComplexNestedRecord nestedRecord = new ComplexNestedRecord().setType(BatchType.PARTITION_BATCH)
|
||||
.setPartitions(new StringMapArray(stringMap1, stringMap2));
|
||||
testEntityProfile.setAComplexNestedRecord(nestedRecord);
|
||||
|
||||
return testEntityProfile;
|
||||
}
|
||||
|
||||
@ -346,6 +359,50 @@ public class ElasticSearchTimeseriesAspectServiceTest {
|
||||
_testEntityProfiles.get(_startTime + 23 * TIME_INCREMENT).getStat().toString())));
|
||||
}
|
||||
|
||||
@Test(groups = {"getAggregatedStats"}, dependsOnGroups = {"upsert"})
|
||||
public void testGetAggregatedStatsLatestAComplexNestedRecordForDay1() {
|
||||
// Filter is only on the urn
|
||||
Criterion hasUrnCriterion =
|
||||
new Criterion().setField("urn").setCondition(Condition.EQUAL).setValue(TEST_URN.toString());
|
||||
Criterion startTimeCriterion = new Criterion().setField(ES_FILED_TIMESTAMP)
|
||||
.setCondition(Condition.GREATER_THAN_OR_EQUAL_TO)
|
||||
.setValue(_startTime.toString());
|
||||
Criterion endTimeCriterion = new Criterion().setField(ES_FILED_TIMESTAMP)
|
||||
.setCondition(Condition.LESS_THAN_OR_EQUAL_TO)
|
||||
.setValue(String.valueOf(_startTime + 23 * TIME_INCREMENT));
|
||||
|
||||
Filter filter =
|
||||
QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion));
|
||||
|
||||
// Aggregate on latest stat value
|
||||
AggregationSpec latestStatAggregationSpec =
|
||||
new AggregationSpec().setAggregationType(AggregationType.LATEST).setFieldPath("aComplexNestedRecord");
|
||||
|
||||
// Grouping bucket is only timestamp filed.
|
||||
GroupingBucket timestampBucket = new GroupingBucket().setKey(ES_FILED_TIMESTAMP)
|
||||
.setType(GroupingBucketType.DATE_GROUPING_BUCKET)
|
||||
.setTimeWindowSize(new TimeWindowSize().setMultiple(1).setUnit(CalendarInterval.DAY));
|
||||
|
||||
GenericTable resultTable = _elasticSearchTimeseriesAspectService.getAggregatedStats(ENTITY_NAME, ASPECT_NAME,
|
||||
new AggregationSpec[]{latestStatAggregationSpec}, filter, new GroupingBucket[]{timestampBucket});
|
||||
// Validate column names
|
||||
assertEquals(resultTable.getColumnNames(), new StringArray(ES_FILED_TIMESTAMP, "latest_aComplexNestedRecord"));
|
||||
// Validate column types
|
||||
assertEquals(resultTable.getColumnTypes(), new StringArray("long", "record"));
|
||||
// Validate rows
|
||||
assertNotNull(resultTable.getRows());
|
||||
assertEquals(resultTable.getRows().size(), 1);
|
||||
assertEquals(resultTable.getRows().get(0).get(0), _startTime.toString());
|
||||
try {
|
||||
ComplexNestedRecord latestAComplexNestedRecord =
|
||||
OBJECT_MAPPER.readValue(resultTable.getRows().get(0).get(1), ComplexNestedRecord.class);
|
||||
assertEquals(latestAComplexNestedRecord,
|
||||
_testEntityProfiles.get(_startTime + 23 * TIME_INCREMENT).getAComplexNestedRecord());
|
||||
} catch (JsonProcessingException e) {
|
||||
fail("Unexpected exception thrown" + e);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(groups = {"getAggregatedStats"}, dependsOnGroups = {"upsert"})
|
||||
public void testGetAggregatedStatsLatestStrArrayDay1() {
|
||||
// Filter is only on the urn
|
||||
|
||||
@ -0,0 +1,51 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
import com.linkedin.common.CustomProperties
|
||||
import com.linkedin.common.ExternalReference
|
||||
import com.linkedin.common.Urn
|
||||
|
||||
/**
|
||||
* Information about an assertion
|
||||
*/
|
||||
@Aspect = {
|
||||
"name": "assertionInfo"
|
||||
}
|
||||
record AssertionInfo includes CustomProperties, ExternalReference {
|
||||
/**
|
||||
* One or more dataset schema fields that are targeted by this assertion
|
||||
*/
|
||||
@Relationship = {
|
||||
"/*": {
|
||||
"name": "Asserts",
|
||||
"entityTypes": [ "schemaField" ]
|
||||
}
|
||||
}
|
||||
datasetFields: optional array[Urn]
|
||||
|
||||
/**
|
||||
* One or more datasets that are targeted by this assertion
|
||||
*/
|
||||
@Relationship = {
|
||||
"/*": {
|
||||
"name": "Asserts",
|
||||
"entityTypes": [ "dataset" ]
|
||||
}
|
||||
}
|
||||
datasets: optional array[Urn]
|
||||
|
||||
/**
|
||||
* Type of assertion
|
||||
*/
|
||||
assertionType: AssertionType
|
||||
|
||||
/*
|
||||
* Logic for assertion such as implementation of custom nativeOperator
|
||||
*/
|
||||
assertionLogic: optional string
|
||||
|
||||
/**
|
||||
* Parameters required for the assertion. e.g. min_value, max_value, value, columns
|
||||
*/
|
||||
assertionParameters: map[string, string] = { }
|
||||
|
||||
}
|
||||
@ -0,0 +1,51 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
import com.linkedin.timeseries.TimeseriesAspectBase
|
||||
import com.linkedin.common.ExternalReference
|
||||
import com.linkedin.common.Urn
|
||||
|
||||
/**
|
||||
* The results of evaluating the assertion on the batch
|
||||
*/
|
||||
@Aspect = {
|
||||
"name": "assertionResult",
|
||||
"type": "timeseries",
|
||||
}
|
||||
|
||||
record AssertionResult includes TimeseriesAspectBase {
|
||||
|
||||
/*
|
||||
* Urn of assertion which is evaluated
|
||||
*/
|
||||
@TimeseriesField = {}
|
||||
assertionUrn: Urn
|
||||
|
||||
/*
|
||||
* Urn of entity being asserted
|
||||
*/
|
||||
//example - dataset urn, if dataset is being asserted
|
||||
@TimeseriesField = {}
|
||||
asserteeUrn: Urn
|
||||
|
||||
/**
|
||||
* Specification of the batch whose data quality is evaluated
|
||||
*/
|
||||
batchSpec: optional BatchSpec
|
||||
|
||||
/**
|
||||
* Results of assertion
|
||||
*/
|
||||
@TimeseriesField = {}
|
||||
batchAssertionResult: BatchAssertionResult
|
||||
|
||||
/**
|
||||
* Native Run identifier of platform evaluating the assertions
|
||||
*/
|
||||
//Multiple assertions could occur in same evaluator run
|
||||
nativeEvaluatorRunId: optional string
|
||||
|
||||
/**
|
||||
* Runtime parameters of evaluation
|
||||
*/
|
||||
runtimeContext: map[string, string] = { }
|
||||
}
|
||||
@ -0,0 +1,63 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
enum AssertionStdOperator {
|
||||
/**
|
||||
* Value being asserted is between min_value and max_value
|
||||
*/
|
||||
BETWEEN
|
||||
|
||||
/**
|
||||
* Value being asserted is less than max_value
|
||||
*/
|
||||
LESS_THAN
|
||||
|
||||
/**
|
||||
* Value being asserted is less than or equal to max_value
|
||||
*/
|
||||
LESS_THAN_OR_EQUAL_TO
|
||||
|
||||
/**
|
||||
* Value being asserted is greater than min_value
|
||||
*/
|
||||
GREATER_THAN
|
||||
|
||||
/**
|
||||
* Value being asserted is greater than or equal to min_value
|
||||
*/
|
||||
GREATER_THAN_OR_EQUAL_TO
|
||||
|
||||
/**
|
||||
* Value being asserted is equal to value
|
||||
*/
|
||||
EQUAL_TO
|
||||
|
||||
/**
|
||||
* Value being asserted is not null
|
||||
*/
|
||||
NOT_NULL
|
||||
|
||||
/**
|
||||
* Value being asserted contains value
|
||||
*/
|
||||
CONTAIN
|
||||
|
||||
/**
|
||||
* Value being asserted ends with value
|
||||
*/
|
||||
END_WITH
|
||||
|
||||
/**
|
||||
* Value being asserted starts with value
|
||||
*/
|
||||
START_WITH
|
||||
|
||||
/**
|
||||
* Value being asserted is one of the array values
|
||||
*/
|
||||
IN
|
||||
|
||||
/**
|
||||
* Other
|
||||
*/
|
||||
_NATIVE_
|
||||
}
|
||||
@ -0,0 +1,32 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
/**
|
||||
* Type of Assertion
|
||||
*/
|
||||
record AssertionType {
|
||||
/**
|
||||
* Scope of Assertion
|
||||
*/
|
||||
scope: enum AssertionScope {
|
||||
DATASET_COLUMN
|
||||
DATASET_ROWS
|
||||
DATASET_SCHEMA
|
||||
CROSS_DATASET
|
||||
}
|
||||
|
||||
/**
|
||||
* Assertion details for scope DATASET_COLUMN
|
||||
*/
|
||||
datasetColumnAssertion: optional DatasetColumnAssertion
|
||||
|
||||
/**
|
||||
* Assertion details for scope DATASET_ROWS
|
||||
*/
|
||||
datasetRowsAssertion: optional DatasetRowsAssertion
|
||||
|
||||
/**
|
||||
* Assertion details for scope DATASET_SCHEMA
|
||||
*/
|
||||
datasetSchemaAssertion: optional DatasetSchemaAssertion
|
||||
|
||||
}
|
||||
@ -0,0 +1,40 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
record BatchAssertionResult {
|
||||
|
||||
/**
|
||||
* Indicator of whether the constraint is fully satisfied for the batch
|
||||
*/
|
||||
success: boolean
|
||||
|
||||
/**
|
||||
* Number of rows for evaluated batch
|
||||
*/
|
||||
rowCount: optional long
|
||||
|
||||
/**
|
||||
* Number of rows with missing value for evaluated batch
|
||||
*/
|
||||
missingCount: optional long
|
||||
|
||||
/**
|
||||
* Number of rows with unexpected value for evaluated batch
|
||||
*/
|
||||
unexpectedCount: optional long
|
||||
|
||||
/**
|
||||
* Observed aggregate value for evaluated batch
|
||||
*/
|
||||
actualAggValue: optional float
|
||||
|
||||
/**
|
||||
* Other results of evaluation
|
||||
*/
|
||||
nativeResults: map[string, string] = { }
|
||||
|
||||
/**
|
||||
* URL where the reference exists
|
||||
*/
|
||||
//TODO - Change type to optional Url, not working
|
||||
externalUrl: optional string
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
import com.linkedin.common.CustomProperties
|
||||
|
||||
/**
|
||||
* A batch on which certain operations, e.g. data quality evaluation, are performed.
|
||||
*/
|
||||
record BatchSpec includes CustomProperties {
|
||||
|
||||
/**
|
||||
* The native identifier as specified by the system operating on the batch.
|
||||
*/
|
||||
nativeBatchId: optional string
|
||||
|
||||
/**
|
||||
* A query that identifies a batch of data
|
||||
*/
|
||||
query: optional string
|
||||
|
||||
/**
|
||||
* Any limit to the number of rows in the batch, if applied
|
||||
*/
|
||||
limit: optional int
|
||||
}
|
||||
@ -0,0 +1,83 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
import com.linkedin.common.Urn
|
||||
|
||||
/**
|
||||
* Assertion on column of a dataset
|
||||
*/
|
||||
record DatasetColumnAssertion {
|
||||
/**
|
||||
* Standardized assertion operator
|
||||
*/
|
||||
stdOperator: AssertionStdOperator
|
||||
|
||||
/**
|
||||
* Native assertion operator
|
||||
*/
|
||||
nativeOperator: optional string // filled with the platform specific native operator string
|
||||
|
||||
/**
|
||||
* Standardized aggregation function applied on column values
|
||||
*/
|
||||
stdAggFunc: enum DatasetColumnStdAggFunc {
|
||||
/**
|
||||
* Assertion is applied on individual column value
|
||||
*/
|
||||
IDENTITY
|
||||
|
||||
/**
|
||||
* Assertion is applied on column mean
|
||||
*/
|
||||
MEAN
|
||||
|
||||
/**
|
||||
* Assertion is applied on column median
|
||||
*/
|
||||
MEDIAN
|
||||
|
||||
/**
|
||||
* Assertion is applied on number of distinct values in column
|
||||
*/
|
||||
UNIQUE_COUNT
|
||||
|
||||
/**
|
||||
* Assertion is applied on proportion of distinct values in column
|
||||
*/
|
||||
UNIQUE_PROPOTION
|
||||
|
||||
/**
|
||||
* Assertion is applied on number of null values in column
|
||||
*/
|
||||
NULL_COUNT
|
||||
|
||||
/**
|
||||
* Assertion is applied on proportion of null values in column
|
||||
*/
|
||||
NULL_PROPORTION
|
||||
|
||||
/**
|
||||
* Assertion is applied on column std deviation
|
||||
*/
|
||||
STDDEV
|
||||
|
||||
/**
|
||||
* Assertion is applied on column min
|
||||
*/
|
||||
MIN
|
||||
|
||||
/**
|
||||
* Assertion is applied on column max
|
||||
*/
|
||||
MAX
|
||||
|
||||
/**
|
||||
* Other
|
||||
*/
|
||||
_NATIVE_
|
||||
}
|
||||
|
||||
/**
|
||||
* Native aggregation function applied on column values
|
||||
*/
|
||||
nativeAggFunc: optional string
|
||||
}
|
||||
@ -0,0 +1,40 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
import com.linkedin.common.Urn
|
||||
|
||||
/**
|
||||
* Assertion on rows of a dataset
|
||||
*/
|
||||
record DatasetRowsAssertion {
|
||||
|
||||
/**
|
||||
* Standardized assertion operator
|
||||
*/
|
||||
stdOperator: AssertionStdOperator
|
||||
|
||||
/**
|
||||
* Native assertion operator
|
||||
*/
|
||||
nativeOperator: optional string
|
||||
|
||||
/**
|
||||
* Standardized aggregation function applied on rows
|
||||
*/
|
||||
stdAggFunc: enum DatasetRowsStdAggFunc {
|
||||
|
||||
/**
|
||||
* Assertion is applied on number of rows
|
||||
*/
|
||||
ROW_COUNT
|
||||
|
||||
/**
|
||||
* Other
|
||||
*/
|
||||
_NATIVE_
|
||||
}
|
||||
|
||||
/**
|
||||
* Native aggregation function applied on rows
|
||||
*/
|
||||
nativeAggFunc: optional string
|
||||
}
|
||||
@ -0,0 +1,45 @@
|
||||
namespace com.linkedin.assertion
|
||||
|
||||
import com.linkedin.common.Urn
|
||||
|
||||
/**
|
||||
* Assertion on schema of a dataset
|
||||
*/
|
||||
record DatasetSchemaAssertion {
|
||||
|
||||
/**
|
||||
* Standardized assertion operator
|
||||
*/
|
||||
stdOperator: AssertionStdOperator
|
||||
|
||||
/**
|
||||
* Native assertion operator
|
||||
*/
|
||||
nativeOperator: optional string
|
||||
|
||||
/**
|
||||
* Standardized aggregation function applied on the schema
|
||||
*/
|
||||
stdAggFunc: enum DatasetSchemaStdAggFunc {
|
||||
|
||||
/**
|
||||
* Assertion is applied on all columns
|
||||
*/
|
||||
COLUMNS
|
||||
|
||||
/**
|
||||
* Assertion is applied on number of columns
|
||||
*/
|
||||
COLUMN_COUNT
|
||||
|
||||
/**
|
||||
* Other
|
||||
*/
|
||||
_NATIVE_
|
||||
}
|
||||
|
||||
/**
|
||||
* Native aggregation function applied on the schema
|
||||
*/
|
||||
nativeAggFunc: optional string
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
namespace com.linkedin.metadata.aspect
|
||||
|
||||
import com.linkedin.metadata.key.AssertionKey
|
||||
import com.linkedin.common.DataPlatformInstance
|
||||
import com.linkedin.assertion.AssertionInfo
|
||||
|
||||
/**
|
||||
* A union of all supported metadata aspects for an Assertion
|
||||
*/
|
||||
typeref AssertionAspect = union[
|
||||
AssertionKey,
|
||||
DataPlatformInstance,
|
||||
AssertionInfo
|
||||
]
|
||||
@ -0,0 +1,20 @@
|
||||
namespace com.linkedin.metadata.key
|
||||
|
||||
import com.linkedin.common.Urn
|
||||
|
||||
/**
|
||||
* Key for an Assertion
|
||||
*/
|
||||
@Aspect = {
|
||||
"name": "assertionKey",
|
||||
}
|
||||
record AssertionKey {
|
||||
|
||||
//The name of the assertion platform such as greatExpectations etc.
|
||||
//assertionPlatform: Urn
|
||||
|
||||
/**
|
||||
* Unique id for the assertion.
|
||||
*/
|
||||
assertionId: string
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
namespace com.linkedin.metadata.snapshot
|
||||
|
||||
import com.linkedin.common.Urn
|
||||
import com.linkedin.metadata.aspect.AssertionAspect
|
||||
|
||||
/**
|
||||
* A metadata snapshot for a specific Assertion entity.
|
||||
*/
|
||||
@Entity = {
|
||||
"name": "assertion",
|
||||
"keyAspect": "assertionKey"
|
||||
}
|
||||
record AssertionSnapshot {
|
||||
|
||||
/**
|
||||
* URN for the entity the metadata snapshot is associated with.
|
||||
*/
|
||||
urn: Urn
|
||||
|
||||
/**
|
||||
* The list of metadata aspects associated with the assertion.
|
||||
*/
|
||||
aspects: array[AssertionAspect]
|
||||
}
|
||||
@ -4,6 +4,13 @@ namespace com.linkedin.timeseries
|
||||
* Defines how the data is partitioned
|
||||
*/
|
||||
record PartitionSpec {
|
||||
|
||||
type: enum PartitionType {
|
||||
FULL_TABLE,
|
||||
QUERY,
|
||||
PARTITION
|
||||
} = "PARTITION"
|
||||
|
||||
/**
|
||||
* String representation of the partition
|
||||
*/
|
||||
|
||||
@ -14,7 +14,10 @@ record TimeseriesAspectBase {
|
||||
/**
|
||||
* The optional partition specification.
|
||||
*/
|
||||
partitionSpec: optional PartitionSpec
|
||||
partitionSpec: optional PartitionSpec = {
|
||||
"type":"FULL_TABLE",
|
||||
"partition":"FULL_TABLE_SNAPSHOT"
|
||||
}
|
||||
|
||||
/**
|
||||
* The optional messageId, if provided serves as a custom user-defined unique identifier for an aspect value.
|
||||
|
||||
@ -12,6 +12,7 @@ entities:
|
||||
- schemaMetadata
|
||||
- status
|
||||
- container
|
||||
- assertionResult
|
||||
- name: dataHubPolicy
|
||||
doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc.
|
||||
keyAspect: dataHubPolicyKey
|
||||
@ -84,3 +85,9 @@ entities:
|
||||
- dataHubExecutionRequestInput
|
||||
- dataHubExecutionRequestSignal
|
||||
- dataHubExecutionRequestResult
|
||||
- name: assertion
|
||||
doc: Assertion represents a data quality rule applied on one or more dataset.
|
||||
keyAspect: assertionKey
|
||||
aspects:
|
||||
- assertionInfo
|
||||
- dataPlatformInstance
|
||||
|
||||
144
smoke-test/test_data_quality.py
Normal file
144
smoke-test/test_data_quality.py
Normal file
@ -0,0 +1,144 @@
|
||||
import json
|
||||
import urllib
|
||||
import time
|
||||
import pytest
|
||||
import requests
|
||||
from datahub.cli.docker import check_local_docker_containers
|
||||
from tests.utils import ingest_file_via_rest
|
||||
|
||||
bootstrap_sample_data = "test_resources/bootstrap_data_quality.json"
|
||||
GMS_ENDPOINT = "http://localhost:8080"
|
||||
|
||||
restli_default_headers = {
|
||||
"X-RestLi-Protocol-Version": "2.0.0",
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def wait_for_healthchecks():
    """Session fixture that fails fast unless the local docker check comes back clean.

    Despite the name there is no retry or waiting: the check runs once and the
    fixture asserts that its result is falsy before yielding to the tests.
    """
    check_result = check_local_docker_containers()
    assert not check_result
    yield
|
||||
|
||||
|
||||
@pytest.mark.dependency()
def test_healthchecks(wait_for_healthchecks):
    """Anchor test: requesting the `wait_for_healthchecks` fixture does all the work."""
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=["test_healthchecks"])
def test_run_ingestion(wait_for_healthchecks):
    """Ingest the data-quality bootstrap file over REST for the tests that follow."""
    sample_file = bootstrap_sample_data
    ingest_file_via_rest(sample_file)
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_get_latest_assertions_results_by_partition():
    """Query the latest `batchAssertionResult` for one dataset via getTimeseriesStats.

    Results are bucketed by assertee urn, partition, day and assertion urn; the
    test checks the returned column names, the row count and the first row's
    assertee urn.
    """
    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)"

    # Give Elasticsearch a moment to index the freshly ingested documents.
    time.sleep(5)

    # Only rows belonging to the dataset under test.
    urn_filter = {
        "or": [
            {
                "and": [
                    {
                        "field": "urn",
                        "value": dataset_urn,
                        "condition": "EQUAL",
                    }
                ]
            }
        ]
    }
    # Latest result per (assertee, partition, day, assertion) bucket.
    grouping = [
        {"key": "asserteeUrn", "type": "STRING_GROUPING_BUCKET"},
        {"key": "partitionSpec.partition", "type": "STRING_GROUPING_BUCKET"},
        {
            "key": "timestampMillis",
            "type": "DATE_GROUPING_BUCKET",
            "timeWindowSize": {"multiple": 1, "unit": "DAY"},
        },
        {"key": "assertionUrn", "type": "STRING_GROUPING_BUCKET"},
    ]
    request_body = json.dumps(
        {
            "entityName": "dataset",
            "aspectName": "assertionResult",
            "filter": urn_filter,
            "metrics": [
                {"fieldPath": "batchAssertionResult", "aggregationType": "LATEST"}
            ],
            "buckets": grouping,
        }
    )

    reply = requests.post(
        f"{GMS_ENDPOINT}/analytics?action=getTimeseriesStats",
        data=request_body,
        headers=restli_default_headers,
    )
    reply.raise_for_status()
    payload = reply.json()

    assert payload["value"]
    table = payload["value"]["table"]
    assert table

    column_names = table["columnNames"]
    assert sorted(column_names) == [
        "asserteeUrn",
        "assertionUrn",
        "latest_batchAssertionResult",
        "partitionSpec.partition",
        "timestampMillis",
    ]

    rows = table["rows"]
    assert len(rows) == 6

    # Column order is not fixed, so locate the assertee column by name.
    assertee_column = column_names.index("asserteeUrn")
    assert rows[0][assertee_column] == dataset_urn
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_get_assertions_on_dataset():
    """Check that the dataset has exactly one incoming `Asserts` relationship.

    The relationships endpoint lists all assertion urns, including those which
    may not have executed.

    Raises:
        requests.HTTPError: if GMS answers with an error status.
    """
    # Import the submodule explicitly: a bare `import urllib` does not by itself
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)"
    response = requests.get(
        f"{GMS_ENDPOINT}/relationships?direction=INCOMING&urn={quote(urn)}&types=Asserts"
    )

    response.raise_for_status()
    data = response.json()
    assert len(data["relationships"]) == 1
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_get_assertions_on_dataset_field():
    """Check that the schema field has exactly one incoming `Asserts` relationship.

    The relationships endpoint lists all assertion urns, including those which
    may not have executed.

    Raises:
        requests.HTTPError: if GMS answers with an error status.
    """
    # Import the submodule explicitly: a bare `import urllib` does not by itself
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    urn = "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD), col1)"
    response = requests.get(
        f"{GMS_ENDPOINT}/relationships?direction=INCOMING&urn={quote(urn)}&types=Asserts"
    )

    response.raise_for_status()
    data = response.json()
    assert len(data["relationships"]) == 1
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
def test_gms_get_assertion_info():
    """Fetch version 0 of the `assertionInfo` aspect and check its basic shape.

    Raises:
        requests.HTTPError: if GMS answers with an error status.
    """
    # Import the submodule explicitly: a bare `import urllib` does not by itself
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    assertion_urn = "urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b"
    # Join adjacent literals instead of using a backslash continuation inside
    # one literal; the latter splices the next line's indentation into the URL.
    url = (
        f"{GMS_ENDPOINT}/aspects/{quote(assertion_urn)}"
        "?aspect=assertionInfo&version=0"
    )
    response = requests.get(url, headers=restli_default_headers)

    response.raise_for_status()
    data = response.json()

    assert data["aspect"]
    assert data["aspect"]["com.linkedin.assertion.AssertionInfo"]
    assert data["aspect"]["com.linkedin.assertion.AssertionInfo"]["assertionType"]
|
||||
72
smoke-test/test_resources/bootstrap_data_quality.json
Normal file
72
smoke-test/test_resources/bootstrap_data_quality.json
Normal file
@ -0,0 +1,72 @@
|
||||
[
|
||||
{
|
||||
"entityType": "assertion",
|
||||
"entityUrn": "urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "assertionInfo",
|
||||
"aspect": {
|
||||
"value": "{\"customProperties\": {\"suite_name\": \"demo_suite\"}, \"datasetFields\": [\"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD), col1)\"], \"datasets\": [\"urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)\"], \"assertionType\": {\"scope\": \"DATASET_COLUMN\", \"datasetColumnAssertion\": {\"stdOperator\": \"LESS_THAN\", \"nativeOperator\": \"column_value_is_less_than\", \"stdAggFunc\": \"IDENTITY\"}}, \"assertionParameters\": {\"max_value\": \"99\"}}",
|
||||
"contentType": "application/json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "assertionResult",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1643794280350, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"[{'country': 'IN'}]\"}, \"messageId\": \"1643794280350\", \"assertionUrn\": \"urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)\", \"batchAssertionResult\": {\"success\": true, \"actualAggValue\": 90, \"nativeResults\": {}, \"externalUrl\": \"http://example.com/uuid1\"}, \"nativeEvaluatorRunId\": \"uuid1\", \"runtimeContext\": {}}",
|
||||
"contentType": "application/json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "assertionResult",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1643794280352, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"[{'country': 'US'}]\"}, \"messageId\": \"1643794280352\", \"assertionUrn\": \"urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)\", \"batchAssertionResult\": {\"success\": false, \"actualAggValue\": 101, \"nativeResults\": {}, \"externalUrl\": \"http://example.com/uuid1\"}, \"nativeEvaluatorRunId\": \"uuid1\", \"runtimeContext\": {}}",
|
||||
"contentType": "application/json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "assertionResult",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1643794280354, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"messageId\": \"1643794280354\", \"assertionUrn\": \"urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)\", \"batchAssertionResult\": {\"success\": true, \"actualAggValue\": 93, \"nativeResults\": {}, \"externalUrl\": \"http://example.com/uuid1\"}, \"nativeEvaluatorRunId\": \"uuid1\", \"runtimeContext\": {}}",
|
||||
"contentType": "application/json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "assertionResult",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1643880726872, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"[{'country': 'IN'}]\"}, \"messageId\": \"1643880726872\", \"assertionUrn\": \"urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)\", \"batchAssertionResult\": {\"success\": true, \"actualAggValue\": 90, \"nativeResults\": {}, \"externalUrl\": \"http://example.com/uuid1\"}, \"nativeEvaluatorRunId\": \"uuid1\", \"runtimeContext\": {}}",
|
||||
"contentType": "application/json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "assertionResult",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1643880726874, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"[{'country': 'US'}]\"}, \"messageId\": \"1643880726874\", \"assertionUrn\": \"urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)\", \"batchAssertionResult\": {\"success\": false, \"actualAggValue\": 101, \"nativeResults\": {}, \"externalUrl\": \"http://example.com/uuid1\"}, \"nativeEvaluatorRunId\": \"uuid1\", \"runtimeContext\": {}}",
|
||||
"contentType": "application/json"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "assertionResult",
|
||||
"aspect": {
|
||||
"value": "{\"timestampMillis\": 1643880726875, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"messageId\": \"1643880726875\", \"assertionUrn\": \"urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,fooTable,PROD)\", \"batchAssertionResult\": {\"success\": true, \"actualAggValue\": 93, \"nativeResults\": {}, \"externalUrl\": \"http://example.com/uuid1\"}, \"nativeEvaluatorRunId\": \"uuid1\", \"runtimeContext\": {}}",
|
||||
"contentType": "application/json"
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -23,4 +23,15 @@ record TestEntityProfile includes TimeseriesAspectBase {
|
||||
"key":"key"
|
||||
}
|
||||
componentProfiles: array[TestEntityComponentProfile]
|
||||
|
||||
// Field holding an inline-declared record; it carries the TimeseriesField
// annotation with an empty configuration.
// NOTE(review): presumably exercises timeseries handling of nested record
// fields in tests - confirm against the consumers of this test model.
@TimeseriesField = {}
aComplexNestedRecord: record ComplexNestedRecord {
  // Enum-typed field; BatchType declares four symbols.
  type: enum BatchType {
    QUERY_BATCH,
    PARTITION_BATCH,
    FULL_TABLE,
    _NATIVE_
  }
  // Array whose elements are string-to-string maps.
  partitions: array[map[string, string]]
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user