datahub/metadata-ingestion/tests/unit/test_rest_sink.py
John Joyce 352a0abf8d
Introducing TimeSeries Aspects + Dataset Profile (Stats) Aspect (#2983)
Co-authored-by: Dexter Lee <dexter@acryl.io>
Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
Co-authored-by: Ravindra Lanka <rlanka@acryl.io>
Co-authored-by: Shirshanka Das <shirshanka@apache.org>
2021-07-30 17:41:03 -07:00

301 lines
13 KiB
Python

import json
import pytest
import requests
import datahub.metadata.schema_classes as models
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
# Base URL of the fake GMS server. The test in this module registers its
# mocked POST endpoint under this URL and points the emitter at it.
MOCK_GMS_ENDPOINT = "http://fakegmshost:8080"

# Audit stamp reused by the parametrized records that carry audit information.
basicAuditStamp = models.AuditStampClass(
    time=1618987484580, actor="urn:li:corpuser:datahub", impersonator=None
)
@pytest.mark.parametrize(
    "record,path,snapshot",
    [
        (
            # Simple test.
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,downstream,PROD)",
                    aspects=[
                        models.UpstreamLineageClass(
                            upstreams=[
                                models.UpstreamClass(
                                    auditStamp=basicAuditStamp,
                                    dataset="urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream1,PROD)",
                                    type="TRANSFORMED",
                                ),
                                models.UpstreamClass(
                                    auditStamp=basicAuditStamp,
                                    dataset="urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream2,PROD)",
                                    type="TRANSFORMED",
                                ),
                            ]
                        )
                    ],
                ),
            ),
            "/entities?action=ingest",
            {
                "entity": {
                    "value": {
                        "com.linkedin.metadata.snapshot.DatasetSnapshot": {
                            "urn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,downstream,PROD)",
                            "aspects": [
                                {
                                    "com.linkedin.dataset.UpstreamLineage": {
                                        "upstreams": [
                                            {
                                                "auditStamp": {
                                                    "time": 1618987484580,
                                                    "actor": "urn:li:corpuser:datahub",
                                                },
                                                "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream1,PROD)",
                                                "type": "TRANSFORMED",
                                            },
                                            {
                                                "auditStamp": {
                                                    "time": 1618987484580,
                                                    "actor": "urn:li:corpuser:datahub",
                                                },
                                                "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream2,PROD)",
                                                "type": "TRANSFORMED",
                                            },
                                        ]
                                    }
                                }
                            ],
                        }
                    }
                },
                "systemMetadata": {},
            },
        ),
        (
            # Verify the behavior of the fieldDiscriminator for primitive enums.
            models.MetadataChangeEventClass(
                proposedSnapshot=models.MLModelSnapshotClass(
                    urn="urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)",
                    aspects=[
                        models.CostClass(
                            costType=models.CostTypeClass.ORG_COST_TYPE,
                            cost=models.CostCostClass(
                                fieldDiscriminator=models.CostCostDiscriminatorClass.costCode,
                                costCode="sampleCostCode",
                            ),
                        )
                    ],
                )
            ),
            "/entities?action=ingest",
            {
                "entity": {
                    "value": {
                        "com.linkedin.metadata.snapshot.MLModelSnapshot": {
                            "urn": "urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)",
                            "aspects": [
                                {
                                    "com.linkedin.common.Cost": {
                                        "costType": "ORG_COST_TYPE",
                                        "cost": {"costCode": "sampleCostCode"},
                                    }
                                }
                            ],
                        }
                    }
                },
                "systemMetadata": {},
            },
        ),
        (
            # Verify the serialization behavior with chart type enums.
            models.MetadataChangeEventClass(
                proposedSnapshot=models.ChartSnapshotClass(
                    urn="urn:li:chart:(superset,227)",
                    aspects=[
                        models.ChartInfoClass(
                            title="Weekly Messages",
                            description="",
                            lastModified=models.ChangeAuditStampsClass(
                                created=basicAuditStamp,
                                lastModified=basicAuditStamp,
                            ),
                            type=models.ChartTypeClass.SCATTER,
                        ),
                    ],
                )
            ),
            "/entities?action=ingest",
            {
                "entity": {
                    "value": {
                        "com.linkedin.metadata.snapshot.ChartSnapshot": {
                            "urn": "urn:li:chart:(superset,227)",
                            "aspects": [
                                {
                                    "com.linkedin.chart.ChartInfo": {
                                        "customProperties": {},
                                        "title": "Weekly Messages",
                                        "description": "",
                                        "lastModified": {
                                            "created": {
                                                "time": 1618987484580,
                                                "actor": "urn:li:corpuser:datahub",
                                            },
                                            "lastModified": {
                                                "time": 1618987484580,
                                                "actor": "urn:li:corpuser:datahub",
                                            },
                                        },
                                        "type": "SCATTER",
                                    }
                                }
                            ],
                        }
                    }
                },
                "systemMetadata": {},
            },
        ),
        (
            # Verify that DataJobInfo is serialized properly (particularly its union type).
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DataJobSnapshotClass(
                    urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
                    aspects=[
                        models.DataJobInfoClass(
                            name="User Deletions",
                            description="Constructs the fct_users_deleted from logging_events",
                            type=models.AzkabanJobTypeClass.SQL,
                        )
                    ],
                )
            ),
            "/entities?action=ingest",
            {
                "entity": {
                    "value": {
                        "com.linkedin.metadata.snapshot.DataJobSnapshot": {
                            "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
                            "aspects": [
                                {
                                    "com.linkedin.datajob.DataJobInfo": {
                                        "customProperties": {},
                                        "name": "User Deletions",
                                        "description": "Constructs the fct_users_deleted from logging_events",
                                        "type": {"string": "SQL"},
                                    }
                                }
                            ],
                        }
                    }
                },
                "systemMetadata": {},
            },
        ),
        (
            # Usage stats ingestion test.
            models.UsageAggregationClass(
                bucket=1623826800000,
                duration="DAY",
                resource="urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)",
                metrics=models.UsageAggregationMetricsClass(
                    uniqueUserCount=2,
                    users=[
                        models.UserUsageCountsClass(
                            user="urn:li:corpuser:jdoe",
                            count=5,
                        ),
                        models.UserUsageCountsClass(
                            user="urn:li:corpuser:unknown",
                            count=3,
                            userEmail="foo@example.com",
                        ),
                    ],
                    totalSqlQueries=1,
                    topSqlQueries=["SELECT * FROM foo"],
                ),
            ),
            "/usageStats?action=batchIngest",
            {
                "buckets": [
                    {
                        "bucket": 1623826800000,
                        "duration": "DAY",
                        "resource": "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleKafkaDataset,PROD)",
                        "metrics": {
                            "uniqueUserCount": 2,
                            "users": [
                                {"count": 5, "user": "urn:li:corpuser:jdoe"},
                                {
                                    "count": 3,
                                    "user": "urn:li:corpuser:unknown",
                                    "userEmail": "foo@example.com",
                                },
                            ],
                            "totalSqlQueries": 1,
                            "topSqlQueries": ["SELECT * FROM foo"],
                        },
                    }
                ]
            },
        ),
        (
            # Metadata change proposal test: the aspect is expected to be sent
            # as a JSON-encoded string alongside its content type.
            MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn="urn:li:dataset:(urn:li:dataPlatform:foo,bar,PROD)",
                changeType=models.ChangeTypeClass.UPSERT,
                aspectName="ownership",
                aspect=models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner="urn:li:corpuser:fbar",
                            type=models.OwnershipTypeClass.DATAOWNER,
                        )
                    ],
                    lastModified=models.AuditStampClass(
                        time=0,
                        actor="urn:li:corpuser:fbar",
                    ),
                ),
            ),
            "/aspects?action=ingestProposal",
            {
                "proposal": {
                    "entityType": "dataset",
                    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:foo,bar,PROD)",
                    "changeType": "UPSERT",
                    "aspectName": "ownership",
                    "aspect": {
                        "value": '{"owners": [{"owner": "urn:li:corpuser:fbar", "type": "DATAOWNER"}], "lastModified": {"time": 0, "actor": "urn:li:corpuser:fbar"}}',
                        "contentType": "application/json",
                    },
                }
            },
        ),
    ],
)
def test_datahub_rest_emitter(requests_mock, record, path, snapshot):
    """Check that emitting ``record`` POSTs ``snapshot`` to ``path`` on the mock GMS.

    Args:
        requests_mock: Fixture used to register the mocked POST endpoint.
        record: The metadata object handed to ``DatahubRestEmitter.emit``.
        path: Endpoint path (appended to ``MOCK_GMS_ENDPOINT``) that the
            emitter is expected to POST to.
        snapshot: The JSON body the emitter is expected to send.
    """

    def match_request_text(request: requests.Request) -> bool:
        # Compare the decoded JSON body of the outgoing request against the
        # expected payload; a mismatch fails the test with both payloads shown.
        # NOTE(review): requests_mock passes its own request wrapper to
        # matchers, which is presumably what provides .json() here -- the
        # requests.Request annotation is only approximate.
        requested_snapshot = request.json()
        assert (
            requested_snapshot == snapshot
        ), f"Expected snapshot to be {json.dumps(snapshot)}, got {json.dumps(requested_snapshot)}"
        return True

    requests_mock.post(
        f"{MOCK_GMS_ENDPOINT}{path}",
        request_headers={"X-RestLi-Protocol-Version": "2.0.0"},
        additional_matcher=match_request_text,
    )

    emitter = DatahubRestEmitter(MOCK_GMS_ENDPOINT)
    emitter.emit(record)

    # The payload assertion lives inside the request matcher, so without this
    # check an emitter that silently sent nothing would still pass the test.
    assert requests_mock.called, f"Expected a POST to {path}, but no request was sent"