# Mirror of https://github.com/datahub-project/datahub.git
# Synced 2025-07-06 08:38:41 +00:00 - 186 lines, 6.1 KiB, Python
import unittest
|
||
|
from datetime import datetime
|
||
|
|
||
|
from botocore.stub import Stubber
|
||
|
from freezegun import freeze_time
|
||
|
|
||
|
from datahub.ingestion.source.glue import GlueSource, GlueSourceConfig, get_column_type
|
||
|
from datahub.ingestion.source.metadata_common import MetadataWorkUnit
|
||
|
from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp, Status
|
||
|
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
|
||
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
||
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
||
|
ArrayTypeClass,
|
||
|
MapTypeClass,
|
||
|
MySqlDDL,
|
||
|
NumberTypeClass,
|
||
|
SchemaField,
|
||
|
SchemaFieldDataType,
|
||
|
SchemaMetadata,
|
||
|
StringTypeClass,
|
||
|
)
|
||
|
from datahub.metadata.schema_classes import (
|
||
|
AuditStampClass,
|
||
|
DatasetPropertiesClass,
|
||
|
OwnerClass,
|
||
|
OwnershipClass,
|
||
|
OwnershipTypeClass,
|
||
|
)
|
||
|
|
||
|
FROZEN_TIME = "2020-04-14 07:00:00"
|
||
|
|
||
|
|
||
|
class GlueSourceTest(unittest.TestCase):
    """Tests for the Glue source: column-type mapping and work-unit creation."""

    def setUp(self):
        # Build a fresh source for every test. A single class-level instance
        # would share its ``report`` (and the warnings accumulated on it)
        # between test methods, making the warning assertion below depend on
        # which tests ran before it.
        self.glue_source = GlueSource(
            ctx=None, config=GlueSourceConfig(aws_region="us-east-1")
        )

    def test_get_column_type_contains_key(self):
        """A ``char`` column is reported as a string type."""
        field_type = "char"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=StringTypeClass()).to_obj()
        )

    def test_get_column_type_contains_array(self):
        """A type name starting with ``array`` is reported as an array type."""
        field_type = "array_lol"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=ArrayTypeClass()).to_obj()
        )

    def test_get_column_type_contains_map(self):
        """A type name starting with ``map`` is reported as a map type."""
        field_type = "map_hehe"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=MapTypeClass()).to_obj()
        )

    def test_get_column_type_contains_set(self):
        """A type name starting with ``set`` is reported as an array type."""
        field_type = "set_yolo"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=ArrayTypeClass()).to_obj()
        )

    def test_get_column_type_not_contained(self):
        """An unknown type is reported as a string type and a warning is recorded."""
        field_type = "bad_column_type"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=StringTypeClass()).to_obj()
        )
        # The warning is keyed by the unrecognised type name.
        self.assertEqual(
            self.glue_source.report.warnings["bad_column_type"],
            [
                "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
                "setting as StringTypeClass."
            ],
        )

    @freeze_time(FROZEN_TIME)
    def test_turn_boto_glue_data_to_metadata_event(self):
        """A stubbed ``search_tables`` response is turned into the expected work unit."""
        stringy_timestamp = datetime.strptime(FROZEN_TIME, "%Y-%m-%d %H:%M:%S")
        # Epoch milliseconds for the frozen time.
        timestamp = int(datetime.timestamp(stringy_timestamp) * 1000)

        response = {
            "TableList": [
                {
                    "Name": "Barbeque",
                    "Owner": "Susan",
                    "DatabaseName": "datalake_grilled",
                    "Description": "Grilled Food",
                    "StorageDescriptor": {
                        "Columns": [
                            {
                                "Name": "Size",
                                "Type": "int",
                                "Comment": "Maximum attendees permitted",
                            }
                        ]
                    },
                }
            ]
        }

        def flatten(d):
            """Collapse nested dicts (and lists of dicts) into one flat dict.

            Nested keys are joined to their parent key with ``_``.
            """
            out = {}
            for key, val in d.items():
                if isinstance(val, dict):
                    # Treat a single nested dict like a one-element list.
                    val = [val]
                if isinstance(val, list):
                    for subdict in val:
                        deeper = flatten(subdict).items()
                        out.update({key + "_" + key2: val2 for key2, val2 in deeper})
                else:
                    out[key] = val
            return out

        # Serve the canned response instead of calling AWS.
        with Stubber(self.glue_source.glue_client) as stubber:
            stubber.add_response("search_tables", response, {})
            actual_work_unit = next(self.glue_source.get_workunits())

        expected_metadata_work_unit = create_metadata_work_unit(timestamp)

        # NOTE(review): ``sorted()`` over a dict yields its keys only, so this
        # compares the flattened attribute names of the two work units, not
        # their values. Tightening it to compare values should be confirmed
        # against the source's actual output first.
        # ``assertEqual`` (rather than ``assertTrue(a == b)``) shows a diff of
        # the two lists when the check fails.
        self.assertEqual(
            sorted(flatten(vars(expected_metadata_work_unit))),
            sorted(flatten(vars(actual_work_unit))),
        )
|
||
|
|
||
|
|
||
|
def create_metadata_work_unit(timestamp):
    """Build the work unit expected for the ``datalake_grilled.Barbeque`` table.

    Args:
        timestamp: Value used as the ``time`` of every audit stamp in the event.

    Returns:
        A ``MetadataWorkUnit`` with id ``glue-datalake_grilled.Barbeque`` whose
        event carries a dataset snapshot with ownership, properties, status and
        schema-metadata aspects, in that order.
    """
    ownership = OwnershipClass(
        owners=[
            OwnerClass(
                owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
            )
        ],
        lastModified=AuditStampClass(
            time=timestamp, actor="urn:li:corpuser:datahub"
        ),
    )
    properties = DatasetPropertiesClass(
        description="Grilled Food",
        customProperties={},
        uri=None,
        tags=[],
    )

    # The single column of the table.
    size_column = SchemaField(
        fieldPath="Size",
        nativeDataType="int",
        type=SchemaFieldDataType(type=NumberTypeClass()),
        description="Maximum attendees permitted",
        nullable=True,
    )
    schema = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=[size_column],
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )

    snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )
    snapshot.aspects.extend([ownership, properties, Status(removed=False)])

    event = MetadataChangeEvent()
    event.proposedSnapshot = snapshot

    # Schema metadata goes last in the aspect list.
    snapshot.aspects.append(schema)

    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=event)
|