# datahub/metadata-ingestion/tests/unit/test_glue_source.py
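#
# Unit tests for the AWS Glue ingestion source: verifies get_column_type's mapping of
# Glue field types to schema classes, and MetadataChangeEvent work unit generation
# from a stubbed boto3 Glue client.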


import unittest
from datetime import datetime
from botocore.stub import Stubber
from freezegun import freeze_time
from datahub.ingestion.source.glue import GlueSource, GlueSourceConfig, get_column_type
from datahub.ingestion.source.metadata_common import MetadataWorkUnit
from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp, Status
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    ArrayTypeClass,
    MapTypeClass,
    MySqlDDL,
    NumberTypeClass,
    SchemaField,
    SchemaFieldDataType,
    SchemaMetadata,
    StringTypeClass,
)
from datahub.metadata.schema_classes import (
    AuditStampClass,
    DatasetPropertiesClass,
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)
FROZEN_TIME = "2020-04-14 07:00:00"


class GlueSourceTest(unittest.TestCase):
    glue_source = GlueSource(ctx=None, config=GlueSourceConfig(aws_region="us-east-1"))

    def test_get_column_type_contains_key(self):
        field_type = "char"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=StringTypeClass()).to_obj()
        )

    def test_get_column_type_contains_array(self):
        field_type = "array_lol"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=ArrayTypeClass()).to_obj()
        )

    def test_get_column_type_contains_map(self):
        field_type = "map_hehe"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=MapTypeClass()).to_obj()
        )

    def test_get_column_type_contains_set(self):
        field_type = "set_yolo"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=ArrayTypeClass()).to_obj()
        )

    def test_get_column_type_not_contained(self):
        field_type = "bad_column_type"
        data_type = get_column_type(self.glue_source, field_type, "a_table", "a_field")
        self.assertEqual(
            data_type.to_obj(), SchemaFieldDataType(type=StringTypeClass()).to_obj()
        )
        self.assertEqual(
            self.glue_source.report.warnings["bad_column_type"],
            [
                "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
                "setting as StringTypeClass."
            ],
        )

    @freeze_time(FROZEN_TIME)
    def test_turn_boto_glue_data_to_metadata_event(self):
        stringy_timestamp = datetime.strptime(FROZEN_TIME, "%Y-%m-%d %H:%M:%S")
        timestamp = int(datetime.timestamp(stringy_timestamp) * 1000)

        # Canned Glue search_tables response describing a single table with one column.
        response = {
            "TableList": [
                {
                    "Name": "Barbeque",
                    "Owner": "Susan",
                    "DatabaseName": "datalake_grilled",
                    "Description": "Grilled Food",
                    "StorageDescriptor": {
                        "Columns": [
                            {
                                "Name": "Size",
                                "Type": "int",
                                "Comment": "Maximum attendees permitted",
                            }
                        ]
                    },
                }
            ]
        }

        def flatten(d):
            # Flatten nested dicts/lists into a single dict with underscore-joined
            # keys, so the expected and actual work units can be compared.
            out = {}
            for key, val in d.items():
                if isinstance(val, dict):
                    val = [val]
                if isinstance(val, list):
                    for subdict in val:
                        deeper = flatten(subdict).items()
                        out.update({key + "_" + key2: val2 for key2, val2 in deeper})
                else:
                    out[key] = val
            return out

        # Stub the boto3 Glue client so search_tables returns the canned response above.
        with Stubber(self.glue_source.glue_client) as stubber:
            stubber.add_response("search_tables", response, {})
            actual_work_unit = next(self.glue_source.get_workunits())

        expected_metadata_work_unit = create_metadata_work_unit(timestamp)

        self.assertTrue(
            sorted(flatten(vars(expected_metadata_work_unit)))
            == sorted(flatten(vars(actual_work_unit)))
        )
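

# Builds the MetadataWorkUnit the Glue source is expected to emit for the stubbed
# "Barbeque" table above.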
def create_metadata_work_unit(timestamp):
    mce = MetadataChangeEvent()

    dataset_snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[],
    )
    dataset_snapshot.aspects.append(
        OwnershipClass(
            owners=[
                OwnerClass(
                    owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
                )
            ],
            lastModified=AuditStampClass(
                time=timestamp, actor="urn:li:corpuser:datahub"
            ),
        )
    )
    dataset_snapshot.aspects.append(
        DatasetPropertiesClass(
            description="Grilled Food",
            customProperties={},
            uri=None,
            tags=[],
        )
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    mce.proposedSnapshot = dataset_snapshot

    fields = [
        SchemaField(
            fieldPath="Size",
            nativeDataType="int",
            type=SchemaFieldDataType(type=NumberTypeClass()),
            description="Maximum attendees permitted",
            nullable=True,
        )
    ]

    schema_metadata = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=fields,
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )
    dataset_snapshot.aspects.append(schema_metadata)

    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=mce)