2021-04-04 19:00:27 +01:00
|
|
|
import unittest
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
from botocore.stub import Stubber
|
|
|
|
from freezegun import freeze_time
|
|
|
|
|
2021-04-05 19:11:28 -07:00
|
|
|
from datahub.ingestion.api.common import PipelineContext
|
2021-04-04 19:00:27 +01:00
|
|
|
from datahub.ingestion.source.glue import GlueSource, GlueSourceConfig, get_column_type
|
|
|
|
from datahub.ingestion.source.metadata_common import MetadataWorkUnit
|
|
|
|
from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp, Status
|
|
|
|
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
|
|
|
|
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
|
|
|
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
|
|
ArrayTypeClass,
|
|
|
|
MapTypeClass,
|
|
|
|
MySqlDDL,
|
|
|
|
NumberTypeClass,
|
|
|
|
SchemaField,
|
|
|
|
SchemaFieldDataType,
|
|
|
|
SchemaMetadata,
|
|
|
|
StringTypeClass,
|
|
|
|
)
|
|
|
|
from datahub.metadata.schema_classes import (
|
|
|
|
AuditStampClass,
|
|
|
|
DatasetPropertiesClass,
|
|
|
|
OwnerClass,
|
|
|
|
OwnershipClass,
|
|
|
|
OwnershipTypeClass,
|
|
|
|
)
|
|
|
|
|
|
|
|
# Wall-clock instant passed to ``freeze_time`` for the work-unit test; the same
# string is parsed there to derive the timestamp passed to the expected work unit.
FROZEN_TIME = "2020-04-14 07:00:00"
|
|
|
|
|
|
|
|
|
|
|
|
class GlueSourceTest(unittest.TestCase):
    """Tests for Glue column-type resolution and work-unit generation."""

    # Created once when the class body executes and shared by all test methods.
    glue_source = GlueSource(
        ctx=PipelineContext(run_id="glue-source-test"),
        config=GlueSourceConfig(aws_region="us-east-1"),
    )

    def _check_column_type(self, glue_type, expected_type_class):
        """Resolve ``glue_type`` and compare it with ``expected_type_class``.

        The comparison is done on the ``to_obj()`` form of both the resolved
        value and a ``SchemaFieldDataType`` wrapping a fresh instance of
        ``expected_type_class``.
        """
        resolved = get_column_type(self.glue_source, glue_type, "a_table", "a_field")
        wanted = SchemaFieldDataType(type=expected_type_class())
        self.assertEqual(resolved.to_obj(), wanted.to_obj())

    def test_get_column_type_contains_key(self):
        """``char`` is expected to resolve to a string type."""
        self._check_column_type("char", StringTypeClass)

    def test_get_column_type_contains_array(self):
        """A type name beginning with ``array`` is expected to resolve to an array type."""
        self._check_column_type("array_lol", ArrayTypeClass)

    def test_get_column_type_contains_map(self):
        """A type name beginning with ``map`` is expected to resolve to a map type."""
        self._check_column_type("map_hehe", MapTypeClass)

    def test_get_column_type_contains_set(self):
        """A type name beginning with ``set`` is expected to resolve to an array type."""
        self._check_column_type("set_yolo", ArrayTypeClass)

    def test_get_column_type_not_contained(self):
        """An unknown type is expected to become a string type and log a warning."""
        self._check_column_type("bad_column_type", StringTypeClass)

        expected_warnings = [
            "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
            "setting as StringTypeClass."
        ]
        self.assertEqual(
            self.glue_source.report.warnings["bad_column_type"],
            expected_warnings,
        )

    @freeze_time(FROZEN_TIME)
    def test_turn_boto_glue_data_to_metadata_event(self):
        """Stubbed Glue responses should yield the expected metadata work unit."""
        # Epoch milliseconds for the frozen instant, used in the expected audit stamps.
        frozen_instant = datetime.strptime(FROZEN_TIME, "%Y-%m-%d %H:%M:%S")
        epoch_millis = int(datetime.timestamp(frozen_instant) * 1000)

        database_entry = {
            "Name": "datalake_grilled",
            "Description": "irrelevant",
            "LocationUri": "irrelevant",
            "Parameters": {},
            "CreateTime": datetime(2015, 1, 1),
            "CreateTableDefaultPermissions": [],
            "CatalogId": "irrelevant",
        }
        size_column = {
            "Name": "Size",
            "Type": "int",
            "Comment": "Maximum attendees permitted",
        }
        table_entry = {
            "Name": "Barbeque",
            "Owner": "Susan",
            "DatabaseName": "datalake_grilled",
            "Description": "Grilled Food",
            "StorageDescriptor": {"Columns": [size_column]},
        }
        databases_payload = {"DatabaseList": [database_entry]}
        tables_payload = {"TableList": [table_entry]}

        with Stubber(self.glue_source.glue_client) as glue_stub:
            # Responses are queued in the order the source is expected to call Glue.
            glue_stub.add_response("get_databases", databases_payload, {})
            glue_stub.add_response(
                "get_tables",
                tables_payload,
                {"DatabaseName": "datalake_grilled"},
            )
            produced_unit = list(self.glue_source.get_workunits())[0]

        wanted_unit = create_metadata_work_unit(epoch_millis)

        self.assertEqual(wanted_unit, produced_unit)
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
|
|
|
def create_metadata_work_unit(timestamp):
    """Build the work unit expected for the ``datalake_grilled.Barbeque`` table.

    Args:
        timestamp: Time value placed in every audit stamp of the snapshot.

    Returns:
        A ``MetadataWorkUnit`` with id ``glue-datalake_grilled.Barbeque`` whose
        change event carries a dataset snapshot with status, ownership,
        properties and schema aspects, in that order.
    """
    status_aspect = Status(removed=False)

    susan = OwnerClass(
        owner="urn:li:corpuser:Susan", type=OwnershipTypeClass.DATAOWNER
    )
    ownership_aspect = OwnershipClass(
        owners=[susan],
        lastModified=AuditStampClass(time=timestamp, actor="urn:li:corpuser:datahub"),
    )

    properties_aspect = DatasetPropertiesClass(description="Grilled Food")

    size_field = SchemaField(
        fieldPath="Size",
        nativeDataType="int",
        type=SchemaFieldDataType(type=NumberTypeClass()),
        description="Maximum attendees permitted",
        nullable=True,
        recursive=False,
    )
    schema_aspect = SchemaMetadata(
        schemaName="datalake_grilled.Barbeque",
        version=0,
        fields=[size_field],
        platform="urn:li:dataPlatform:glue",
        created=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        lastModified=AuditStamp(time=timestamp, actor="urn:li:corpuser:etl"),
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
    )

    snapshot = DatasetSnapshot(
        urn="urn:li:dataset:(urn:li:dataPlatform:glue,datalake_grilled.Barbeque,PROD)",
        aspects=[status_aspect, ownership_aspect, properties_aspect, schema_aspect],
    )

    change_event = MetadataChangeEvent(proposedSnapshot=snapshot)
    return MetadataWorkUnit(id="glue-datalake_grilled.Barbeque", mce=change_event)
|