2021-06-22 11:33:22 -07:00
|
|
|
import json
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
from botocore.stub import Stubber
|
|
|
|
from freezegun import freeze_time
|
|
|
|
|
2021-04-05 19:11:28 -07:00
|
|
|
from datahub.ingestion.api.common import PipelineContext
|
2021-04-04 19:00:27 +01:00
|
|
|
from datahub.ingestion.source.glue import GlueSource, GlueSourceConfig, get_column_type
|
|
|
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
|
|
ArrayTypeClass,
|
|
|
|
MapTypeClass,
|
|
|
|
SchemaFieldDataType,
|
|
|
|
StringTypeClass,
|
|
|
|
)
|
2021-06-22 11:33:22 -07:00
|
|
|
from tests.test_helpers import mce_helpers
|
|
|
|
from tests.unit.test_glue_source_stubs import (
|
|
|
|
get_databases_response,
|
|
|
|
get_dataflow_graph_response_1,
|
|
|
|
get_dataflow_graph_response_2,
|
|
|
|
get_jobs_response,
|
|
|
|
get_object_body_1,
|
|
|
|
get_object_body_2,
|
|
|
|
get_object_response_1,
|
|
|
|
get_object_response_2,
|
|
|
|
get_tables_response_1,
|
|
|
|
get_tables_response_2,
|
2021-04-04 19:00:27 +01:00
|
|
|
)
|
|
|
|
|
|
|
|
FROZEN_TIME = "2020-04-14 07:00:00"
|
|
|
|
|
|
|
|
|
2021-06-22 11:33:22 -07:00
|
|
|
def glue_source() -> GlueSource:
    """Build a fresh ``GlueSource`` for the tests in this module.

    The source is configured for the ``us-west-2`` region with
    ``extract_transforms`` enabled, and runs under a pipeline context whose
    run id is ``glue-source-test``.
    """
    pipeline_context = PipelineContext(run_id="glue-source-test")
    source_config = GlueSourceConfig(aws_region="us-west-2", extract_transforms=True)
    return GlueSource(ctx=pipeline_context, config=source_config)
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
2021-06-22 11:33:22 -07:00
|
|
|
def test_get_column_type_contains_key():
    """The Glue type ``char`` is expected to resolve to a string field type."""
    resolved = get_column_type(glue_source(), "char", "a_table", "a_field")
    expected = SchemaFieldDataType(type=StringTypeClass())

    # Compare the serialised forms, as the schema objects are checked via to_obj().
    assert resolved.to_obj() == expected.to_obj()
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
2021-06-22 11:33:22 -07:00
|
|
|
def test_get_column_type_contains_array():
    """A Glue type named ``array_lol`` is expected to resolve to an array field type."""
    resolved = get_column_type(glue_source(), "array_lol", "a_table", "a_field")
    expected = SchemaFieldDataType(type=ArrayTypeClass())

    # Compare the serialised forms, as the schema objects are checked via to_obj().
    assert resolved.to_obj() == expected.to_obj()
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
2021-06-22 11:33:22 -07:00
|
|
|
def test_get_column_type_contains_map():
    """A Glue type named ``map_hehe`` is expected to resolve to a map field type."""
    resolved = get_column_type(glue_source(), "map_hehe", "a_table", "a_field")
    expected = SchemaFieldDataType(type=MapTypeClass())

    # Compare the serialised forms, as the schema objects are checked via to_obj().
    assert resolved.to_obj() == expected.to_obj()
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
2021-06-22 11:33:22 -07:00
|
|
|
def test_get_column_type_contains_set():
    """A Glue type named ``set_yolo`` is expected to resolve to an array field type."""
    resolved = get_column_type(glue_source(), "set_yolo", "a_table", "a_field")
    expected = SchemaFieldDataType(type=ArrayTypeClass())

    # Compare the serialised forms, as the schema objects are checked via to_obj().
    assert resolved.to_obj() == expected.to_obj()
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
2021-06-22 11:33:22 -07:00
|
|
|
def test_get_column_type_not_contained():
    """An unrecognised Glue type falls back to a string type and is reported.

    The same source instance is used for the lookup and for the report check,
    since the warning is read back from ``source.report.warnings``.
    """
    source = glue_source()
    unknown_type = "bad_column_type"

    resolved = get_column_type(source, unknown_type, "a_table", "a_field")
    assert resolved.to_obj() == SchemaFieldDataType(type=StringTypeClass()).to_obj()

    # Warnings are keyed by the offending type name.
    expected_warning = (
        "The type 'bad_column_type' is not recognised for field 'a_field' in table 'a_table', "
        "setting as StringTypeClass."
    )
    assert source.report.warnings[unknown_type] == [expected_warning]
|
2021-04-04 19:00:27 +01:00
|
|
|
|
2021-04-14 19:25:57 -07:00
|
|
|
|
2021-06-22 11:33:22 -07:00
|
|
|
@freeze_time(FROZEN_TIME)
def test_glue_ingest(tmp_path, pytestconfig):
    """Run the Glue source against stubbed AWS clients and compare with the golden file.

    The Glue and S3 clients of the source are wrapped in botocore ``Stubber``
    objects, the emitted work units are serialised to ``glue_mces.json`` under
    ``tmp_path``, and the result is compared with
    ``tests/unit/glue/glue_mces_golden.json``.
    """
    source = glue_source()
    output_path = tmp_path / "glue_mces.json"
    scripts_bucket = "aws-glue-assets-123412341234-us-west-2"

    with Stubber(source.glue_client) as glue_stubber:
        # Keep the registration order unchanged: botocore's Stubber serves
        # responses in the order they were added.
        glue_stubber.add_response("get_databases", get_databases_response, {})
        for database_name, tables_response in (
            ("flights-database", get_tables_response_1),
            ("test-database", get_tables_response_2),
        ):
            glue_stubber.add_response(
                "get_tables",
                tables_response,
                {"DatabaseName": database_name},
            )
        glue_stubber.add_response("get_jobs", get_jobs_response, {})
        for graph_response, script_body in (
            (get_dataflow_graph_response_1, get_object_body_1),
            (get_dataflow_graph_response_2, get_object_body_2),
        ):
            glue_stubber.add_response(
                "get_dataflow_graph",
                graph_response,
                {"PythonScript": script_body},
            )

        with Stubber(source.s3_client) as s3_stubber:
            for script_key, object_response in (
                ("scripts/job-1.py", get_object_response_1),
                ("scripts/job-2.py", get_object_response_2),
            ):
                s3_stubber.add_response(
                    "get_object",
                    object_response,
                    {
                        "Bucket": scripts_bucket,
                        "Key": script_key,
                    },
                )

            # Consume the work units while both stubbers are still active.
            mce_objects = [wu.mce.to_obj() for wu in source.get_workunits()]

            with open(str(output_path), "w") as f:
                json.dump(mce_objects, f, indent=2)

    # Load the output through the same helper as the golden file before comparing.
    output = mce_helpers.load_json_file(str(output_path))

    golden_path = pytestconfig.rootpath / "tests/unit/glue" / "glue_mces_golden.json"
    golden = mce_helpers.load_json_file(str(golden_path))
    mce_helpers.assert_mces_equal(output, golden)
|