2021-06-22 11:33:22 -07:00
|
|
|
import json
|
2022-03-31 03:50:26 +02:00
|
|
|
from pathlib import Path
|
|
|
|
from typing import Dict, Optional, Tuple, Type
|
2021-04-04 19:00:27 +01:00
|
|
|
|
2022-01-17 14:21:53 -08:00
|
|
|
import pytest
|
2021-04-04 19:00:27 +01:00
|
|
|
from botocore.stub import Stubber
|
|
|
|
from freezegun import freeze_time
|
|
|
|
|
2022-03-31 03:50:26 +02:00
|
|
|
from datahub.configuration.common import ConfigurationError
|
2021-04-05 19:11:28 -07:00
|
|
|
from datahub.ingestion.api.common import PipelineContext
|
2022-01-17 14:21:53 -08:00
|
|
|
from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields
|
|
|
|
from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig
|
2021-04-04 19:00:27 +01:00
|
|
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|
|
|
ArrayTypeClass,
|
|
|
|
MapTypeClass,
|
2022-01-17 14:21:53 -08:00
|
|
|
RecordTypeClass,
|
2021-04-04 19:00:27 +01:00
|
|
|
StringTypeClass,
|
|
|
|
)
|
2022-01-17 14:21:53 -08:00
|
|
|
from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
|
2021-06-22 11:33:22 -07:00
|
|
|
from tests.test_helpers import mce_helpers
|
2022-03-31 03:50:26 +02:00
|
|
|
from tests.test_helpers.type_helpers import PytestConfig
|
2021-06-22 11:33:22 -07:00
|
|
|
from tests.unit.test_glue_source_stubs import (
|
2022-04-29 04:09:06 -04:00
|
|
|
get_bucket_tagging,
|
2021-06-22 11:33:22 -07:00
|
|
|
get_databases_response,
|
|
|
|
get_dataflow_graph_response_1,
|
|
|
|
get_dataflow_graph_response_2,
|
|
|
|
get_jobs_response,
|
|
|
|
get_object_body_1,
|
|
|
|
get_object_body_2,
|
|
|
|
get_object_response_1,
|
|
|
|
get_object_response_2,
|
2022-04-29 04:09:06 -04:00
|
|
|
get_object_tagging,
|
2021-06-22 11:33:22 -07:00
|
|
|
get_tables_response_1,
|
|
|
|
get_tables_response_2,
|
2021-04-04 19:00:27 +01:00
|
|
|
)
|
|
|
|
|
|
|
|
# Fixed timestamp that ``test_glue_ingest`` passes to ``freeze_time``.
FROZEN_TIME = "2020-04-14 07:00:00"
|
|
|
|
|
|
|
|
|
2022-03-31 03:50:26 +02:00
|
|
|
def glue_source(platform_instance: Optional[str] = None) -> GlueSource:
    """Build the ``GlueSource`` used by the tests in this module.

    The source is configured for region ``us-west-2`` with transform
    extraction and both S3 tag options (bucket and object) switched on.

    Args:
        platform_instance: Value forwarded to
            ``GlueSourceConfig.platform_instance``; ``None`` by default.

    Returns:
        A ``GlueSource`` bound to a pipeline context whose run id is
        ``"glue-source-test"``.
    """
    pipeline_context = PipelineContext(run_id="glue-source-test")
    source_config = GlueSourceConfig(
        aws_region="us-west-2",
        extract_transforms=True,
        platform_instance=platform_instance,
        use_s3_bucket_tags=True,
        use_s3_object_tags=True,
    )
    return GlueSource(ctx=pipeline_context, config=source_config)
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
2022-01-17 14:21:53 -08:00
|
|
|
# Parametrization table for ``test_column_type``.
# Key: pytest id of the case.
# Value: (Hive column type string, schema type class expected for that column).
column_type_test_cases: Dict[str, Tuple[str, Type]] = {
    # Primitive type.
    "char": ("char", StringTypeClass),
    # Container and nested types.
    "array": ("array<int>", ArrayTypeClass),
    "map": ("map<string, int>", MapTypeClass),
    "struct": ("struct<a:int, b:string>", RecordTypeClass),
}
|
2021-04-04 19:00:27 +01:00
|
|
|
|
|
|
|
|
2022-01-17 14:21:53 -08:00
|
|
|
@pytest.mark.parametrize(
    "hive_column_type, expected_type",
    column_type_test_cases.values(),
    ids=column_type_test_cases.keys(),
)
def test_column_type(hive_column_type: str, expected_type: Type) -> None:
    """Check the schema field type produced for a Hive column type.

    The Hive type string is converted to an Avro schema, the Avro schema is
    serialized to JSON and turned into schema fields, and the type of the
    first resulting field is compared with ``expected_type``.

    Args:
        hive_column_type: Hive type string, e.g. ``"array<int>"``.
        expected_type: Class the first field's inner ``type`` must be.
    """
    avro_schema = get_avro_schema_for_hive_column(
        f"test_column_{hive_column_type}", hive_column_type
    )
    schema_fields = avro_schema_to_mce_fields(json.dumps(avro_schema))
    actual_schema_field_type = schema_fields[0].type
    # The test requires the exact class (not a subclass), so compare class
    # objects by identity instead of with ``==``.
    assert type(actual_schema_field_type.type) is expected_type
|
2021-04-04 19:00:27 +01:00
|
|
|
|
2021-04-14 19:25:57 -07:00
|
|
|
|
2022-03-31 03:50:26 +02:00
|
|
|
@pytest.mark.parametrize(
    "platform_instance, mce_file, mce_golden_file",
    [
        (None, "glue_mces.json", "glue_mces_golden.json"),
        (
            "some_instance_name",
            "glue_mces_platform_instance.json",
            "glue_mces_platform_instance_golden.json",
        ),
    ],
)
@freeze_time(FROZEN_TIME)
def test_glue_ingest(
    tmp_path: Path,
    pytestconfig: PytestConfig,
    platform_instance: Optional[str],
    mce_file: str,
    mce_golden_file: str,
) -> None:
    """Run the Glue source against stubbed AWS clients and diff a golden file.

    The Glue and S3 clients of the source are wrapped in botocore
    ``Stubber`` objects, the work units produced by the source are dumped as
    JSON into ``tmp_path`` and the result is compared with the golden file
    under ``tests/unit/glue``.

    Args:
        tmp_path: Temporary directory that receives the generated file.
        pytestconfig: Pytest config; supplies the root path and is passed to
            the golden-file helper.
        platform_instance: Platform instance for the source, or ``None``
            (the first parametrized case passes ``None``).
        mce_file: Name of the generated output file.
        mce_golden_file: Name of the golden file to compare against.
    """
    glue_source_instance = glue_source(platform_instance=platform_instance)
    output_path = tmp_path / mce_file

    with Stubber(glue_source_instance.glue_client) as glue_stubber:
        # Stubbed responses are queued per client and served in the order
        # they are added, so the sequence below must stay as is.
        glue_stubber.add_response("get_databases", get_databases_response, {})
        glue_stubber.add_response(
            "get_tables",
            get_tables_response_1,
            {"DatabaseName": "flights-database"},
        )
        glue_stubber.add_response(
            "get_tables",
            get_tables_response_2,
            {"DatabaseName": "test-database"},
        )
        glue_stubber.add_response("get_jobs", get_jobs_response, {})
        glue_stubber.add_response(
            "get_dataflow_graph",
            get_dataflow_graph_response_1,
            {"PythonScript": get_object_body_1},
        )
        glue_stubber.add_response(
            "get_dataflow_graph",
            get_dataflow_graph_response_2,
            {"PythonScript": get_object_body_2},
        )

        with Stubber(glue_source_instance.s3_client) as s3_stubber:
            # One bucket-tagging and one object-tagging response per table
            # listed in the two stubbed ``get_tables`` responses.
            table_count = len(get_tables_response_1["TableList"]) + len(
                get_tables_response_2["TableList"]
            )
            for _ in range(table_count):
                s3_stubber.add_response(
                    "get_bucket_tagging",
                    get_bucket_tagging(),
                )
                s3_stubber.add_response(
                    "get_object_tagging",
                    get_object_tagging(),
                )

            # Script objects of the two stubbed jobs.
            s3_stubber.add_response(
                "get_object",
                get_object_response_1(),
                {
                    "Bucket": "aws-glue-assets-123412341234-us-west-2",
                    "Key": "scripts/job-1.py",
                },
            )
            s3_stubber.add_response(
                "get_object",
                get_object_response_2(),
                {
                    "Bucket": "aws-glue-assets-123412341234-us-west-2",
                    "Key": "scripts/job-2.py",
                },
            )

            mce_objects = [
                wu.metadata.to_obj() for wu in glue_source_instance.get_workunits()
            ]

            # Every queued response must have been consumed by the source.
            glue_stubber.assert_no_pending_responses()
            s3_stubber.assert_no_pending_responses()

            with output_path.open("w") as f:
                json.dump(mce_objects, f, indent=2)

    # Verify the output.
    test_resources_dir = pytestconfig.rootpath / "tests/unit/glue"
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=output_path,
        golden_path=test_resources_dir / mce_golden_file,
    )
|
2021-08-06 22:49:21 +05:30
|
|
|
|
|
|
|
|
|
|
|
def test_underlying_platform_takes_precendence():
    """A source configured with ``underlying_platform="athena"`` reports ``athena``."""
    pipeline_context = PipelineContext(run_id="glue-source-test")
    athena_config = GlueSourceConfig(
        aws_region="us-west-2",
        underlying_platform="athena",
    )
    source = GlueSource(ctx=pipeline_context, config=athena_config)
    assert source.platform == "athena"
|
2021-08-06 22:49:21 +05:30
|
|
|
|
|
|
|
|
2022-03-31 03:50:26 +02:00
|
|
|
def test_platform_takes_precendence_over_underlying_platform():
    """With ``platform="athena"`` and ``underlying_platform="glue"`` the source reports ``athena``."""
    pipeline_context = PipelineContext(run_id="glue-source-test")
    both_platforms_config = GlueSourceConfig(
        aws_region="us-west-2",
        platform="athena",
        underlying_platform="glue",
    )
    source = GlueSource(ctx=pipeline_context, config=both_platforms_config)
    assert source.platform == "athena"
|
|
|
|
|
|
|
|
|
|
|
|
def test_underlying_platform_must_be_valid():
    """Building a source with ``underlying_platform="data-warehouse"`` raises ``ConfigurationError``."""
    with pytest.raises(ConfigurationError):
        # Context, config and source are all built inside the ``raises``
        # block, so an error raised at any of these steps is caught.
        pipeline_context = PipelineContext(run_id="glue-source-test")
        invalid_config = GlueSourceConfig(
            aws_region="us-west-2",
            underlying_platform="data-warehouse",
        )
        GlueSource(ctx=pipeline_context, config=invalid_config)
|
|
|
|
|
|
|
|
|
|
|
|
def test_platform_must_be_valid():
    """Building a source with ``platform="data-warehouse"`` raises ``ConfigurationError``."""
    with pytest.raises(ConfigurationError):
        # Context, config and source are all built inside the ``raises``
        # block, so an error raised at any of these steps is caught.
        pipeline_context = PipelineContext(run_id="glue-source-test")
        invalid_config = GlueSourceConfig(
            aws_region="us-west-2",
            platform="data-warehouse",
        )
        GlueSource(ctx=pipeline_context, config=invalid_config)
|
2021-08-06 22:49:21 +05:30
|
|
|
|
|
|
|
|
|
|
|
def test_without_underlying_platform():
    """A source configured with only ``aws_region`` reports the ``glue`` platform."""
    pipeline_context = PipelineContext(run_id="glue-source-test")
    region_only_config = GlueSourceConfig(aws_region="us-west-2")
    source = GlueSource(ctx=pipeline_context, config=region_only_config)
    assert source.platform == "glue"
|