# datahub/metadata-ingestion/tests/unit/test_glue_source_stubs.py
#
# 778 lines
# 38 KiB
# Python

import datetime
import io
from typing import Any, Dict
from botocore.response import StreamingBody
# Stub payload shaped like a Glue ``get_databases`` response: a "DatabaseList"
# holding two databases ("flights-database" and "test-database"), both with
# CatalogId "123412341234" and identical default table permissions.
get_databases_response = {
"DatabaseList": [
{
"Name": "flights-database",
"CreateTime": datetime.datetime(2021, 6, 9, 14, 14, 19),
"CreateTableDefaultPermissions": [
{
"Principal": {
"DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
},
"Permissions": ["ALL"],
}
],
"CatalogId": "123412341234",
},
{
"Name": "test-database",
"CreateTime": datetime.datetime(2021, 6, 1, 14, 55, 2),
"CreateTableDefaultPermissions": [
{
"Principal": {
"DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
},
"Permissions": ["ALL"],
}
],
"CatalogId": "123412341234",
},
]
}
# Per-database lookups: each maps one database name to a minimal
# {"Name", "CatalogId"} record. The names and catalog ids agree with the
# entries in get_databases_response.
databases_1 = {
"flights-database": {"Name": "flights-database", "CatalogId": "123412341234"}
}
databases_2 = {"test-database": {"Name": "test-database", "CatalogId": "123412341234"}}
# Table fixtures for "flights-database": a single Avro-backed external table
# named "avro", located at s3://crawler-public-us-west-2/flight/avro/ and
# partitioned by "year".
# NOTE(review): StorageDescriptor["Columns"] lists 7 columns, while the
# embedded "avro.schema.literal" declares 11 fields (dest, depdelay,
# carrierdelay and weatherdelay have no matching column) - confirm that this
# mismatch is intentional.
tables_1 = [
{
"Name": "avro",
"DatabaseName": "flights-database",
"Owner": "owner",
"CreateTime": datetime.datetime(2021, 6, 9, 14, 17, 35),
"UpdateTime": datetime.datetime(2021, 6, 9, 14, 17, 35),
"LastAccessTime": datetime.datetime(2021, 6, 9, 14, 17, 35),
"Retention": 0,
"StorageDescriptor": {
"Columns": [
{"Name": "yr", "Type": "int"},
{"Name": "flightdate", "Type": "string"},
{"Name": "uniquecarrier", "Type": "string"},
{"Name": "airlineid", "Type": "int"},
{"Name": "carrier", "Type": "string"},
{"Name": "flightnum", "Type": "string"},
{"Name": "origin", "Type": "string"},
],
"Location": "s3://crawler-public-us-west-2/flight/avro/",
"InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
"Compressed": False,
"NumberOfBuckets": -1,
"SerdeInfo": {
"SerializationLibrary": "org.apache.hadoop.hive.serde2.avro.AvroSerDe",
"Parameters": {
"avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}',
"serialization.format": "1",
},
},
"BucketColumns": [],
"SortColumns": [],
"Parameters": {
"CrawlerSchemaDeserializerVersion": "1.0",
"CrawlerSchemaSerializerVersion": "1.0",
"UPDATED_BY_CRAWLER": "flights-crawler",
"averageRecordSize": "55",
"avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}',
"classification": "avro",
"compressionType": "none",
"objectCount": "30",
"recordCount": "169222196",
"sizeKey": "9503351413",
"typeOfData": "file",
},
"StoredAsSubDirectories": False,
},
"PartitionKeys": [{"Name": "year", "Type": "string"}],
"TableType": "EXTERNAL_TABLE",
"Parameters": {
"CrawlerSchemaDeserializerVersion": "1.0",
"CrawlerSchemaSerializerVersion": "1.0",
"UPDATED_BY_CRAWLER": "flights-crawler",
"averageRecordSize": "55",
"avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}',
"classification": "avro",
"compressionType": "none",
"objectCount": "30",
"recordCount": "169222196",
"sizeKey": "9503351413",
"typeOfData": "file",
},
"CreatedBy": "arn:aws:sts::123412341234:assumed-role/AWSGlueServiceRole-flights-crawler/AWS-Crawler",
"IsRegisteredWithLakeFormation": False,
"CatalogId": "123412341234",
}
]
# Stub payload shaped like a Glue ``get_tables`` response wrapping tables_1.
get_tables_response_1 = {"TableList": tables_1}
# Table fixtures for "test-database": a JSON-backed table
# ("test_jsons_markers") with one nested array<struct<...>> column, and a
# Parquet-backed table ("test_parquet") partitioned by "year".
# NOTE(review): both tables carry CatalogId "795586375822", whereas the
# "test-database" entries in get_databases_response and databases_2 use
# "123412341234" - confirm that the differing catalog ids are intentional.
tables_2 = [
{
"Name": "test_jsons_markers",
"DatabaseName": "test-database",
"Owner": "owner",
"CreateTime": datetime.datetime(2021, 6, 2, 12, 6, 59),
"UpdateTime": datetime.datetime(2021, 6, 2, 12, 6, 59),
"LastAccessTime": datetime.datetime(2021, 6, 2, 12, 6, 59),
"Retention": 0,
"StorageDescriptor": {
"Columns": [
{
"Name": "markers",
"Type": "array<struct<name:string,position:array<double>,location:array<double>>>",
}
],
"Location": "s3://test-glue-jsons/markers/",
"InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"Compressed": False,
"NumberOfBuckets": -1,
"SerdeInfo": {
"SerializationLibrary": "org.openx.data.jsonserde.JsonSerDe",
"Parameters": {"paths": "markers"},
},
"BucketColumns": [],
"SortColumns": [],
"Parameters": {
"CrawlerSchemaDeserializerVersion": "1.0",
"CrawlerSchemaSerializerVersion": "1.0",
"UPDATED_BY_CRAWLER": "test-jsons",
"averageRecordSize": "273",
"classification": "json",
"compressionType": "none",
"objectCount": "1",
"recordCount": "1",
"sizeKey": "273",
"typeOfData": "file",
},
"StoredAsSubDirectories": False,
},
"PartitionKeys": [],
"TableType": "EXTERNAL_TABLE",
"Parameters": {
"CrawlerSchemaDeserializerVersion": "1.0",
"CrawlerSchemaSerializerVersion": "1.0",
"UPDATED_BY_CRAWLER": "test-jsons",
"averageRecordSize": "273",
"classification": "json",
"compressionType": "none",
"objectCount": "1",
"recordCount": "1",
"sizeKey": "273",
"typeOfData": "file",
},
"CreatedBy": "arn:aws:sts::795586375822:assumed-role/AWSGlueServiceRole-test-crawler/AWS-Crawler",
"IsRegisteredWithLakeFormation": False,
"CatalogId": "795586375822",
},
{
"Name": "test_parquet",
"DatabaseName": "test-database",
"Owner": "owner",
"CreateTime": datetime.datetime(2021, 6, 1, 16, 14, 53),
"UpdateTime": datetime.datetime(2021, 6, 1, 16, 14, 53),
"LastAccessTime": datetime.datetime(2021, 6, 1, 16, 14, 53),
"Retention": 0,
"StorageDescriptor": {
"Columns": [
{"Name": "yr", "Type": "int"},
{"Name": "quarter", "Type": "int"},
{"Name": "month", "Type": "int"},
{"Name": "dayofmonth", "Type": "int"},
],
"Location": "s3://crawler-public-us-west-2/flight/parquet/",
"InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"Compressed": False,
"NumberOfBuckets": -1,
"SerdeInfo": {
"SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"Parameters": {"serialization.format": "1"},
},
"BucketColumns": [],
"SortColumns": [],
"Parameters": {
"CrawlerSchemaDeserializerVersion": "1.0",
"CrawlerSchemaSerializerVersion": "1.0",
"UPDATED_BY_CRAWLER": "test",
"averageRecordSize": "19",
"classification": "parquet",
"compressionType": "none",
"objectCount": "60",
"recordCount": "167497743",
"sizeKey": "4463574900",
"typeOfData": "file",
},
"StoredAsSubDirectories": False,
},
"PartitionKeys": [{"Name": "year", "Type": "string"}],
"TableType": "EXTERNAL_TABLE",
"Parameters": {
"CrawlerSchemaDeserializerVersion": "1.0",
"CrawlerSchemaSerializerVersion": "1.0",
"UPDATED_BY_CRAWLER": "test",
"averageRecordSize": "19",
"classification": "parquet",
"compressionType": "none",
"objectCount": "60",
"recordCount": "167497743",
"sizeKey": "4463574900",
"typeOfData": "file",
},
"CreatedBy": "arn:aws:sts::795586375822:assumed-role/AWSGlueServiceRole-test-crawler/AWS-Crawler",
"IsRegisteredWithLakeFormation": False,
"CatalogId": "795586375822",
},
]
# Stub payload shaped like a Glue ``get_tables`` response wrapping tables_2.
get_tables_response_2 = {"TableList": tables_2}
# Stub payload shaped like a Glue ``get_jobs`` response: two "glueetl" jobs,
# "test-job-1" and "test-job-2". Apart from name, description, timestamps and
# script location (scripts/job-1.py vs scripts/job-2.py) the two job
# definitions are identical.
get_jobs_response = {
"Jobs": [
{
"Name": "test-job-1",
"Description": "The first test job",
"Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
"CreatedOn": datetime.datetime(2021, 6, 10, 16, 51, 25, 690000),
"LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 55, 35, 307000),
"ExecutionProperty": {"MaxConcurrentRuns": 1},
"Command": {
"Name": "glueetl",
"ScriptLocation": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-1.py",
"PythonVersion": "3",
},
"DefaultArguments": {
"--TempDir": "s3://aws-glue-assets-123412341234-us-west-2/temporary/",
"--class": "GlueApp",
"--enable-continuous-cloudwatch-log": "true",
"--enable-glue-datacatalog": "true",
"--enable-metrics": "true",
"--enable-spark-ui": "true",
"--encryption-type": "sse-s3",
"--job-bookmark-option": "job-bookmark-enable",
"--job-language": "python",
"--spark-event-logs-path": "s3://aws-glue-assets-123412341234-us-west-2/sparkHistoryLogs/",
},
"MaxRetries": 3,
"AllocatedCapacity": 10,
"Timeout": 2880,
"MaxCapacity": 10.0,
"WorkerType": "G.1X",
"NumberOfWorkers": 10,
"GlueVersion": "2.0",
},
{
"Name": "test-job-2",
"Description": "The second test job",
"Role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler",
"CreatedOn": datetime.datetime(2021, 6, 10, 16, 58, 32, 469000),
"LastModifiedOn": datetime.datetime(2021, 6, 10, 16, 58, 32, 469000),
"ExecutionProperty": {"MaxConcurrentRuns": 1},
"Command": {
"Name": "glueetl",
"ScriptLocation": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-2.py",
"PythonVersion": "3",
},
"DefaultArguments": {
"--TempDir": "s3://aws-glue-assets-123412341234-us-west-2/temporary/",
"--class": "GlueApp",
"--enable-continuous-cloudwatch-log": "true",
"--enable-glue-datacatalog": "true",
"--enable-metrics": "true",
"--enable-spark-ui": "true",
"--encryption-type": "sse-s3",
"--job-bookmark-option": "job-bookmark-enable",
"--job-language": "python",
"--spark-event-logs-path": "s3://aws-glue-assets-123412341234-us-west-2/sparkHistoryLogs/",
},
"MaxRetries": 3,
"AllocatedCapacity": 10,
"Timeout": 2880,
"MaxCapacity": 10.0,
"WorkerType": "G.1X",
"NumberOfWorkers": 10,
"GlueVersion": "2.0",
},
]
}
# Dataflow-graph stub for job 1: "DagNodes" describes the script's steps (ids
# carry a "_job1" suffix) and "DagEdges" the frame connections between them.
# The node arguments appear to mirror the script text in get_object_body_1.
get_dataflow_graph_response_1 = {
"DagNodes": [
{
"Id": "Transform0_job1",
"NodeType": "Filter",
"Args": [
{"Name": "f", "Value": "lambda row : ()", "Param": False},
{
"Name": "transformation_ctx",
"Value": '"Transform0"',
"Param": False,
},
],
"LineNumber": 32,
},
{
"Id": "Transform1_job1",
"NodeType": "ApplyMapping",
"Args": [
{
"Name": "mappings",
"Value": '[("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")]',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform1"',
"Param": False,
},
],
"LineNumber": 37,
},
{
"Id": "Transform2_job1",
"NodeType": "ApplyMapping",
"Args": [
{
"Name": "mappings",
"Value": '[("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")]',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform2"',
"Param": False,
},
],
"LineNumber": 22,
},
{
"Id": "Transform3_job1",
"NodeType": "Join",
"Args": [
{
"Name": "keys2",
"Value": '["(right) flightdate"]',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform3"',
"Param": False,
},
{"Name": "keys1", "Value": '["yr"]', "Param": False},
],
"LineNumber": 47,
},
{
"Id": "DataSource0_job1",
"NodeType": "DataSource",
"Args": [
{
"Name": "database",
"Value": '"flights-database"',
"Param": False,
},
{"Name": "table_name", "Value": '"avro"', "Param": False},
{
"Name": "transformation_ctx",
"Value": '"DataSource0"',
"Param": False,
},
],
"LineNumber": 17,
},
{
"Id": "DataSink0_job1",
"NodeType": "DataSink",
"Args": [
{
"Name": "database",
"Value": '"test-database"',
"Param": False,
},
{
"Name": "table_name",
"Value": '"test_jsons_markers"',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"DataSink0"',
"Param": False,
},
],
"LineNumber": 57,
},
{
"Id": "Transform4_job1",
"NodeType": "ApplyMapping",
"Args": [
{
"Name": "mappings",
"Value": '[("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")]',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform4"',
"Param": False,
},
],
"LineNumber": 27,
},
{
"Id": "Transform5_job1",
"NodeType": "ApplyMapping",
"Args": [
{
"Name": "mappings",
"Value": '[("yr", "int", "(right) yr", "int"), ("flightdate", "string", "(right) flightdate", "string"), ("uniquecarrier", "string", "(right) uniquecarrier", "string"), ("airlineid", "int", "(right) airlineid", "int"), ("carrier", "string", "(right) carrier", "string"), ("flightnum", "string", "(right) flightnum", "string"), ("origin", "string", "(right) origin", "string"), ("dest", "string", "(right) dest", "string"), ("depdelay", "int", "(right) depdelay", "int"), ("carrierdelay", "int", "(right) carrierdelay", "int"), ("weatherdelay", "int", "(right) weatherdelay", "int"), ("year", "string", "(right) year", "string")]',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform5"',
"Param": False,
},
],
"LineNumber": 42,
},
{
"Id": "DataSink1_job1",
"NodeType": "DataSink",
"Args": [
{"Name": "connection_type", "Value": '"s3"', "Param": False},
{"Name": "format", "Value": '"json"', "Param": False},
{
"Name": "connection_options",
"Value": '{"path": "s3://test-glue-jsons/", "partitionKeys": []}',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"DataSink1"',
"Param": False,
},
],
"LineNumber": 52,
},
],
# NOTE(review): only 4 edges are listed here, while the "@inputs" annotations
# in get_object_body_1 describe 9 connections. No edge targets
# Transform4_job1, Transform5_job1, DataSink0_job1 or DataSink1_job1, and the
# Join's "frame2" input is absent - confirm whether this is intentional.
"DagEdges": [
{
"Source": "Transform2_job1",
"Target": "Transform0_job1",
"TargetParameter": "frame",
},
{
"Source": "Transform0_job1",
"Target": "Transform1_job1",
"TargetParameter": "frame",
},
{
"Source": "DataSource0_job1",
"Target": "Transform2_job1",
"TargetParameter": "frame",
},
{
"Source": "Transform4_job1",
"Target": "Transform3_job1",
"TargetParameter": "frame1",
},
],
}
# Dataflow-graph stub for job 2: "DagNodes" describes the script's steps (ids
# carry a "_job2" suffix) and "DagEdges" the frame connections between them.
# The 5 edges match the 5 non-empty "@inputs" annotations in get_object_body_2.
get_dataflow_graph_response_2 = {
"DagNodes": [
{
"Id": "Transform0_job2",
"NodeType": "SplitFields",
"Args": [
# NOTE(review): 7 paths are listed here, but the SplitFields call in
# get_object_body_2 passes 9 (it also has "airlineid" and "carrier") -
# confirm whether the shorter list is intentional.
{
"Name": "paths",
"Value": '["yr", "quarter", "month", "dayofmonth", "dayofweek", "flightdate", "uniquecarrier"]',
"Param": False,
},
{
"Name": "name2",
"Value": '"Transform0Output1"',
"Param": False,
},
{
"Name": "name1",
"Value": '"Transform0Output0"',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform0"',
"Param": False,
},
],
"LineNumber": 42,
},
{
"Id": "Transform1_job2",
"NodeType": "ApplyMapping",
"Args": [
{
"Name": "mappings",
"Value": '[("yr", "int", "yr", "int"), ("quarter", "int", "quarter", "int"), ("month", "int", "month", "int"), ("dayofmonth", "int", "dayofmonth", "int"), ("dayofweek", "int", "dayofweek", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string")]',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform1"',
"Param": False,
},
],
"LineNumber": 22,
},
{
"Id": "Transform2_job2",
"NodeType": "FillMissingValues",
"Args": [
{
"Name": "missing_values_column",
"Value": '"dayofmonth"',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"Transform2"',
"Param": False,
},
],
"LineNumber": 27,
},
{
"Id": "Transform3_job2",
"NodeType": "SelectFields",
"Args": [
{"Name": "paths", "Value": "[]", "Param": False},
{
"Name": "transformation_ctx",
"Value": '"Transform3"',
"Param": False,
},
],
"LineNumber": 32,
},
{
"Id": "DataSource0_job2",
"NodeType": "DataSource",
"Args": [
{
"Name": "database",
"Value": '"test-database"',
"Param": False,
},
{
"Name": "table_name",
"Value": '"test_parquet"',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"DataSource0"',
"Param": False,
},
],
"LineNumber": 17,
},
{
"Id": "DataSink0_job2",
"NodeType": "DataSink",
"Args": [
{"Name": "connection_type", "Value": '"s3"', "Param": False},
{"Name": "format", "Value": '"json"', "Param": False},
{
"Name": "connection_options",
"Value": '{"path": "s3://test-glue-jsons/", "partitionKeys": []}',
"Param": False,
},
{
"Name": "transformation_ctx",
"Value": '"DataSink0"',
"Param": False,
},
],
"LineNumber": 37,
},
],
"DagEdges": [
{
"Source": "Transform1_job2",
"Target": "Transform0_job2",
"TargetParameter": "frame",
},
{
"Source": "DataSource0_job2",
"Target": "Transform1_job2",
"TargetParameter": "frame",
},
{
"Source": "Transform1_job2",
"Target": "Transform2_job2",
"TargetParameter": "frame",
},
{
"Source": "Transform2_job2",
"Target": "Transform3_job2",
"TargetParameter": "frame",
},
{
"Source": "Transform3_job2",
"Target": "DataSink0_job2",
"TargetParameter": "frame",
},
],
}
# Source text of a Glue PySpark job script, served as the S3 object body by
# get_object_response_1(). The script reads flights-database.avro, chains
# ApplyMapping / Filter / Join transforms, and writes the joined frame both to
# an S3 JSON sink and to the test-database.test_jsons_markers catalog table.
# This is runtime data: the string content must not be reformatted.
get_object_body_1 = """
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import re
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "flights-database", table_name = "avro", transformation_ctx = "DataSource0"]
## @return: DataSource0
## @inputs: []
DataSource0 = glueContext.create_dynamic_frame.from_catalog(database = "flights-database", table_name = "avro", transformation_ctx = "DataSource0")
## @type: ApplyMapping
## @args: [mappings = [("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")], transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame = DataSource0]
Transform2 = ApplyMapping.apply(frame = DataSource0, mappings = [("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")], transformation_ctx = "Transform2")
## @type: ApplyMapping
## @args: [mappings = [("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")], transformation_ctx = "Transform4"]
## @return: Transform4
## @inputs: [frame = Transform2]
Transform4 = ApplyMapping.apply(frame = Transform2, mappings = [("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")], transformation_ctx = "Transform4")
## @type: Filter
## @args: [f = lambda row : (), transformation_ctx = "Transform0"]
## @return: Transform0
## @inputs: [frame = Transform2]
Transform0 = Filter.apply(frame = Transform2, f = lambda row : (), transformation_ctx = "Transform0")
## @type: ApplyMapping
## @args: [mappings = [("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")], transformation_ctx = "Transform1"]
## @return: Transform1
## @inputs: [frame = Transform0]
Transform1 = ApplyMapping.apply(frame = Transform0, mappings = [("yr", "int", "yr", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string"), ("flightnum", "string", "flightnum", "string"), ("origin", "string", "origin", "string"), ("dest", "string", "dest", "string"), ("depdelay", "int", "depdelay", "int"), ("carrierdelay", "int", "carrierdelay", "int"), ("weatherdelay", "int", "weatherdelay", "int"), ("year", "string", "year", "string")], transformation_ctx = "Transform1")
## @type: ApplyMapping
## @args: [mappings = [("yr", "int", "(right) yr", "int"), ("flightdate", "string", "(right) flightdate", "string"), ("uniquecarrier", "string", "(right) uniquecarrier", "string"), ("airlineid", "int", "(right) airlineid", "int"), ("carrier", "string", "(right) carrier", "string"), ("flightnum", "string", "(right) flightnum", "string"), ("origin", "string", "(right) origin", "string"), ("dest", "string", "(right) dest", "string"), ("depdelay", "int", "(right) depdelay", "int"), ("carrierdelay", "int", "(right) carrierdelay", "int"), ("weatherdelay", "int", "(right) weatherdelay", "int"), ("year", "string", "(right) year", "string")], transformation_ctx = "Transform5"]
## @return: Transform5
## @inputs: [frame = Transform1]
Transform5 = ApplyMapping.apply(frame = Transform1, mappings = [("yr", "int", "(right) yr", "int"), ("flightdate", "string", "(right) flightdate", "string"), ("uniquecarrier", "string", "(right) uniquecarrier", "string"), ("airlineid", "int", "(right) airlineid", "int"), ("carrier", "string", "(right) carrier", "string"), ("flightnum", "string", "(right) flightnum", "string"), ("origin", "string", "(right) origin", "string"), ("dest", "string", "(right) dest", "string"), ("depdelay", "int", "(right) depdelay", "int"), ("carrierdelay", "int", "(right) carrierdelay", "int"), ("weatherdelay", "int", "(right) weatherdelay", "int"), ("year", "string", "(right) year", "string")], transformation_ctx = "Transform5")
## @type: Join
## @args: [keys2 = ["(right) flightdate"], keys1 = ["yr"], transformation_ctx = "Transform3"]
## @return: Transform3
## @inputs: [frame1 = Transform4, frame2 = Transform5]
Transform3 = Join.apply(frame1 = Transform4, frame2 = Transform5, keys2 = ["(right) flightdate"], keys1 = ["yr"], transformation_ctx = "Transform3")
## @type: DataSink
## @args: [connection_type = "s3", format = "json", connection_options = {"path": "s3://test-glue-jsons/", "partitionKeys": []}, transformation_ctx = "DataSink1"]
## @return: DataSink1
## @inputs: [frame = Transform3]
DataSink1 = glueContext.write_dynamic_frame.from_options(frame = Transform3, connection_type = "s3", format = "json", connection_options = {"path": "s3://test-glue-jsons/", "partitionKeys": []}, transformation_ctx = "DataSink1")
## @type: DataSink
## @args: [database = "test-database", table_name = "test_jsons_markers", transformation_ctx = "DataSink0"]
## @return: DataSink0
## @inputs: [frame = Transform3]
DataSink0 = glueContext.write_dynamic_frame.from_catalog(frame = Transform3, database = "test-database", table_name = "test_jsons_markers", transformation_ctx = "DataSink0")
job.commit()
"""
# Source text of a Glue PySpark job script, served as the S3 object body by
# get_object_response_2(). The script reads test-database.test_parquet,
# applies ApplyMapping -> FillMissingValues -> SelectFields, writes the result
# to an S3 JSON sink, and also runs SplitFields on the mapped frame.
# This is runtime data: the string content must not be reformatted.
get_object_body_2 = """
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglueml.transforms import FillMissingValues
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "test-database", table_name = "test_parquet", transformation_ctx = "DataSource0"]
## @return: DataSource0
## @inputs: []
DataSource0 = glueContext.create_dynamic_frame.from_catalog(database = "test-database", table_name = "test_parquet", transformation_ctx = "DataSource0")
## @type: ApplyMapping
## @args: [mappings = [("yr", "int", "yr", "int"), ("quarter", "int", "quarter", "int"), ("month", "int", "month", "int"), ("dayofmonth", "int", "dayofmonth", "int"), ("dayofweek", "int", "dayofweek", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string")], transformation_ctx = "Transform1"]
## @return: Transform1
## @inputs: [frame = DataSource0]
Transform1 = ApplyMapping.apply(frame = DataSource0, mappings = [("yr", "int", "yr", "int"), ("quarter", "int", "quarter", "int"), ("month", "int", "month", "int"), ("dayofmonth", "int", "dayofmonth", "int"), ("dayofweek", "int", "dayofweek", "int"), ("flightdate", "string", "flightdate", "string"), ("uniquecarrier", "string", "uniquecarrier", "string"), ("airlineid", "int", "airlineid", "int"), ("carrier", "string", "carrier", "string")], transformation_ctx = "Transform1")
## @type: FillMissingValues
## @args: [missing_values_column = "dayofmonth", transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame = Transform1]
Transform2 = FillMissingValues.apply(frame = Transform1, missing_values_column = "dayofmonth", transformation_ctx = "Transform2")
## @type: SelectFields
## @args: [paths = [], transformation_ctx = "Transform3"]
## @return: Transform3
## @inputs: [frame = Transform2]
Transform3 = SelectFields.apply(frame = Transform2, paths = [], transformation_ctx = "Transform3")
## @type: DataSink
## @args: [connection_type = "s3", format = "json", connection_options = {"path": "s3://test-glue-jsons/", "partitionKeys": []}, transformation_ctx = "DataSink0"]
## @return: DataSink0
## @inputs: [frame = Transform3]
DataSink0 = glueContext.write_dynamic_frame.from_options(frame = Transform3, connection_type = "s3", format = "json", connection_options = {"path": "s3://test-glue-jsons/", "partitionKeys": []}, transformation_ctx = "DataSink0")
## @type: SplitFields
## @args: [paths = ["yr", "quarter", "month", "dayofmonth", "dayofweek", "flightdate", "uniquecarrier", "airlineid", "carrier"], name2 = "Transform0Output1", name1 = "Transform0Output0", transformation_ctx = "Transform0"]
## @return: Transform0
## @inputs: [frame = Transform1]
Transform0 = SplitFields.apply(frame = Transform1, paths = ["yr", "quarter", "month", "dayofmonth", "dayofweek", "flightdate", "uniquecarrier", "airlineid", "carrier"], name2 = "Transform0Output1", name1 = "Transform0Output0", transformation_ctx = "Transform0")
job.commit()
"""
def mock_get_object_response(raw_body: str) -> Dict[str, Any]:
    """
    Build a fake s3 client get_object() response around a piece of text.

    The text is UTF-8 encoded and wrapped in a botocore ``StreamingBody``
    whose declared content length is the number of encoded bytes.
    See https://gist.github.com/grantcooksey/132ddc85274a50b94b821302649f9d7b

    Parameters
    ----------
    raw_body:
        Content of the 'Body' field to return

    Returns
    -------
    Dict[str, Any]
        A dict with a single ``"Body"`` key holding the stream.
    """
    payload = raw_body.encode("utf-8")
    return {"Body": StreamingBody(io.BytesIO(payload), len(payload))}
def get_object_response_1() -> Dict[str, Any]:
    """Return a mocked get_object() response whose body is ``get_object_body_1``."""
    script_text = get_object_body_1
    return mock_get_object_response(script_text)
def get_object_response_2() -> Dict[str, Any]:
    """Return a mocked get_object() response whose body is ``get_object_body_2``."""
    script_text = get_object_body_2
    return mock_get_object_response(script_text)
def get_bucket_tagging() -> Dict[str, Any]:
    """Return a stub bucket-tagging payload: a ``"TagSet"`` with the single tag foo=bar."""
    tag = {"Key": "foo", "Value": "bar"}
    return {"TagSet": [tag]}
def get_object_tagging() -> Dict[str, Any]:
    """Return a stub object-tagging payload: a ``"TagSet"`` with the single tag baz=bob."""
    tag = {"Key": "baz", "Value": "bob"}
    return {"TagSet": [tag]}