From f61a040555b6ab706f6f6f7701927f577af5fe7d Mon Sep 17 00:00:00 2001 From: skrydal Date: Wed, 14 Sep 2022 21:25:09 +0200 Subject: [PATCH] feat(ingestion) Add more info to glue entities (#5874) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/aws/glue.py | 108 +-- .../glue/glue_deleted_actor_mces_golden.json | 251 ++----- .../tests/unit/glue/glue_mces_golden.json | 663 ++++++------------ .../glue_mces_platform_instance_golden.json | 520 ++++---------- .../tests/unit/test_glue_source.py | 12 +- .../tests/unit/test_glue_source_stubs.py | 2 + 6 files changed, 483 insertions(+), 1073 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index bd58ecb98d..238e6160fd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1,13 +1,14 @@ import logging -import typing from collections import defaultdict from dataclasses import dataclass, field as dataclass_field from typing import ( Any, + DefaultDict, Dict, Iterable, Iterator, List, + Mapping, Optional, Set, Tuple, @@ -65,7 +66,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.common import Status +from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.com.linkedin.pegasus2avro.schema import ( @@ -414,7 +415,7 @@ class GlueSource(StatefulIngestionSourceBase): flow_urn: str, new_dataset_ids: List[str], new_dataset_mces: List[MetadataChangeEvent], - s3_formats: typing.DefaultDict[str, Set[Union[str, None]]], + s3_formats: DefaultDict[str, Set[Union[str, None]]], ) -> Optional[Dict[str, Any]]: node_type = node["NodeType"] @@ -509,7 +510,7 @@ class GlueSource(StatefulIngestionSourceBase): self, dataflow_graph: Dict[str, Any], flow_urn: str, - s3_formats: typing.DefaultDict[str, Set[Union[str, None]]], + s3_formats: DefaultDict[str, Set[Union[str, None]]], ) -> Tuple[Dict[str, Dict[str, Any]], List[str], List[MetadataChangeEvent]]: """ Prepare a job's DAG for ingestion. @@ -654,7 +655,9 @@ class GlueSource(StatefulIngestionSourceBase): return MetadataWorkUnit(id=f'{job_name}-{node["Id"]}', mce=mce) - def get_all_tables(self) -> List[dict]: + def get_all_tables_and_databases( + self, + ) -> Tuple[Dict, List[Dict]]: def get_tables_from_database(database_name: str) -> List[dict]: new_tables = [] @@ -673,8 +676,8 @@ class GlueSource(StatefulIngestionSourceBase): return new_tables - def get_database_names() -> List[str]: - database_names = [] + def get_databases() -> List[Mapping[str, Any]]: + databases = [] # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_databases paginator = self.glue_client.get_paginator("get_databases") @@ -689,19 +692,25 @@ class GlueSource(StatefulIngestionSourceBase): for page in paginator_response: for db in page["DatabaseList"]: if self.source_config.database_pattern.allowed(db["Name"]): - database_names.append(db["Name"]) + databases.append(db) - return database_names + return databases - if self.source_config.database_pattern.is_fully_specified_allow_list(): - database_names = self.source_config.database_pattern.get_allowed_list() - else: - database_names = get_database_names() + all_databases = get_databases() - all_tables: List[dict] = [] - for database in database_names: - all_tables += get_tables_from_database(database) - return all_tables + databases = { + database["Name"]: database + for database in all_databases + if self.source_config.database_pattern.allowed(database["Name"]) + } + + all_tables: List[dict] = [ + table + for databaseName in databases.keys() + for table in get_tables_from_database(databaseName) + ] + + return databases, all_tables def get_lineage_if_enabled( self, mce: MetadataChangeEventClass @@ -905,14 +914,17 @@ class GlueSource(StatefulIngestionSourceBase): else self.source_config.env, ) - def gen_database_containers(self, database: str) -> Iterable[MetadataWorkUnit]: - domain_urn = self._gen_domain_urn(database) - database_container_key = self.gen_database_key(database) + def gen_database_containers( + self, database: Mapping[str, Any] + ) -> Iterable[MetadataWorkUnit]: + domain_urn = self._gen_domain_urn(database["Name"]) + database_container_key = self.gen_database_key(database["Name"]) container_workunits = gen_containers( container_key=database_container_key, - name=database, + name=database["Name"], sub_types=["Database"], domain_urn=domain_urn, + description=database.get("Description"), ) for wu in container_workunits: @@ -955,7 +967,7 @@ class GlueSource(StatefulIngestionSourceBase): def get_workunits(self) -> Iterable[MetadataWorkUnit]: database_seen = set() - tables = self.get_all_tables() + databases, tables = self.get_all_tables_and_databases() for table in tables: database_name = table["DatabaseName"] @@ -969,19 +981,29 @@ class GlueSource(StatefulIngestionSourceBase): continue if database_name not in database_seen: database_seen.add(database_name) - yield from self.gen_database_containers(database_name) + yield from self.gen_database_containers(databases[database_name]) - mce = self._extract_record(table, full_table_name) - workunit = MetadataWorkUnit(full_table_name, mce=mce) - self.report.report_workunit(workunit) - yield workunit - - dataset_urn: str = make_dataset_urn_with_platform_instance( + dataset_urn = make_dataset_urn_with_platform_instance( platform=self.platform, name=full_table_name, env=self.env, platform_instance=self.source_config.platform_instance, ) + + mce = self._extract_record(dataset_urn, table, full_table_name) + workunit = MetadataWorkUnit(full_table_name, mce=mce) + self.report.report_workunit(workunit) + yield workunit + + # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not + # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp. + workunit = MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=SubTypes(typeNames=["table"]), + ).as_workunit() + self.report.report_workunit(workunit) + yield workunit + yield from self._get_domain_wu( dataset_name=full_table_name, entity_urn=dataset_urn, @@ -1044,9 +1066,7 @@ class GlueSource(StatefulIngestionSourceBase): # in Glue, it's possible for two buckets to have files of different extensions # if this happens, we append the extension in the URN so the sources can be distinguished # see process_dataflow_node() for details - s3_formats: typing.DefaultDict[str, Set[Optional[str]]] = defaultdict( - lambda: set() - ) + s3_formats: DefaultDict[str, Set[Optional[str]]] = defaultdict(lambda: set()) for dag in dags.values(): if dag is not None: for s3_name, extension in self.get_dataflow_s3_names(dag): @@ -1074,7 +1094,9 @@ class GlueSource(StatefulIngestionSourceBase): yield dataset_wu # flake8: noqa: C901 - def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent: + def _extract_record( + self, dataset_urn: str, table: Dict, table_name: str + ) -> MetadataChangeEvent: def get_owner() -> Optional[OwnershipClass]: owner = table.get("Owner") if owner: @@ -1172,7 +1194,7 @@ class GlueSource(StatefulIngestionSourceBase): ) return new_tags - def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata: + def get_schema_metadata() -> SchemaMetadata: schema = table["StorageDescriptor"]["Columns"] fields: List[SchemaField] = [] for field in schema: @@ -1214,27 +1236,21 @@ class GlueSource(StatefulIngestionSourceBase): else None, ) - dataset_urn = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=table_name, - env=self.env, - platform_instance=self.source_config.platform_instance, - ) dataset_snapshot = DatasetSnapshot( urn=dataset_urn, - aspects=[], + aspects=[ + Status(removed=False), + get_dataset_properties(), + get_schema_metadata(), + get_data_platform_instance(), + ], ) - dataset_snapshot.aspects.append(Status(removed=False)) - if self.extract_owners: optional_owner_aspect = get_owner() if optional_owner_aspect is not None: dataset_snapshot.aspects.append(optional_owner_aspect) - dataset_snapshot.aspects.append(get_dataset_properties()) - dataset_snapshot.aspects.append(get_schema_metadata(self)) - dataset_snapshot.aspects.append(get_data_platform_instance()) if ( self.source_config.use_s3_bucket_tags or self.source_config.use_s3_object_tags diff --git a/metadata-ingestion/tests/unit/glue/glue_deleted_actor_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_deleted_actor_mces_golden.json index f4ea106f53..34a884f401 100644 --- a/metadata-ingestion/tests/unit/glue/glue_deleted_actor_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_deleted_actor_mces_golden.json @@ -1,9 +1,7 @@ [ { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { @@ -12,17 +10,12 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -31,17 +24,12 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -50,14 +38,10 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", @@ -67,22 +51,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - } - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -106,11 +74,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -121,17 +84,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -141,11 +99,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -157,19 +111,12 @@ }, "nativeDataType": "array,location:array>>", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -177,19 +124,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -201,19 +141,12 @@ }, "nativeDataType": "array", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -225,41 +158,56 @@ }, "nativeDataType": "array", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array\"}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue", - "instance": null + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } } } ] } }, - "proposedDelta": null, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "glue-2020_04_14-07_00_00" } }, { - "auditHeader": null, "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -268,14 +216,10 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", @@ -285,22 +229,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - } - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -324,11 +252,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -339,17 +262,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -359,11 +277,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=int].yr", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -371,19 +285,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].quarter", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -391,19 +298,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].month", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -411,19 +311,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].dayofmonth", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -431,19 +324,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].year", - "jsonPath": null, "nullable": true, - "description": null, - "created": null, - "lastModified": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -451,41 +337,56 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue", - "instance": null + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } } } ] } }, - "proposedDelta": null, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "glue-2020_04_14-07_00_00" } }, { - "auditHeader": null, "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -494,17 +395,12 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" } }, { - "auditHeader": null, "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -513,10 +409,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "glue-2020_04_14-07_00_00", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "glue-2020_04_14-07_00_00" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index 3f441ad4a8..9d65962c13 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -1,45 +1,35 @@ [ { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "value": "{\"customProperties\": {\"platform\": \"glue\", \"instance\": \"PROD\", \"database\": \"flights-database\"}, \"name\": \"flights-database\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "value": "{\"platform\": \"urn:li:dataPlatform:glue\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "value": "{\"typeNames\": [\"Database\"]}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", @@ -49,22 +39,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - } - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -89,11 +63,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -104,17 +73,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -124,9 +88,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=int].yr", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -134,17 +96,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].flightdate", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -152,17 +109,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].uniquecarrier", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -170,17 +122,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].airlineid", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -188,17 +135,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].carrier", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -206,17 +148,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].flightnum", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -224,17 +161,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].origin", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -242,17 +174,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].year", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -260,103 +187,15 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue", - "instance": null - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" - } - ] - } - } - ] - } - }, - "proposedDelta": null, - "systemMetadata": null - }, - { - "auditHeader": null, - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "value": "{\"container\": \"urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba\"}", - "contentType": "application/json" - }, - "systemMetadata": null - }, - { - "auditHeader": null, - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "value": "{\"customProperties\": {\"platform\": \"glue\", \"instance\": \"PROD\", \"database\": \"test-database\"}, \"name\": \"test-database\"}", - "contentType": "application/json" - }, - "systemMetadata": null - }, - { - "auditHeader": null, - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "value": "{\"platform\": \"urn:li:dataPlatform:glue\"}", - "contentType": "application/json" - }, - "systemMetadata": null - }, - { - "auditHeader": null, - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "value": "{\"typeNames\": [\"Database\"]}", - "contentType": "application/json" - }, - "systemMetadata": null - }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false + "platform": "urn:li:dataPlatform:glue" } }, { @@ -364,17 +203,91 @@ "owners": [ { "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null + "type": "DATAOWNER" } ], "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" } } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:foo:bar" + }, + { + "tag": "urn:li:tag:baz:bob" + } + ] + } + } + ] + } + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "value": "{\"container\": \"urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba\"}", + "contentType": "application/json" + } + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "value": "{\"customProperties\": {\"platform\": \"glue\", \"instance\": \"PROD\", \"database\": \"test-database\"}, \"name\": \"test-database\"}", + "contentType": "application/json" + } + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "value": "{\"platform\": \"urn:li:dataPlatform:glue\"}", + "contentType": "application/json" + } + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"Database\"]}", + "contentType": "application/json" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -398,11 +311,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -413,17 +321,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -433,9 +336,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -447,17 +348,12 @@ }, "nativeDataType": "array,location:array>>", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -465,17 +361,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -487,17 +378,12 @@ }, "nativeDataType": "array", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -509,64 +395,15 @@ }, "nativeDataType": "array", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array\"}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue", - "instance": null - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" - } - ] - } - } - ] - } - }, - "proposedDelta": null, - "systemMetadata": null - }, - { - "auditHeader": null, - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "value": "{\"container\": \"urn:li:container:bdf4342ea6899d162eae685bfe9074a7\"}", - "contentType": "application/json" - }, - "systemMetadata": null - }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false + "platform": "urn:li:dataPlatform:glue" } }, { @@ -574,17 +411,61 @@ "owners": [ { "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null + "type": "DATAOWNER" } ], "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" } } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:foo:bar" + }, + { + "tag": "urn:li:tag:baz:bob" + } + ] + } + } + ] + } + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "value": "{\"container\": \"urn:li:container:bdf4342ea6899d162eae685bfe9074a7\"}", + "contentType": "application/json" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -608,11 +489,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -623,17 +499,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -643,9 +514,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=int].yr", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -653,17 +522,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].quarter", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -671,17 +535,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].month", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -689,17 +548,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].dayofmonth", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -707,17 +561,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].year", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -725,57 +574,68 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue", - "instance": null + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } } }, { "com.linkedin.pegasus2avro.common.GlobalTags": { "tags": [ { - "tag": "urn:li:tag:baz:bob" + "tag": "urn:li:tag:foo:bar" }, { - "tag": "urn:li:tag:foo:bar" + "tag": "urn:li:tag:baz:bob" } ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + } }, { - "auditHeader": null, "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "container", "aspect": { "value": "{\"container\": \"urn:li:container:bdf4342ea6899d162eae685bfe9074a7\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { "urn": "urn:li:dataFlow:(glue,test-job-1,PROD)", @@ -790,18 +650,14 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1", - "description": "The first test job", - "project": null + "description": "The first test job" } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { "urn": "urn:li:dataFlow:(glue,test-job-2,PROD)", @@ -816,18 +672,14 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2", - "description": "The second test job", - "project": null + "description": "The second test job" } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", @@ -842,12 +694,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:Filter-Transform0_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -856,20 +705,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", @@ -884,12 +727,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform1_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -898,20 +738,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", @@ -926,12 +760,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform2_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -940,20 +771,14 @@ "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)" ], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", @@ -969,12 +794,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:Join-Transform3_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -983,20 +805,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", @@ -1011,32 +827,23 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform4_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", @@ -1051,32 +858,23 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform5_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", @@ -1094,22 +892,14 @@ "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", "transformation_ctx": "DataSink1" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", @@ -1126,12 +916,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:SplitFields-Transform0_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1140,20 +927,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", @@ -1168,12 +949,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:ApplyMapping-Transform1_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1182,20 +960,14 @@ "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)" ], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", @@ -1210,12 +982,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:FillMissingValues-Transform2_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1224,20 +993,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", @@ -1252,12 +1015,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:SelectFields-Transform3_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1268,20 +1028,14 @@ ], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", @@ -1299,18 +1053,11 @@ "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", "transformation_ctx": "DataSink0" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json index 38ce61fe9c..1fc3759f0d 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json @@ -1,45 +1,35 @@ [ { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:7d53111f2c71396ea6f6d26c84770665", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "value": "{\"customProperties\": {\"platform\": \"glue\", \"instance\": \"some_instance_name\", \"database\": \"flights-database\"}, \"name\": \"flights-database\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:7d53111f2c71396ea6f6d26c84770665", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "value": "{\"platform\": \"urn:li:dataPlatform:glue\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:7d53111f2c71396ea6f6d26c84770665", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "value": "{\"typeNames\": [\"Database\"]}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)", @@ -49,22 +39,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - } - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -89,11 +63,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -104,17 +73,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -124,9 +88,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=int].yr", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -134,17 +96,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].flightdate", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -152,17 +109,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].uniquecarrier", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -170,17 +122,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].airlineid", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -188,17 +135,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].carrier", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -206,17 +148,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].flightnum", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -224,17 +161,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].origin", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -242,17 +174,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].year", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -260,16 +187,10 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { @@ -278,78 +199,87 @@ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,some_instance_name)" } }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, { "com.linkedin.pegasus2avro.common.GlobalTags": { "tags": [ { - "tag": "urn:li:tag:baz:bob" + "tag": "urn:li:tag:foo:bar" }, { - "tag": "urn:li:tag:foo:bar" + "tag": "urn:li:tag:baz:bob" } ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + } }, { - "auditHeader": null, "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "container", "aspect": { "value": "{\"container\": \"urn:li:container:7d53111f2c71396ea6f6d26c84770665\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:9fb26491b2c92dde9e80791dbecca9ca", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "value": "{\"customProperties\": {\"platform\": \"glue\", \"instance\": \"some_instance_name\", \"database\": \"test-database\"}, \"name\": \"test-database\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:9fb26491b2c92dde9e80791dbecca9ca", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "value": "{\"platform\": \"urn:li:dataPlatform:glue\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "entityType": "container", "entityUrn": "urn:li:container:9fb26491b2c92dde9e80791dbecca9ca", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "value": "{\"typeNames\": [\"Database\"]}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_jsons_markers,PROD)", @@ -359,22 +289,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - } - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -398,11 +312,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -413,17 +322,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -433,9 +337,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -447,17 +349,12 @@ }, "nativeDataType": "array,location:array>>", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -465,17 +362,12 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -487,17 +379,12 @@ }, "nativeDataType": "array", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -509,16 +396,10 @@ }, "nativeDataType": "array", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"array\"}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { @@ -527,39 +408,57 @@ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,some_instance_name)" } }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, { "com.linkedin.pegasus2avro.common.GlobalTags": { "tags": [ { - "tag": "urn:li:tag:baz:bob" + "tag": "urn:li:tag:foo:bar" }, { - "tag": "urn:li:tag:foo:bar" + "tag": "urn:li:tag:baz:bob" } ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + } }, { - "auditHeader": null, "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_jsons_markers,PROD)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "container", "aspect": { "value": "{\"container\": \"urn:li:container:9fb26491b2c92dde9e80791dbecca9ca\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)", @@ -569,22 +468,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER", - "source": null - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - } - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -608,11 +491,6 @@ "SortColumns": "[]", "StoredAsSubDirectories": "False" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } }, @@ -623,17 +501,12 @@ "version": 0, "created": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, "lastModified": { "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null + "actor": "urn:li:corpuser:unknown" }, - "deleted": null, - "dataset": null, - "cluster": null, "hash": "", "platformSchema": { "com.linkedin.pegasus2avro.schema.MySqlDDL": { @@ -643,9 +516,7 @@ "fields": [ { "fieldPath": "[version=2.0].[type=int].yr", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -653,17 +524,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].quarter", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -671,17 +537,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].month", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -689,17 +550,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=int].dayofmonth", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -707,17 +563,12 @@ }, "nativeDataType": "int", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=string].year", - "jsonPath": null, "nullable": true, - "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} @@ -725,16 +576,10 @@ }, "nativeDataType": "string", "recursive": false, - "globalTags": null, - "glossaryTerms": null, "isPartOfKey": false, - "isPartitioningKey": null, "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null + ] } }, { @@ -743,39 +588,57 @@ "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,some_instance_name)" } }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, { "com.linkedin.pegasus2avro.common.GlobalTags": { "tags": [ { - "tag": "urn:li:tag:baz:bob" + "tag": "urn:li:tag:foo:bar" }, { - "tag": "urn:li:tag:foo:bar" + "tag": "urn:li:tag:baz:bob" } ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "value": "{\"typeNames\": [\"table\"]}", + "contentType": "application/json" + } }, { - "auditHeader": null, "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "container", "aspect": { "value": "{\"container\": \"urn:li:container:9fb26491b2c92dde9e80791dbecca9ca\"}", "contentType": "application/json" - }, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { "urn": "urn:li:dataFlow:(glue,test-job-1,PROD)", @@ -790,18 +653,14 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1", - "description": "The first test job", - "project": null + "description": "The first test job" } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { "urn": "urn:li:dataFlow:(glue,test-job-2,PROD)", @@ -816,18 +675,14 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2", - "description": "The second test job", - "project": null + "description": "The second test job" } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", @@ -842,12 +697,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:Filter-Transform0_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -856,20 +708,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", @@ -884,12 +730,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform1_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -898,20 +741,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", @@ -926,12 +763,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform2_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -940,20 +774,14 @@ "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.flights-database.avro,PROD)" ], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", @@ -969,12 +797,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:Join-Transform3_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -983,20 +808,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", @@ -1011,32 +830,23 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform4_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", @@ -1051,32 +861,23 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", "name": "test-job-1:ApplyMapping-Transform5_job1", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", @@ -1094,22 +895,14 @@ "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", "transformation_ctx": "DataSink1" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", @@ -1126,12 +919,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:SplitFields-Transform0_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1140,20 +930,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", @@ -1168,12 +952,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:ApplyMapping-Transform1_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1182,20 +963,14 @@ "urn:li:dataset:(urn:li:dataPlatform:glue,some_instance_name.test-database.test_parquet,PROD)" ], "outputDatasets": [], - "inputDatajobs": [], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + "inputDatajobs": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", @@ -1210,12 +985,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:FillMissingValues-Transform2_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1224,20 +996,14 @@ "outputDatasets": [], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", @@ -1252,12 +1018,9 @@ }, "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", "name": "test-job-2:SelectFields-Transform3_job2", - "description": null, "type": { "string": "GLUE" - }, - "flowUrn": null, - "status": null + } } }, { @@ -1268,20 +1031,14 @@ ], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)" - ], - "inputDatasetFields": null, - "outputDatasetFields": null, - "fineGrainedLineages": null + ] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } }, { - "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", @@ -1299,18 +1056,11 @@ "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", "transformation_ctx": "DataSink0" }, - "externalUrl": null, - "name": null, - "qualifiedName": null, - "description": null, - "uri": null, "tags": [] } } ] } - }, - "proposedDelta": null, - "systemMetadata": null + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index b680b1f66b..8bcc2b96c0 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -30,6 +30,8 @@ from tests.test_helpers.state_helpers import ( ) from tests.test_helpers.type_helpers import PytestConfig from tests.unit.test_glue_source_stubs import ( + databases_1, + databases_2, get_bucket_tagging, get_databases_response, get_dataflow_graph_response_1, @@ -277,13 +279,13 @@ def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): ) as mock_checkpoint: mock_checkpoint.return_value = mock_datahub_graph with patch( - "datahub.ingestion.source.aws.glue.GlueSource.get_all_tables", - ) as mock_get_all_tables: + "datahub.ingestion.source.aws.glue.GlueSource.get_all_tables_and_databases", + ) as mock_get_all_tables_and_databases: tables_on_first_call = tables_1 tables_on_second_call = tables_2 - mock_get_all_tables.side_effect = [ - tables_on_first_call, - tables_on_second_call, + mock_get_all_tables_and_databases.side_effect = [ + (databases_1, tables_on_first_call), + (databases_2, tables_on_second_call), ] pipeline_run1 = run_and_get_pipeline(pipeline_config_dict) diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index a1b10dd730..bca9bc90c8 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -34,6 +34,8 @@ get_databases_response = { }, ] } +databases_1 = {"flights-database": {"Name": "flights-database"}} +databases_2 = {"test-database": {"Name": "test-database"}} tables_1 = [ { "Name": "avro",