From a711baa1314b622b5aaed807f5206046567f869c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Tue, 2 May 2023 18:37:51 +0530 Subject: [PATCH] fix(ingest/hive): fix containers generation for hive (#7926) --- .../src/datahub/ingestion/source/sql/hive.py | 12 +- .../hive/hive_mces_all_db_golden.json | 1435 +++++++++++++++++ .../integration/hive/hive_mces_golden.json | 209 +-- .../tests/integration/hive/hive_setup.sql | 2 +- .../tests/integration/hive/test_hive.py | 48 +- 5 files changed, 1568 insertions(+), 138 deletions(-) create mode 100644 metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 73e58b4fac..63b21bc82e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -19,11 +19,11 @@ from datahub.ingestion.api.decorators import ( support_status, ) from datahub.ingestion.extractor import schema_util -from datahub.ingestion.source.sql.sql_common import ( - SQLAlchemySource, - register_custom_type, +from datahub.ingestion.source.sql.sql_common import register_custom_type +from datahub.ingestion.source.sql.two_tier_sql_source import ( + TwoTierSQLAlchemyConfig, + TwoTierSQLAlchemySource, ) -from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig from datahub.metadata.com.linkedin.pegasus2avro.schema import ( DateTypeClass, NullTypeClass, @@ -90,7 +90,7 @@ except Exception as e: logger.warning(f"Failed to patch method due to {e}") -class HiveConfig(BasicSQLAlchemyConfig): +class HiveConfig(TwoTierSQLAlchemyConfig): # defaults scheme = Field(default="hive", hidden_from_docs=True) @@ -113,7 +113,7 @@ class HiveConfig(BasicSQLAlchemyConfig): @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") -class HiveSource(SQLAlchemySource): +class HiveSource(TwoTierSQLAlchemySource): """ This plugin extracts the following: diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json new file mode 100644 index 0000000000..1dcc214c48 --- /dev/null +++ b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json @@ -0,0 +1,1435 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:ded36d15fcfbbb939830549697122661", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "hive", + "instance": "PROD", + "database": "db1" + }, + "name": "db1" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ded36d15fcfbbb939830549697122661", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ded36d15fcfbbb939830549697122661", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:hive" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ded36d15fcfbbb939830549697122661", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1._test_table_underscore,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1._test_table_underscore,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: numFiles": "0", + "Table Parameters: numRows": "0", + "Table Parameters: rawDataSize": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1683022998", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1" + }, + "name": "_test_table_underscore", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1._test_table_underscore", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "bar", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1._test_table_underscore,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: numFiles": "1", + "Table Parameters: numRows": "1", + "Table Parameters: rawDataSize": "32", + "Table Parameters: totalSize": "33", + "Table Parameters: transient_lastDdlTime": "1683023000", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1" + }, + "name": "array_struct_test", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.array_struct_test", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:21 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: numFiles": "0", + "Table Parameters: numRows": "0", + "Table Parameters: rawDataSize": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1683023001", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1" + }, + "name": "map_test", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.map_test", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "keyvalue", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=string].recordid", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": "string", + "valueType": "string" + } + } + }, + "nativeDataType": "map", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"map\", \"key_type\": {\"type\": \"int\", \"native_data_type\": \"int\", \"_nullable\": true}, \"key_native_data_type\": \"int\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.map_test,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.nested_struct_test,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.nested_struct_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:21 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: numFiles": "0", + "Table Parameters: numRows": "0", + "Table Parameters: rawDataSize": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1683023001", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1" + }, + "name": "nested_struct_test", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.nested_struct_test", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "struct>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "struct", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=string].name", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "varchar(50)", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"varchar(50)\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=int].id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "tinyint", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"tinyint\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.nested_struct_test,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.pokes,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.pokes,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:16 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: numFiles": "1", + "Table Parameters: numRows": "0", + "Table Parameters: rawDataSize": "0", + "Table Parameters: totalSize": "5812", + "Table Parameters: transient_lastDdlTime": "1683022997", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1" + }, + "name": "pokes", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.pokes", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "bar", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.pokes,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: numFiles": "0", + "Table Parameters: numRows": "0", + "Table Parameters: rawDataSize": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1683022998", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1" + }, + "name": "struct_test", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.struct_test", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "struct>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.union_test,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.union_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:21 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: numFiles": "0", + "Table Parameters: numRows": "0", + "Table Parameters: rawDataSize": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1683023001", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1" + }, + "name": "union_test", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.union_test", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=union].foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.UnionType": {} + } + }, + "nativeDataType": "union", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=int].foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.UnionType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=double].foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.UnionType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=array].[type=string].foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.UnionType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct0].foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.UnionType": {} + } + }, + "nativeDataType": "struct0", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct0].foo.[type=int].a", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct0].foo.[type=string].b", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct1].foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.UnionType": {} + } + }, + "nativeDataType": "struct1", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct1].foo.[type=int].c", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct1].foo.[type=double].d", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.union_test,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8cc876554899e33efe67c389aaf29c4b", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "hive", + "instance": "PROD", + "database": "db2" + }, + "name": "db2" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8cc876554899e33efe67c389aaf29c4b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8cc876554899e33efe67c389aaf29c4b", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:hive" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8cc876554899e33efe67c389aaf29c4b", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db2.pokes,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8cc876554899e33efe67c389aaf29c4b" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db2.pokes,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db2", + "Owner:": "root", + "CreateTime:": "Tue May 02 10:23:17 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db2.db/pokes", + "Table Type:": "MANAGED_TABLE", + "Table Parameters: numFiles": "1", + "Table Parameters: numRows": "0", + "Table Parameters: rawDataSize": "0", + "Table Parameters: totalSize": "5812", + "Table Parameters: transient_lastDdlTime": "1683022998", + "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "Storage Desc Params: serialization.format": "1", + "Table:": "db2.pokes", + "Constraint Name:": "pk_1173723383_1683022998392_0", + "Column Names:": "foo" + }, + "name": "pokes", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db2.pokes", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "foo", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "bar", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db2.pokes,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:80a9564e8e33aacbf9a023fa43b8ee03", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "hive", + "instance": "PROD", + "database": "default" + }, + "name": "default" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:80a9564e8e33aacbf9a023fa43b8ee03", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:80a9564e8e33aacbf9a023fa43b8ee03", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:hive" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:80a9564e8e33aacbf9a023fa43b8ee03", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json index 4251c0df35..5a64f8c9c7 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json @@ -5,8 +5,14 @@ "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { - "value": "{\"customProperties\": {\"platform\": \"hive\", \"instance\": \"PROD\", \"database\": \"db1\"}, \"name\": \"db1\"}", - "contentType": "application/json" + "json": { + "customProperties": { + "platform": "hive", + "instance": "PROD", + "database": "db1" + }, + "name": "db1" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -19,8 +25,9 @@ "changeType": "UPSERT", "aspectName": "status", "aspect": { - "value": "{\"removed\": false}", - "contentType": "application/json" + "json": { + "removed": false + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -33,8 +40,9 @@ "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { - "value": "{\"platform\": \"urn:li:dataPlatform:hive\"}", - "contentType": "application/json" + "json": { + "platform": "urn:li:dataPlatform:hive" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -47,78 +55,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Database\"]}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "hive-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "value": "{\"customProperties\": {\"platform\": \"hive\", \"instance\": \"PROD\", \"database\": \"db1\", \"schema\": \"db1\"}, \"name\": \"db1\"}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "hive-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "value": "{\"removed\": false}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "hive-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "value": "{\"platform\": \"urn:li:dataPlatform:hive\"}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "hive-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "value": "{\"typeNames\": [\"Schema\"]}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "hive-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "value": "{\"container\": \"urn:li:container:ded36d15fcfbbb939830549697122661\"}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Database" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -131,8 +72,9 @@ "changeType": "UPSERT", "aspectName": "container", "aspect": { - "value": "{\"container\": \"urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37\"}", - "contentType": "application/json" + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -154,7 +96,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Sat Oct 08 01:09:04 UTC 2022", + "CreateTime:": "Fri Apr 28 12:04:49 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -164,7 +106,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1665191344", + "Table Parameters: transient_lastDdlTime": "1682683489", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -239,8 +181,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Table\"]}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Table" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -253,8 +198,9 @@ "changeType": "UPSERT", "aspectName": "container", "aspect": { - "value": "{\"container\": \"urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37\"}", - "contentType": "application/json" + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -276,7 +222,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Sat Oct 08 01:09:05 UTC 2022", + "CreateTime:": "Fri Apr 28 12:04:49 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", @@ -286,7 +232,7 @@ "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": "32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1665191650", + "Table Parameters: transient_lastDdlTime": "1682683491", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -396,8 +342,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Table\"]}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Table" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -410,8 +359,9 @@ "changeType": "UPSERT", "aspectName": "container", "aspect": { - "value": "{\"container\": \"urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37\"}", - "contentType": "application/json" + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -433,7 +383,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Sat Oct 08 01:14:11 UTC 2022", + "CreateTime:": "Fri Apr 28 12:04:51 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -443,7 +393,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1665191651", + "Table Parameters: transient_lastDdlTime": "1682683491", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -522,8 +472,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Table\"]}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Table" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -536,8 +489,9 @@ "changeType": "UPSERT", "aspectName": "container", "aspect": { - "value": "{\"container\": \"urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37\"}", - "contentType": "application/json" + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -559,7 +513,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Sat Oct 08 01:14:11 UTC 2022", + "CreateTime:": "Fri Apr 28 12:04:51 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -569,7 +523,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1665191651", + "Table Parameters: transient_lastDdlTime": "1682683491", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -697,8 +651,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Table\"]}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Table" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -711,8 +668,9 @@ "changeType": "UPSERT", "aspectName": "container", "aspect": { - "value": "{\"container\": \"urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37\"}", - "contentType": "application/json" + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -734,7 +692,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Sat Oct 08 01:08:44 UTC 2022", + "CreateTime:": "Fri Apr 28 12:04:47 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", @@ -743,7 +701,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1665191333", + "Table Parameters: transient_lastDdlTime": "1682683488", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -818,8 +776,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Table\"]}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Table" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -832,8 +793,9 @@ "changeType": "UPSERT", "aspectName": "container", "aspect": { - "value": "{\"container\": \"urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37\"}", - "contentType": "application/json" + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -855,7 +817,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Sat Oct 08 01:09:04 UTC 2022", + "CreateTime:": "Fri Apr 28 12:04:49 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -865,7 +827,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1665191344", + "Table Parameters: transient_lastDdlTime": "1682683489", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -971,8 +933,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Table\"]}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Table" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -985,8 +950,9 @@ "changeType": "UPSERT", "aspectName": "container", "aspect": { - "value": "{\"container\": \"urn:li:container:d4ee8f0f53fee8e83d4188d0497bfe37\"}", - "contentType": "application/json" + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } }, "systemMetadata": { "lastObserved": 1586847600000, @@ -1008,7 +974,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Sat Oct 08 01:14:11 UTC 2022", + "CreateTime:": "Fri Apr 28 12:04:51 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -1018,7 +984,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1665191651", + "Table Parameters: transient_lastDdlTime": "1682683491", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1193,8 +1159,11 @@ "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { - "value": "{\"typeNames\": [\"Table\"]}", - "contentType": "application/json" + "json": { + "typeNames": [ + "Table" + ] + } }, "systemMetadata": { "lastObserved": 1586847600000, diff --git a/metadata-ingestion/tests/integration/hive/hive_setup.sql b/metadata-ingestion/tests/integration/hive/hive_setup.sql index b11d68f240..8fb8498894 100644 --- a/metadata-ingestion/tests/integration/hive/hive_setup.sql +++ b/metadata-ingestion/tests/integration/hive/hive_setup.sql @@ -4,7 +4,7 @@ CREATE DATABASE IF NOT EXISTS db2; CREATE TABLE IF NOT EXISTS db1.pokes (foo INT, bar STRING); LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db1.pokes; -CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING, primary key(foo) DISABLE NOVALIDATE NORELY); +CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING, CONSTRAINT pk_1173723383_1683022998392_0 primary key(foo) DISABLE NOVALIDATE NORELY); LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db2.pokes; -- Setup a table with a special character. diff --git a/metadata-ingestion/tests/integration/hive/test_hive.py b/metadata-ingestion/tests/integration/hive/test_hive.py index 8d97c1f09a..ce166c3b33 100644 --- a/metadata-ingestion/tests/integration/hive/test_hive.py +++ b/metadata-ingestion/tests/integration/hive/test_hive.py @@ -5,8 +5,6 @@ import pytest from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.sink.file import FileSinkConfig -from datahub.ingestion.source.sql.hive import HiveConfig from tests.test_helpers import mce_helpers from tests.test_helpers.docker_helpers import wait_for_port @@ -37,18 +35,20 @@ def loaded_hive(hive_runner): subprocess.run(command, shell=True, check=True) -def base_pipeline_config(events_file): +def base_pipeline_config(events_file, db=None): return { "run_id": "hive-test", "source": { "type": data_platform, - "config": HiveConfig( - scheme="hive", database="db1", host_port="localhost:10000" - ).dict(), + "config": { + "scheme": "hive", + "database": db, + "host_port": "localhost:10000", + }, }, "sink": { "type": "file", - "config": FileSinkConfig(filename=str(events_file)).dict(), + "config": {"filename": str(events_file)}, }, } @@ -61,6 +61,34 @@ def test_hive_ingest( mce_out_file = "test_hive_ingest.json" events_file = tmp_path / mce_out_file + # Run the metadata ingestion pipeline. + pipeline = Pipeline.create(base_pipeline_config(events_file, "db1")) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status(raise_warnings=True) + + # Verify the output. + mce_helpers.check_golden_file( + pytestconfig, + output_path=events_file, + golden_path=test_resources_dir / "hive_mces_golden.json", + ignore_paths=[ + r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]", + r"root\[6\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]\[\d+\]\['nativeDataType'\]", + ], + ) + + # Limitation - native data types for union does not show up as expected + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration_batch_1 +def test_hive_ingest_all_db( + loaded_hive, pytestconfig, test_resources_dir, tmp_path, mock_time +): + mce_out_file = "test_hive_ingest.json" + events_file = tmp_path / mce_out_file + # Run the metadata ingestion pipeline. pipeline = Pipeline.create(base_pipeline_config(events_file)) pipeline.run() @@ -71,10 +99,8 @@ def test_hive_ingest( mce_helpers.check_golden_file( pytestconfig, output_path=events_file, - golden_path=test_resources_dir / "hive_mces_golden.json", + golden_path=test_resources_dir / "hive_mces_all_db_golden.json", ignore_paths=[ - # example: root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:'] - # example: root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime'] r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]", r"root\[6\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]\[\d+\]\['nativeDataType'\]", ], @@ -92,7 +118,7 @@ def test_hive_instance_check(loaded_hive, test_resources_dir, tmp_path, pytestco mce_out_file = "test_hive_instance.json" events_file = tmp_path / mce_out_file - pipeline_config = base_pipeline_config(events_file) + pipeline_config = base_pipeline_config(events_file, "db1") pipeline_config["source"]["config"]["platform_instance"] = instance pipeline = Pipeline.create(pipeline_config)