feat(ingest/presto-on-hive): Extracting all the table properties from Hive Metastore (#8348)

Co-authored-by: Pedro Silva <pedro@acryl.io>
Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
This commit is contained in:
Tamas Nemeth 2023-07-12 20:56:13 +02:00 committed by GitHub
parent a884cf3f90
commit 54c7aef1bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 279 additions and 80 deletions

View File

@ -57,7 +57,8 @@ framework_common = {
"click-spinner", "click-spinner",
"requests_file", "requests_file",
"jsonref", "jsonref",
"jsonschema", # jsonschema drops python 3.7 support in v4.18.0
"jsonschema<=4.17.3",
"ruamel.yaml", "ruamel.yaml",
} }

View File

@ -129,11 +129,6 @@ class PrestoOnHiveConfig(BasicSQLAlchemyConfig):
description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`", description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
) )
extra_properties: List[str] = Field(
default=[],
description="By default, the connector extracts a specific set of properties from the metastore tables with a sql query. Use this list of keys to provide additional properties that you would like to extract. You have to make sure the column name returned by the sql query is the same as the key you provide here.",
)
enable_properties_merge: bool = Field( enable_properties_merge: bool = Field(
default=False, default=False,
description="By default, the connector overwrites properties every time. Set this to True to enable merging of properties with what exists on the server.", description="By default, the connector overwrites properties every time. Set this to True to enable merging of properties with what exists on the server.",
@ -171,25 +166,23 @@ class PrestoOnHiveSource(SQLAlchemySource):
_TABLES_SQL_STATEMENT = """ _TABLES_SQL_STATEMENT = """
SELECT source.* FROM SELECT source.* FROM
(SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type, tp.PARAM_VALUE as description, (SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type,
FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, p.PKEY_NAME as col_name, p.INTEGER_IDX as col_sort_order, FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, p.PKEY_NAME as col_name, p.INTEGER_IDX as col_sort_order,
p.PKEY_COMMENT as col_description, p.PKEY_TYPE as col_type, 1 as is_partition_col, s.LOCATION as table_location p.PKEY_COMMENT as col_description, p.PKEY_TYPE as col_type, 1 as is_partition_col, s.LOCATION as table_location
FROM TBLS t FROM TBLS t
JOIN DBS d ON t.DB_ID = d.DB_ID JOIN DBS d ON t.DB_ID = d.DB_ID
JOIN SDS s ON t.SD_ID = s.SD_ID JOIN SDS s ON t.SD_ID = s.SD_ID
JOIN PARTITION_KEYS p ON t.TBL_ID = p.TBL_ID JOIN PARTITION_KEYS p ON t.TBL_ID = p.TBL_ID
LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE') WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix} {where_clause_suffix}
UNION UNION
SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type, tp.PARAM_VALUE as description, SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type,
FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, c.COLUMN_NAME as col_name, c.INTEGER_IDX as col_sort_order, FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, c.COLUMN_NAME as col_name, c.INTEGER_IDX as col_sort_order,
c.COMMENT as col_description, c.TYPE_NAME as col_type, 0 as is_partition_col, s.LOCATION as table_location c.COMMENT as col_description, c.TYPE_NAME as col_type, 0 as is_partition_col, s.LOCATION as table_location
FROM TBLS t FROM TBLS t
JOIN DBS d ON t.DB_ID = d.DB_ID JOIN DBS d ON t.DB_ID = d.DB_ID
JOIN SDS s ON t.SD_ID = s.SD_ID JOIN SDS s ON t.SD_ID = s.SD_ID
JOIN COLUMNS_V2 c ON s.CD_ID = c.CD_ID JOIN COLUMNS_V2 c ON s.CD_ID = c.CD_ID
LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE') WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix} {where_clause_suffix}
) source ) source
@ -198,25 +191,23 @@ class PrestoOnHiveSource(SQLAlchemySource):
_TABLES_POSTGRES_SQL_STATEMENT = """ _TABLES_POSTGRES_SQL_STATEMENT = """
SELECT source.* FROM SELECT source.* FROM
(SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type, tp."PARAM_VALUE" as description, (SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type,
to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, p."PKEY_NAME" as col_name, p."INTEGER_IDX" as col_sort_order, to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, p."PKEY_NAME" as col_name, p."INTEGER_IDX" as col_sort_order,
p."PKEY_COMMENT" as col_description, p."PKEY_TYPE" as col_type, 1 as is_partition_col, s."LOCATION" as table_location p."PKEY_COMMENT" as col_description, p."PKEY_TYPE" as col_type, 1 as is_partition_col, s."LOCATION" as table_location
FROM "TBLS" t FROM "TBLS" t
JOIN "DBS" d ON t."DB_ID" = d."DB_ID" JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
JOIN "SDS" s ON t."SD_ID" = s."SD_ID" JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
JOIN "PARTITION_KEYS" p ON t."TBL_ID" = p."TBL_ID" JOIN "PARTITION_KEYS" p ON t."TBL_ID" = p."TBL_ID"
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE') WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix} {where_clause_suffix}
UNION UNION
SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type, tp."PARAM_VALUE" as description, SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type,
to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, c."COLUMN_NAME" as col_name, to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, c."COLUMN_NAME" as col_name,
c."INTEGER_IDX" as col_sort_order, c."COMMENT" as col_description, c."TYPE_NAME" as col_type, 0 as is_partition_col, s."LOCATION" as table_location c."INTEGER_IDX" as col_sort_order, c."COMMENT" as col_description, c."TYPE_NAME" as col_type, 0 as is_partition_col, s."LOCATION" as table_location
FROM "TBLS" t FROM "TBLS" t
JOIN "DBS" d ON t."DB_ID" = d."DB_ID" JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
JOIN "SDS" s ON t."SD_ID" = s."SD_ID" JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
JOIN "COLUMNS_V2" c ON s."CD_ID" = c."CD_ID" JOIN "COLUMNS_V2" c ON s."CD_ID" = c."CD_ID"
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE') WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix} {where_clause_suffix}
) source ) source
@ -273,6 +264,26 @@ class PrestoOnHiveSource(SQLAlchemySource):
ORDER by tbl_id desc, col_sort_order asc; ORDER by tbl_id desc, col_sort_order asc;
""" """
# Query template used for non-PostgreSQL metastores: returns one row per
# TABLE_PARAMS entry as (schema_name, table_name, PARAM_KEY, PARAM_VALUE),
# joined to TBLS and DBS and ordered by table id descending.
# `{where_clause_suffix}` is substituted with str.format before execution.
# NOTE(review): the bare `WHERE 1` presumably exists so the suffix can start
# with `AND ...`; it relies on the dialect accepting an integer as a boolean
# predicate -- confirm for every supported non-PostgreSQL backend.
_HIVE_PROPERTIES_SQL_STATEMENT = """
SELECT d.NAME as schema_name, t.TBL_NAME as table_name, tp.PARAM_KEY, tp.PARAM_VALUE
FROM TABLE_PARAMS tp
JOIN TBLS t on t.TBL_ID = tp.TBL_ID
JOIN DBS d on d.DB_ID = t.DB_ID
WHERE 1
{where_clause_suffix}
ORDER BY tp.TBL_ID desc;
"""
# PostgreSQL flavour of the table-properties query: identifiers are
# double-quoted to keep their upper-case spelling, and the always-true
# predicate is written as `1 = 1`. Returns one row per TABLE_PARAMS entry as
# (schema_name, table_name, PARAM_KEY, PARAM_VALUE), ordered by table id
# descending. `{where_clause_suffix}` is substituted with str.format before
# execution.
_HIVE_PROPERTIES_POSTGRES_SQL_STATEMENT = """
SELECT d."NAME" as schema_name, t."TBL_NAME" as table_name, tp."PARAM_KEY", tp."PARAM_VALUE"
FROM "TABLE_PARAMS" tp
JOIN "TBLS" t on t."TBL_ID" = tp."TBL_ID"
JOIN "DBS" d on d."DB_ID" = t."DB_ID"
WHERE 1 = 1
{where_clause_suffix}
ORDER BY tp."TBL_ID" desc;
"""
_PRESTO_VIEW_PREFIX = "/* Presto View: " _PRESTO_VIEW_PREFIX = "/* Presto View: "
_PRESTO_VIEW_SUFFIX = " */" _PRESTO_VIEW_SUFFIX = " */"
@ -408,6 +419,31 @@ class PrestoOnHiveSource(SQLAlchemySource):
""" """
return JobId(self.config.ingestion_job_id) return JobId(self.config.ingestion_job_id)
def _get_table_properties(
    self, db_name: str, scheme: str, where_clause_suffix: str
) -> Dict[str, Dict[str, str]]:
    """Fetch all table parameters and group them by dataset name.

    Args:
        db_name: Catalog name; prepended to each dataset name when
            ``self.config.include_catalog_name_in_ids`` is truthy.
        scheme: Connection scheme; the PostgreSQL query template is used
            when it contains ``"postgresql"``, otherwise the default one.
        where_clause_suffix: Text substituted into the template's
            ``{where_clause_suffix}`` placeholder.

    Returns:
        A mapping of ``"<schema>.<table>"`` (optionally catalog-prefixed) to
        a ``{PARAM_KEY: PARAM_VALUE}`` dict. Rows whose key or value is
        falsy are left out, so a table with no usable parameters has no
        entry at all.
    """
    if "postgresql" in scheme:
        template = PrestoOnHiveSource._HIVE_PROPERTIES_POSTGRES_SQL_STATEMENT
    else:
        template = PrestoOnHiveSource._HIVE_PROPERTIES_SQL_STATEMENT
    query: str = template.format(where_clause_suffix=where_clause_suffix)

    props_by_dataset: Dict[str, Dict[str, str]] = {}
    for record in self._alchemy_client.execute_query(query):
        name = f"{record['schema_name']}.{record['table_name']}"
        if self.config.include_catalog_name_in_ids:
            name = f"{db_name}.{name}"

        # Skip rows that lack either half of the key/value pair.
        param_key = record["PARAM_KEY"]
        if not param_key:
            continue
        param_value = record["PARAM_VALUE"]
        if not param_value:
            continue

        if name not in props_by_dataset:
            props_by_dataset[name] = {}
        props_by_dataset[name][param_key] = param_value

    return props_by_dataset
def loop_tables( def loop_tables(
self, self,
inspector: Inspector, inspector: Inspector,
@ -434,10 +470,17 @@ class PrestoOnHiveSource(SQLAlchemySource):
) )
) )
db_name = self.get_db_name(inspector)
properties_cache = self._get_table_properties(
db_name=db_name,
scheme=sql_config.scheme,
where_clause_suffix=where_clause_suffix,
)
iter_res = self._alchemy_client.execute_query(statement) iter_res = self._alchemy_client.execute_query(statement)
for key, group in groupby(iter_res, self._get_table_key): for key, group in groupby(iter_res, self._get_table_key):
db_name = self.get_db_name(inspector)
schema_name = ( schema_name = (
f"{db_name}.{key.schema}" f"{db_name}.{key.schema}"
if self.config.include_catalog_name_in_ids if self.config.include_catalog_name_in_ids
@ -488,11 +531,10 @@ class PrestoOnHiveSource(SQLAlchemySource):
dataset_snapshot.aspects.append(schema_metadata) dataset_snapshot.aspects.append(schema_metadata)
# add table properties # add table properties
default_properties = ["create_date", "table_type", "table_location"] properties: Dict[str, str] = properties_cache.get(dataset_name, {})
properties: Dict[str, str] = {} properties["table_type"] = str(columns[-1]["table_type"] or "")
for prop in default_properties + self.config.extra_properties: properties["table_location"] = str(columns[-1]["table_location"] or "")
if prop in columns[-1]: properties["create_date"] = str(columns[-1]["create_date"] or "")
properties[prop] = str(columns[-1][prop]) or ""
par_columns: str = ", ".join( par_columns: str = ", ".join(
[c["col_name"] for c in columns if c["is_partition_col"]] [c["col_name"] for c in columns if c["is_partition_col"]]
@ -500,10 +542,7 @@ class PrestoOnHiveSource(SQLAlchemySource):
if par_columns != "": if par_columns != "":
properties["partitioned_columns"] = par_columns properties["partitioned_columns"] = par_columns
table_description = ( table_description = properties.get("comment")
columns[-1]["description"] if "description" in columns[-1] else ""
)
yield from self.add_hive_dataset_to_container( yield from self.add_hive_dataset_to_container(
dataset_urn=dataset_urn, inspector=inspector, schema=key.schema dataset_urn=dataset_urn, inspector=inspector, schema=key.schema
) )
@ -514,9 +553,11 @@ class PrestoOnHiveSource(SQLAlchemySource):
patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( patch_builder: DatasetPatchBuilder = DatasetPatchBuilder(
urn=dataset_snapshot.urn urn=dataset_snapshot.urn
) )
patch_builder.set_display_name(key.table).set_description( patch_builder.set_display_name(key.table)
description=table_description
) if table_description:
patch_builder.set_description(description=table_description)
for prop, value in properties.items(): for prop, value in properties.items():
patch_builder.add_custom_property(key=prop, value=value) patch_builder.add_custom_property(key=prop, value=value)
yield from [ yield from [

View File

@ -264,9 +264,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
}, },
"name": "map_test", "name": "map_test",
"tags": [] "tags": []
@ -458,9 +464,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
}, },
"name": "union_test", "name": "union_test",
"tags": [] "tags": []
@ -644,9 +656,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
}, },
"name": "nested_struct_test", "name": "nested_struct_test",
"tags": [] "tags": []
@ -814,9 +832,17 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
}, },
"name": "array_struct_test", "name": "array_struct_test",
"description": "This table has array of structs", "description": "This table has array of structs",
@ -979,9 +1005,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
}, },
"name": "struct_test", "name": "struct_test",
"tags": [] "tags": []
@ -1113,9 +1145,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
}, },
"name": "_test_table_underscore", "name": "_test_table_underscore",
"tags": [] "tags": []
@ -1260,9 +1298,10 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz" "partitioned_columns": "baz"
}, },
"name": "pokes", "name": "pokes",

View File

@ -264,9 +264,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
}, },
"name": "map_test", "name": "map_test",
"tags": [] "tags": []
@ -458,9 +464,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
}, },
"name": "union_test", "name": "union_test",
"tags": [] "tags": []
@ -644,9 +656,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
}, },
"name": "nested_struct_test", "name": "nested_struct_test",
"tags": [] "tags": []
@ -814,9 +832,17 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
}, },
"name": "array_struct_test", "name": "array_struct_test",
"description": "This table has array of structs", "description": "This table has array of structs",
@ -979,9 +1005,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
}, },
"name": "struct_test", "name": "struct_test",
"tags": [] "tags": []
@ -1113,9 +1145,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
}, },
"name": "_test_table_underscore", "name": "_test_table_underscore",
"tags": [] "tags": []
@ -1260,9 +1298,10 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz" "partitioned_columns": "baz"
}, },
"name": "pokes", "name": "pokes",

View File

@ -264,9 +264,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
}, },
"name": "map_test", "name": "map_test",
"tags": [] "tags": []
@ -458,9 +464,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
}, },
"name": "union_test", "name": "union_test",
"tags": [] "tags": []
@ -644,9 +656,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
}, },
"name": "nested_struct_test", "name": "nested_struct_test",
"tags": [] "tags": []
@ -814,9 +832,17 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
}, },
"name": "array_struct_test", "name": "array_struct_test",
"description": "This table has array of structs", "description": "This table has array of structs",
@ -979,9 +1005,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
}, },
"name": "struct_test", "name": "struct_test",
"tags": [] "tags": []
@ -1113,9 +1145,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
}, },
"name": "_test_table_underscore", "name": "_test_table_underscore",
"tags": [] "tags": []
@ -1260,9 +1298,10 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz" "partitioned_columns": "baz"
}, },
"name": "pokes", "name": "pokes",

View File

@ -264,9 +264,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
}, },
"name": "map_test", "name": "map_test",
"tags": [] "tags": []
@ -458,9 +464,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
}, },
"name": "union_test", "name": "union_test",
"tags": [] "tags": []
@ -644,9 +656,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
}, },
"name": "nested_struct_test", "name": "nested_struct_test",
"tags": [] "tags": []
@ -814,9 +832,17 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
}, },
"name": "array_struct_test", "name": "array_struct_test",
"description": "This table has array of structs", "description": "This table has array of structs",
@ -979,9 +1005,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
}, },
"name": "struct_test", "name": "struct_test",
"tags": [] "tags": []
@ -1113,9 +1145,15 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore" "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
}, },
"name": "_test_table_underscore", "name": "_test_table_underscore",
"tags": [] "tags": []
@ -1260,9 +1298,10 @@
{ {
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": { "customProperties": {
"create_date": "2023-07-05", "transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE", "table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz" "partitioned_columns": "baz"
}, },
"name": "pokes", "name": "pokes",

View File

@ -124,7 +124,7 @@ def test_presto_on_hive_ingest(
golden_path=test_resources_dir golden_path=test_resources_dir
/ f"presto_on_hive_mces_golden{test_suffix}.json", / f"presto_on_hive_mces_golden{test_suffix}.json",
ignore_paths=[ ignore_paths=[
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]", r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastDdlTime'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]", r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]", r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['create_date'\]", r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['create_date'\]",

View File

@ -127,6 +127,7 @@ def test_data_lake_s3_ingest(
def test_data_lake_local_ingest( def test_data_lake_local_ingest(
pytestconfig, touch_local_files, source_file, tmp_path, mock_time pytestconfig, touch_local_files, source_file, tmp_path, mock_time
): ):
os.environ["SPARK_VERSION"] = "3.0.3"
test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/" test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"
f = open(os.path.join(SOURCE_FILES_PATH, source_file)) f = open(os.path.join(SOURCE_FILES_PATH, source_file))
source = json.load(f) source = json.load(f)