feat(ingest/presto-on-hive): Extracting all the table properties from Hive Metastore (#8348)

Co-authored-by: Pedro Silva <pedro@acryl.io>
Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
This commit is contained in:
Tamas Nemeth 2023-07-12 20:56:13 +02:00 committed by GitHub
parent a884cf3f90
commit 54c7aef1bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 279 additions and 80 deletions

View File

@ -57,7 +57,8 @@ framework_common = {
"click-spinner",
"requests_file",
"jsonref",
"jsonschema",
# jsonschema drops python 3.7 support in v4.18.0
"jsonschema<=4.17.3",
"ruamel.yaml",
}

View File

@ -129,11 +129,6 @@ class PrestoOnHiveConfig(BasicSQLAlchemyConfig):
description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
)
extra_properties: List[str] = Field(
default=[],
description="By default, the connector extracts a specific set of properties from the metastore tables with a sql query. Use this list of keys to provide additional properties that you would like to extract. You have to make sure the column name returned by the sql query is the same as the key you provide here.",
)
enable_properties_merge: bool = Field(
default=False,
description="By default, the connector overwrites properties every time. Set this to True to enable merging of properties with what exists on the server.",
@ -171,25 +166,23 @@ class PrestoOnHiveSource(SQLAlchemySource):
_TABLES_SQL_STATEMENT = """
SELECT source.* FROM
(SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type, tp.PARAM_VALUE as description,
(SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type,
FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, p.PKEY_NAME as col_name, p.INTEGER_IDX as col_sort_order,
p.PKEY_COMMENT as col_description, p.PKEY_TYPE as col_type, 1 as is_partition_col, s.LOCATION as table_location
FROM TBLS t
JOIN DBS d ON t.DB_ID = d.DB_ID
JOIN SDS s ON t.SD_ID = s.SD_ID
JOIN PARTITION_KEYS p ON t.TBL_ID = p.TBL_ID
LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix}
UNION
SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type, tp.PARAM_VALUE as description,
SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type,
FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, c.COLUMN_NAME as col_name, c.INTEGER_IDX as col_sort_order,
c.COMMENT as col_description, c.TYPE_NAME as col_type, 0 as is_partition_col, s.LOCATION as table_location
FROM TBLS t
JOIN DBS d ON t.DB_ID = d.DB_ID
JOIN SDS s ON t.SD_ID = s.SD_ID
JOIN COLUMNS_V2 c ON s.CD_ID = c.CD_ID
LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix}
) source
@ -198,25 +191,23 @@ class PrestoOnHiveSource(SQLAlchemySource):
_TABLES_POSTGRES_SQL_STATEMENT = """
SELECT source.* FROM
(SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type, tp."PARAM_VALUE" as description,
(SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type,
to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, p."PKEY_NAME" as col_name, p."INTEGER_IDX" as col_sort_order,
p."PKEY_COMMENT" as col_description, p."PKEY_TYPE" as col_type, 1 as is_partition_col, s."LOCATION" as table_location
FROM "TBLS" t
JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
JOIN "PARTITION_KEYS" p ON t."TBL_ID" = p."TBL_ID"
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix}
UNION
SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type, tp."PARAM_VALUE" as description,
SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type,
to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, c."COLUMN_NAME" as col_name,
c."INTEGER_IDX" as col_sort_order, c."COMMENT" as col_description, c."TYPE_NAME" as col_type, 0 as is_partition_col, s."LOCATION" as table_location
FROM "TBLS" t
JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
JOIN "COLUMNS_V2" c ON s."CD_ID" = c."CD_ID"
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
{where_clause_suffix}
) source
@ -273,6 +264,26 @@ class PrestoOnHiveSource(SQLAlchemySource):
ORDER by tbl_id desc, col_sort_order asc;
"""
# Query template that lists every TABLE_PARAMS row as
# (schema_name, table_name, PARAM_KEY, PARAM_VALUE), joined to TBLS and DBS.
# _get_table_properties() uses this variant whenever the connection scheme
# does not contain "postgresql".
# `{where_clause_suffix}` is filled in via str.format(); the always-true
# `WHERE 1` gives that suffix a predicate to attach to (presumably the suffix
# starts with "AND ..." or is empty -- confirm against the caller).
# NOTE(review): a bare `1` as a boolean predicate is accepted by MySQL;
# confirm for any other non-Postgres backend this source is pointed at.
_HIVE_PROPERTIES_SQL_STATEMENT = """
SELECT d.NAME as schema_name, t.TBL_NAME as table_name, tp.PARAM_KEY, tp.PARAM_VALUE
FROM TABLE_PARAMS tp
JOIN TBLS t on t.TBL_ID = tp.TBL_ID
JOIN DBS d on d.DB_ID = t.DB_ID
WHERE 1
{where_clause_suffix}
ORDER BY tp.TBL_ID desc;
"""
# PostgreSQL flavour of the table-properties query: same projection
# (schema_name, table_name, PARAM_KEY, PARAM_VALUE) and joins as the default
# statement, but with every metastore identifier double-quoted and the
# always-true predicate spelled `1 = 1`.
# _get_table_properties() selects this variant when "postgresql" appears in
# the connection scheme, and fills `{where_clause_suffix}` via str.format().
_HIVE_PROPERTIES_POSTGRES_SQL_STATEMENT = """
SELECT d."NAME" as schema_name, t."TBL_NAME" as table_name, tp."PARAM_KEY", tp."PARAM_VALUE"
FROM "TABLE_PARAMS" tp
JOIN "TBLS" t on t."TBL_ID" = tp."TBL_ID"
JOIN "DBS" d on d."DB_ID" = t."DB_ID"
WHERE 1 = 1
{where_clause_suffix}
ORDER BY tp."TBL_ID" desc;
"""
_PRESTO_VIEW_PREFIX = "/* Presto View: "
_PRESTO_VIEW_SUFFIX = " */"
@ -408,6 +419,31 @@ class PrestoOnHiveSource(SQLAlchemySource):
"""
return JobId(self.config.ingestion_job_id)
def _get_table_properties(
    self, db_name: str, scheme: str, where_clause_suffix: str
) -> Dict[str, Dict[str, str]]:
    """Load all table parameters from the metastore, grouped per dataset.

    Args:
        db_name: Catalog name; prepended to the dataset name when
            ``config.include_catalog_name_in_ids`` is set.
        scheme: Connection scheme; a value containing ``"postgresql"``
            selects the PostgreSQL query template, anything else the
            default one.
        where_clause_suffix: Text substituted into the template's
            ``{where_clause_suffix}`` placeholder.

    Returns:
        Mapping of ``schema.table`` (or ``db.schema.table``) to a dict of
        ``PARAM_KEY -> PARAM_VALUE``. Rows whose key or value is falsy are
        left out, so a table with no usable parameters has no entry.
    """
    if "postgresql" in scheme:
        template = PrestoOnHiveSource._HIVE_PROPERTIES_POSTGRES_SQL_STATEMENT
    else:
        template = PrestoOnHiveSource._HIVE_PROPERTIES_SQL_STATEMENT
    query: str = template.format(where_clause_suffix=where_clause_suffix)

    props_by_dataset: Dict[str, Dict[str, str]] = {}
    for record in self._alchemy_client.execute_query(query):
        qualified_name = f"{record['schema_name']}.{record['table_name']}"
        if self.config.include_catalog_name_in_ids:
            qualified_name = f"{db_name}.{qualified_name}"

        # Skip incomplete rows: both the key and the value must be non-empty.
        if not record["PARAM_KEY"]:
            continue
        if not record["PARAM_VALUE"]:
            continue

        dataset_props = props_by_dataset.setdefault(qualified_name, {})
        dataset_props[record["PARAM_KEY"]] = record["PARAM_VALUE"]

    return props_by_dataset
def loop_tables(
self,
inspector: Inspector,
@ -434,10 +470,17 @@ class PrestoOnHiveSource(SQLAlchemySource):
)
)
db_name = self.get_db_name(inspector)
properties_cache = self._get_table_properties(
db_name=db_name,
scheme=sql_config.scheme,
where_clause_suffix=where_clause_suffix,
)
iter_res = self._alchemy_client.execute_query(statement)
for key, group in groupby(iter_res, self._get_table_key):
db_name = self.get_db_name(inspector)
schema_name = (
f"{db_name}.{key.schema}"
if self.config.include_catalog_name_in_ids
@ -488,11 +531,10 @@ class PrestoOnHiveSource(SQLAlchemySource):
dataset_snapshot.aspects.append(schema_metadata)
# add table properties
default_properties = ["create_date", "table_type", "table_location"]
properties: Dict[str, str] = {}
for prop in default_properties + self.config.extra_properties:
if prop in columns[-1]:
properties[prop] = str(columns[-1][prop]) or ""
properties: Dict[str, str] = properties_cache.get(dataset_name, {})
properties["table_type"] = str(columns[-1]["table_type"] or "")
properties["table_location"] = str(columns[-1]["table_location"] or "")
properties["create_date"] = str(columns[-1]["create_date"] or "")
par_columns: str = ", ".join(
[c["col_name"] for c in columns if c["is_partition_col"]]
@ -500,10 +542,7 @@ class PrestoOnHiveSource(SQLAlchemySource):
if par_columns != "":
properties["partitioned_columns"] = par_columns
table_description = (
columns[-1]["description"] if "description" in columns[-1] else ""
)
table_description = properties.get("comment")
yield from self.add_hive_dataset_to_container(
dataset_urn=dataset_urn, inspector=inspector, schema=key.schema
)
@ -514,9 +553,11 @@ class PrestoOnHiveSource(SQLAlchemySource):
patch_builder: DatasetPatchBuilder = DatasetPatchBuilder(
urn=dataset_snapshot.urn
)
patch_builder.set_display_name(key.table).set_description(
description=table_description
)
patch_builder.set_display_name(key.table)
if table_description:
patch_builder.set_description(description=table_description)
for prop, value in properties.items():
patch_builder.add_custom_property(key=prop, value=value)
yield from [

View File

@ -264,9 +264,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
},
"name": "map_test",
"tags": []
@ -458,9 +464,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
},
"name": "union_test",
"tags": []
@ -644,9 +656,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
},
"name": "nested_struct_test",
"tags": []
@ -814,9 +832,17 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
},
"name": "array_struct_test",
"description": "This table has array of structs",
@ -979,9 +1005,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
},
"name": "struct_test",
"tags": []
@ -1113,9 +1145,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
},
"name": "_test_table_underscore",
"tags": []
@ -1260,9 +1298,10 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz"
},
"name": "pokes",

View File

@ -264,9 +264,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
},
"name": "map_test",
"tags": []
@ -458,9 +464,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
},
"name": "union_test",
"tags": []
@ -644,9 +656,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
},
"name": "nested_struct_test",
"tags": []
@ -814,9 +832,17 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
},
"name": "array_struct_test",
"description": "This table has array of structs",
@ -979,9 +1005,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
},
"name": "struct_test",
"tags": []
@ -1113,9 +1145,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
},
"name": "_test_table_underscore",
"tags": []
@ -1260,9 +1298,10 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz"
},
"name": "pokes",

View File

@ -264,9 +264,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
},
"name": "map_test",
"tags": []
@ -458,9 +464,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
},
"name": "union_test",
"tags": []
@ -644,9 +656,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
},
"name": "nested_struct_test",
"tags": []
@ -814,9 +832,17 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
},
"name": "array_struct_test",
"description": "This table has array of structs",
@ -979,9 +1005,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
},
"name": "struct_test",
"tags": []
@ -1113,9 +1145,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
},
"name": "_test_table_underscore",
"tags": []
@ -1260,9 +1298,10 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz"
},
"name": "pokes",

View File

@ -264,9 +264,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
"create_date": "2023-07-03"
},
"name": "map_test",
"tags": []
@ -458,9 +464,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
"create_date": "2023-07-03"
},
"name": "union_test",
"tags": []
@ -644,9 +656,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"transient_lastDdlTime": "1688395014",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
"create_date": "2023-07-03"
},
"name": "nested_struct_test",
"tags": []
@ -814,9 +832,17 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"comment": "This table has array of structs",
"numFiles": "1",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395011",
"rawDataSize": "32",
"numRows": "1",
"totalSize": "33",
"another.comment": "This table has no partitions",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
"create_date": "2023-07-03"
},
"name": "array_struct_test",
"description": "This table has array of structs",
@ -979,9 +1005,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
"create_date": "2023-07-03"
},
"name": "struct_test",
"tags": []
@ -1113,9 +1145,15 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"numFiles": "0",
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
"transient_lastDdlTime": "1688395008",
"rawDataSize": "0",
"numRows": "0",
"totalSize": "0",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
"create_date": "2023-07-03"
},
"name": "_test_table_underscore",
"tags": []
@ -1260,9 +1298,10 @@
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"create_date": "2023-07-05",
"transient_lastDdlTime": "1688395005",
"table_type": "MANAGED_TABLE",
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
"create_date": "2023-07-03",
"partitioned_columns": "baz"
},
"name": "pokes",

View File

@ -124,7 +124,7 @@ def test_presto_on_hive_ingest(
golden_path=test_resources_dir
/ f"presto_on_hive_mces_golden{test_suffix}.json",
ignore_paths=[
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastDdlTime'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['create_date'\]",

View File

@ -127,6 +127,7 @@ def test_data_lake_s3_ingest(
def test_data_lake_local_ingest(
pytestconfig, touch_local_files, source_file, tmp_path, mock_time
):
os.environ["SPARK_VERSION"] = "3.0.3"
test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"
f = open(os.path.join(SOURCE_FILES_PATH, source_file))
source = json.load(f)