mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-03 12:16:10 +00:00
feat(ingest/presto-on-hive): Extracting all the table properties from Hive Metastore (#8348)
Co-authored-by: Pedro Silva <pedro@acryl.io> Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
This commit is contained in:
parent
a884cf3f90
commit
54c7aef1bc
@ -57,7 +57,8 @@ framework_common = {
|
||||
"click-spinner",
|
||||
"requests_file",
|
||||
"jsonref",
|
||||
"jsonschema",
|
||||
# jsonschema drops python 3.7 support in v4.18.0
|
||||
"jsonschema<=4.17.3",
|
||||
"ruamel.yaml",
|
||||
}
|
||||
|
||||
|
||||
@ -129,11 +129,6 @@ class PrestoOnHiveConfig(BasicSQLAlchemyConfig):
|
||||
description="Add the Presto catalog name (e.g. hive) to the generated dataset urns. `urn:li:dataset:(urn:li:dataPlatform:hive,hive.user.logging_events,PROD)` versus `urn:li:dataset:(urn:li:dataPlatform:hive,user.logging_events,PROD)`",
|
||||
)
|
||||
|
||||
extra_properties: List[str] = Field(
|
||||
default=[],
|
||||
description="By default, the connector extracts a specific set of properties from the metastore tables with a sql query. Use this list of keys to provide additional properties that you would like to extract. You have to make sure the column name returned by the sql query is the same as the key you provide here.",
|
||||
)
|
||||
|
||||
enable_properties_merge: bool = Field(
|
||||
default=False,
|
||||
description="By default, the connector overwrites properties every time. Set this to True to enable merging of properties with what exists on the server.",
|
||||
@ -171,25 +166,23 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
|
||||
_TABLES_SQL_STATEMENT = """
|
||||
SELECT source.* FROM
|
||||
(SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type, tp.PARAM_VALUE as description,
|
||||
(SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type,
|
||||
FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, p.PKEY_NAME as col_name, p.INTEGER_IDX as col_sort_order,
|
||||
p.PKEY_COMMENT as col_description, p.PKEY_TYPE as col_type, 1 as is_partition_col, s.LOCATION as table_location
|
||||
FROM TBLS t
|
||||
JOIN DBS d ON t.DB_ID = d.DB_ID
|
||||
JOIN SDS s ON t.SD_ID = s.SD_ID
|
||||
JOIN PARTITION_KEYS p ON t.TBL_ID = p.TBL_ID
|
||||
LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
|
||||
WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
|
||||
{where_clause_suffix}
|
||||
UNION
|
||||
SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type, tp.PARAM_VALUE as description,
|
||||
SELECT t.TBL_ID, d.NAME as schema_name, t.TBL_NAME as table_name, t.TBL_TYPE as table_type,
|
||||
FROM_UNIXTIME(t.CREATE_TIME, '%Y-%m-%d') as create_date, c.COLUMN_NAME as col_name, c.INTEGER_IDX as col_sort_order,
|
||||
c.COMMENT as col_description, c.TYPE_NAME as col_type, 0 as is_partition_col, s.LOCATION as table_location
|
||||
FROM TBLS t
|
||||
JOIN DBS d ON t.DB_ID = d.DB_ID
|
||||
JOIN SDS s ON t.SD_ID = s.SD_ID
|
||||
JOIN COLUMNS_V2 c ON s.CD_ID = c.CD_ID
|
||||
LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
|
||||
WHERE t.TBL_TYPE IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
|
||||
{where_clause_suffix}
|
||||
) source
|
||||
@ -198,25 +191,23 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
|
||||
_TABLES_POSTGRES_SQL_STATEMENT = """
|
||||
SELECT source.* FROM
|
||||
(SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type, tp."PARAM_VALUE" as description,
|
||||
(SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type,
|
||||
to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, p."PKEY_NAME" as col_name, p."INTEGER_IDX" as col_sort_order,
|
||||
p."PKEY_COMMENT" as col_description, p."PKEY_TYPE" as col_type, 1 as is_partition_col, s."LOCATION" as table_location
|
||||
FROM "TBLS" t
|
||||
JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
|
||||
JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
|
||||
JOIN "PARTITION_KEYS" p ON t."TBL_ID" = p."TBL_ID"
|
||||
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
|
||||
WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
|
||||
{where_clause_suffix}
|
||||
UNION
|
||||
SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type, tp."PARAM_VALUE" as description,
|
||||
SELECT t."TBL_ID" as tbl_id, d."NAME" as schema_name, t."TBL_NAME" as table_name, t."TBL_TYPE" as table_type,
|
||||
to_char(to_timestamp(t."CREATE_TIME"), 'YYYY-MM-DD') as create_date, c."COLUMN_NAME" as col_name,
|
||||
c."INTEGER_IDX" as col_sort_order, c."COMMENT" as col_description, c."TYPE_NAME" as col_type, 0 as is_partition_col, s."LOCATION" as table_location
|
||||
FROM "TBLS" t
|
||||
JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
|
||||
JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
|
||||
JOIN "COLUMNS_V2" c ON s."CD_ID" = c."CD_ID"
|
||||
LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
|
||||
WHERE t."TBL_TYPE" IN ('EXTERNAL_TABLE', 'MANAGED_TABLE')
|
||||
{where_clause_suffix}
|
||||
) source
|
||||
@ -273,6 +264,26 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
ORDER by tbl_id desc, col_sort_order asc;
|
||||
"""
|
||||
|
||||
_HIVE_PROPERTIES_SQL_STATEMENT = """
|
||||
SELECT d.NAME as schema_name, t.TBL_NAME as table_name, tp.PARAM_KEY, tp.PARAM_VALUE
|
||||
FROM TABLE_PARAMS tp
|
||||
JOIN TBLS t on t.TBL_ID = tp.TBL_ID
|
||||
JOIN DBS d on d.DB_ID = t.DB_ID
|
||||
WHERE 1
|
||||
{where_clause_suffix}
|
||||
ORDER BY tp.TBL_ID desc;
|
||||
"""
|
||||
|
||||
_HIVE_PROPERTIES_POSTGRES_SQL_STATEMENT = """
|
||||
SELECT d."NAME" as schema_name, t."TBL_NAME" as table_name, tp."PARAM_KEY", tp."PARAM_VALUE"
|
||||
FROM "TABLE_PARAMS" tp
|
||||
JOIN "TBLS" t on t."TBL_ID" = tp."TBL_ID"
|
||||
JOIN "DBS" d on d."DB_ID" = t."DB_ID"
|
||||
WHERE 1 = 1
|
||||
{where_clause_suffix}
|
||||
ORDER BY tp."TBL_ID" desc;
|
||||
"""
|
||||
|
||||
_PRESTO_VIEW_PREFIX = "/* Presto View: "
|
||||
_PRESTO_VIEW_SUFFIX = " */"
|
||||
|
||||
@ -408,6 +419,31 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
"""
|
||||
return JobId(self.config.ingestion_job_id)
|
||||
|
||||
def _get_table_properties(
    self, db_name: str, scheme: str, where_clause_suffix: str
) -> Dict[str, Dict[str, str]]:
    """Collect the metastore table parameters, grouped per dataset name.

    Runs the properties query (the Postgres variant when ``scheme``
    contains ``"postgresql"``, the default variant otherwise) and folds
    the returned ``PARAM_KEY`` / ``PARAM_VALUE`` rows into one dict per
    table.

    Args:
        db_name: Catalog name prepended to the dataset name when
            ``config.include_catalog_name_in_ids`` is set.
        scheme: Connection scheme used to pick the SQL dialect variant.
        where_clause_suffix: Extra SQL appended to the statement's WHERE
            clause via ``str.format``.

    Returns:
        Mapping of ``[<db_name>.]<schema_name>.<table_name>`` to that
        table's ``{PARAM_KEY: PARAM_VALUE}`` pairs. Rows whose key or
        value is falsy are left out.
    """
    # Pick the dialect-specific template first, then format it once.
    if "postgresql" in scheme:
        template = PrestoOnHiveSource._HIVE_PROPERTIES_POSTGRES_SQL_STATEMENT
    else:
        template = PrestoOnHiveSource._HIVE_PROPERTIES_SQL_STATEMENT
    query: str = template.format(where_clause_suffix=where_clause_suffix)

    properties_by_table: Dict[str, Dict[str, str]] = {}
    for record in self._alchemy_client.execute_query(query):
        qualified_name = f"{record['schema_name']}.{record['table_name']}"
        if self.config.include_catalog_name_in_ids:
            qualified_name = f"{db_name}.{qualified_name}"

        # Rows with an empty key or an empty value carry no usable property.
        param_key = record["PARAM_KEY"]
        if not param_key:
            continue
        param_value = record["PARAM_VALUE"]
        if not param_value:
            continue

        properties_by_table.setdefault(qualified_name, {})[param_key] = param_value

    return properties_by_table
|
||||
|
||||
def loop_tables(
|
||||
self,
|
||||
inspector: Inspector,
|
||||
@ -434,10 +470,17 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
)
|
||||
)
|
||||
|
||||
db_name = self.get_db_name(inspector)
|
||||
|
||||
properties_cache = self._get_table_properties(
|
||||
db_name=db_name,
|
||||
scheme=sql_config.scheme,
|
||||
where_clause_suffix=where_clause_suffix,
|
||||
)
|
||||
|
||||
iter_res = self._alchemy_client.execute_query(statement)
|
||||
|
||||
for key, group in groupby(iter_res, self._get_table_key):
|
||||
db_name = self.get_db_name(inspector)
|
||||
schema_name = (
|
||||
f"{db_name}.{key.schema}"
|
||||
if self.config.include_catalog_name_in_ids
|
||||
@ -488,11 +531,10 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
dataset_snapshot.aspects.append(schema_metadata)
|
||||
|
||||
# add table properties
|
||||
default_properties = ["create_date", "table_type", "table_location"]
|
||||
properties: Dict[str, str] = {}
|
||||
for prop in default_properties + self.config.extra_properties:
|
||||
if prop in columns[-1]:
|
||||
properties[prop] = str(columns[-1][prop]) or ""
|
||||
properties: Dict[str, str] = properties_cache.get(dataset_name, {})
|
||||
properties["table_type"] = str(columns[-1]["table_type"] or "")
|
||||
properties["table_location"] = str(columns[-1]["table_location"] or "")
|
||||
properties["create_date"] = str(columns[-1]["create_date"] or "")
|
||||
|
||||
par_columns: str = ", ".join(
|
||||
[c["col_name"] for c in columns if c["is_partition_col"]]
|
||||
@ -500,10 +542,7 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
if par_columns != "":
|
||||
properties["partitioned_columns"] = par_columns
|
||||
|
||||
table_description = (
|
||||
columns[-1]["description"] if "description" in columns[-1] else ""
|
||||
)
|
||||
|
||||
table_description = properties.get("comment")
|
||||
yield from self.add_hive_dataset_to_container(
|
||||
dataset_urn=dataset_urn, inspector=inspector, schema=key.schema
|
||||
)
|
||||
@ -514,9 +553,11 @@ class PrestoOnHiveSource(SQLAlchemySource):
|
||||
patch_builder: DatasetPatchBuilder = DatasetPatchBuilder(
|
||||
urn=dataset_snapshot.urn
|
||||
)
|
||||
patch_builder.set_display_name(key.table).set_description(
|
||||
description=table_description
|
||||
)
|
||||
patch_builder.set_display_name(key.table)
|
||||
|
||||
if table_description:
|
||||
patch_builder.set_description(description=table_description)
|
||||
|
||||
for prop, value in properties.items():
|
||||
patch_builder.add_custom_property(key=prop, value=value)
|
||||
yield from [
|
||||
|
||||
@ -264,9 +264,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "map_test",
|
||||
"tags": []
|
||||
@ -458,9 +464,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "union_test",
|
||||
"tags": []
|
||||
@ -644,9 +656,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "nested_struct_test",
|
||||
"tags": []
|
||||
@ -814,9 +832,17 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"comment": "This table has array of structs",
|
||||
"numFiles": "1",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395011",
|
||||
"rawDataSize": "32",
|
||||
"numRows": "1",
|
||||
"totalSize": "33",
|
||||
"another.comment": "This table has no partitions",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "array_struct_test",
|
||||
"description": "This table has array of structs",
|
||||
@ -979,9 +1005,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "struct_test",
|
||||
"tags": []
|
||||
@ -1113,9 +1145,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "_test_table_underscore",
|
||||
"tags": []
|
||||
@ -1260,9 +1298,10 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"transient_lastDdlTime": "1688395005",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
|
||||
"create_date": "2023-07-03",
|
||||
"partitioned_columns": "baz"
|
||||
},
|
||||
"name": "pokes",
|
||||
|
||||
@ -264,9 +264,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "map_test",
|
||||
"tags": []
|
||||
@ -458,9 +464,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "union_test",
|
||||
"tags": []
|
||||
@ -644,9 +656,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "nested_struct_test",
|
||||
"tags": []
|
||||
@ -814,9 +832,17 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"comment": "This table has array of structs",
|
||||
"numFiles": "1",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395011",
|
||||
"rawDataSize": "32",
|
||||
"numRows": "1",
|
||||
"totalSize": "33",
|
||||
"another.comment": "This table has no partitions",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "array_struct_test",
|
||||
"description": "This table has array of structs",
|
||||
@ -979,9 +1005,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "struct_test",
|
||||
"tags": []
|
||||
@ -1113,9 +1145,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "_test_table_underscore",
|
||||
"tags": []
|
||||
@ -1260,9 +1298,10 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"transient_lastDdlTime": "1688395005",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
|
||||
"create_date": "2023-07-03",
|
||||
"partitioned_columns": "baz"
|
||||
},
|
||||
"name": "pokes",
|
||||
|
||||
@ -264,9 +264,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "map_test",
|
||||
"tags": []
|
||||
@ -458,9 +464,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "union_test",
|
||||
"tags": []
|
||||
@ -644,9 +656,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "nested_struct_test",
|
||||
"tags": []
|
||||
@ -814,9 +832,17 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"comment": "This table has array of structs",
|
||||
"numFiles": "1",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395011",
|
||||
"rawDataSize": "32",
|
||||
"numRows": "1",
|
||||
"totalSize": "33",
|
||||
"another.comment": "This table has no partitions",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "array_struct_test",
|
||||
"description": "This table has array of structs",
|
||||
@ -979,9 +1005,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "struct_test",
|
||||
"tags": []
|
||||
@ -1113,9 +1145,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "_test_table_underscore",
|
||||
"tags": []
|
||||
@ -1260,9 +1298,10 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"transient_lastDdlTime": "1688395005",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
|
||||
"create_date": "2023-07-03",
|
||||
"partitioned_columns": "baz"
|
||||
},
|
||||
"name": "pokes",
|
||||
|
||||
@ -264,9 +264,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "map_test",
|
||||
"tags": []
|
||||
@ -458,9 +464,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "union_test",
|
||||
"tags": []
|
||||
@ -644,9 +656,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"transient_lastDdlTime": "1688395014",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "nested_struct_test",
|
||||
"tags": []
|
||||
@ -814,9 +832,17 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"comment": "This table has array of structs",
|
||||
"numFiles": "1",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395011",
|
||||
"rawDataSize": "32",
|
||||
"numRows": "1",
|
||||
"totalSize": "33",
|
||||
"another.comment": "This table has no partitions",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "array_struct_test",
|
||||
"description": "This table has array of structs",
|
||||
@ -979,9 +1005,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "struct_test",
|
||||
"tags": []
|
||||
@ -1113,9 +1145,15 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"numFiles": "0",
|
||||
"COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}",
|
||||
"transient_lastDdlTime": "1688395008",
|
||||
"rawDataSize": "0",
|
||||
"numRows": "0",
|
||||
"totalSize": "0",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore"
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore",
|
||||
"create_date": "2023-07-03"
|
||||
},
|
||||
"name": "_test_table_underscore",
|
||||
"tags": []
|
||||
@ -1260,9 +1298,10 @@
|
||||
{
|
||||
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
|
||||
"customProperties": {
|
||||
"create_date": "2023-07-05",
|
||||
"transient_lastDdlTime": "1688395005",
|
||||
"table_type": "MANAGED_TABLE",
|
||||
"table_location": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes",
|
||||
"create_date": "2023-07-03",
|
||||
"partitioned_columns": "baz"
|
||||
},
|
||||
"name": "pokes",
|
||||
|
||||
@ -124,7 +124,7 @@ def test_presto_on_hive_ingest(
|
||||
golden_path=test_resources_dir
|
||||
/ f"presto_on_hive_mces_golden{test_suffix}.json",
|
||||
ignore_paths=[
|
||||
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
|
||||
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastDdlTime'\]",
|
||||
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
|
||||
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
|
||||
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['create_date'\]",
|
||||
|
||||
@ -127,6 +127,7 @@ def test_data_lake_s3_ingest(
|
||||
def test_data_lake_local_ingest(
|
||||
pytestconfig, touch_local_files, source_file, tmp_path, mock_time
|
||||
):
|
||||
os.environ["SPARK_VERSION"] = "3.0.3"
|
||||
test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"
|
||||
f = open(os.path.join(SOURCE_FILES_PATH, source_file))
|
||||
source = json.load(f)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user