From 6ee31eb21f47462fa0eca2886d5dc55b8bef92db Mon Sep 17 00:00:00 2001 From: Teddy Date: Mon, 7 Mar 2022 17:27:14 +0100 Subject: [PATCH] Issue-2538: Add Iceberg type for Glue Table + Location Entity (#3210) --- .gitignore | 2 ++ .../json/schema/entity/data/location.json | 5 +++- .../json/schema/entity/data/table.json | 5 +++- .../examples/sample_data/glue/tables.json | 24 +++++++++++++++++++ .../src/metadata/ingestion/source/glue.py | 17 +++++++++++-- .../metadata/ingestion/source/sample_data.py | 11 ++++++++- .../src/metadata/utils/column_type_parser.py | 2 +- 7 files changed, 60 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 107a31f1466..910a11c50f0 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,8 @@ openmetadata-ui/src/main/resources/ui/src/test #tests .coverage +/ingestion/coverage.xml +/ingestion/junit/* #vscode */.vscode/* diff --git a/catalog-rest-service/src/main/resources/json/schema/entity/data/location.json b/catalog-rest-service/src/main/resources/json/schema/entity/data/location.json index 23717ebed5d..93cb32fb5d3 100644 --- a/catalog-rest-service/src/main/resources/json/schema/entity/data/location.json +++ b/catalog-rest-service/src/main/resources/json/schema/entity/data/location.json @@ -16,7 +16,7 @@ "javaType": "org.openmetadata.catalog.type.LocationType", "description": "This schema defines the type used for describing different types of Location.", "type": "string", - "enum": ["Bucket", "Prefix", "Database", "Table"], + "enum": ["Bucket", "Prefix", "Database", "Table", "Iceberg"], "javaEnums": [ { "name": "Bucket" @@ -29,6 +29,9 @@ }, { "name": "Table" + }, + { + "name": "Iceberg" } ] } diff --git a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json index a5a485beae5..733c21c692a 100644 --- a/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json +++ b/catalog-rest-service/src/main/resources/json/schema/entity/data/table.json @@ -10,7 +10,7 @@ "javaType": "org.openmetadata.catalog.type.TableType", "description": "This schema defines the type used for describing different types of tables.", "type": "string", - "enum": ["Regular", "External", "View", "SecureView", "MaterializedView"], + "enum": ["Regular", "External", "View", "SecureView", "MaterializedView", "Iceberg"], "javaEnums": [ { "name": "Regular" @@ -26,6 +26,9 @@ }, { "name": "MaterializedView" + }, + { + "name": "Iceberg" } ] }, diff --git a/ingestion/examples/sample_data/glue/tables.json b/ingestion/examples/sample_data/glue/tables.json index de0cad527ba..9fc445f609b 100644 --- a/ingestion/examples/sample_data/glue/tables.json +++ b/ingestion/examples/sample_data/glue/tables.json @@ -20,6 +20,30 @@ "ordinalPosition": 2 } ] + }, + { + "name": "marketing", + "description": "Marketing data", + "tableType": "Iceberg", + "columns": [ + { + "name": "ad_id", + "dataType": "NUMERIC", + "dataTypeDisplay": "bigint", + "description": "Ad ID", + "ordinalPosition": 1 + }, + { + "name": "campaign_id", + "dataType": "NUMERIC", + "dataTypeDisplay": "bigint", + "description": "campaign ID", + "ordinalPosition": 1 + } + ], + "Parameters": { + "table_type": "ICEBERG" + } } ] } \ No newline at end of file diff --git a/ingestion/src/metadata/ingestion/source/glue.py b/ingestion/src/metadata/ingestion/source/glue.py index b2482576aa5..dec47e6ad88 100644 --- a/ingestion/src/metadata/ingestion/source/glue.py +++ b/ingestion/src/metadata/ingestion/source/glue.py @@ -152,18 +152,31 @@ class GlueSource(Source[Entity]): service=EntityReference(id=self.service.id, type="databaseService"), ) fqn = f"{self.config.service_name}.{self.database_name}.{table['Name']}" + parameters = table.get("Parameters") + location_type = LocationType.Table + if parameters: + # iceberg tables need to pass a key/value pair in the DDL `'table_type'='ICEBERG'` + # https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html + location_type = ( + location_type + if parameters.get("table_type") != "ICEBERG" + else LocationType.Iceberg + ) + self.dataset_name = fqn table_columns = self.get_columns(table["StorageDescriptor"]) location_entity = Location( name=table["StorageDescriptor"]["Location"], - locationType=LocationType.Table, + locationType=location_type, service=EntityReference( id=self.storage_service.id, type="storageService" ), ) table_type: TableType = TableType.Regular - if table["TableType"] == "EXTERNAL_TABLE": + if location_type == LocationType.Iceberg: + table_type = TableType.Iceberg + elif table["TableType"] == "EXTERNAL_TABLE": table_type = TableType.External elif table["TableType"] == "VIRTUAL_VIEW": table_type = TableType.View diff --git a/ingestion/src/metadata/ingestion/source/sample_data.py b/ingestion/src/metadata/ingestion/source/sample_data.py index c41acc21e0b..1c118a5eb27 100644 --- a/ingestion/src/metadata/ingestion/source/sample_data.py +++ b/ingestion/src/metadata/ingestion/source/sample_data.py @@ -331,12 +331,21 @@ class SampleDataSource(Source[Entity]): ) for table in self.glue_tables["tables"]: table["id"] = uuid.uuid4() + parameters = table.get("Parameters") + table = {key: val for key, val in table.items() if key != "Parameters"} table_metadata = Table(**table) + location_type = LocationType.Table + if parameters: + location_type = ( + location_type + if parameters.get("table_type") != "ICEBERG" + else LocationType.Iceberg + ) location_metadata = Location( id=uuid.uuid4(), name="s3://glue_bucket/dwh/schema/" + table["name"], description=table["description"], - locationType=LocationType.Table, + locationType=location_type, service=EntityReference( id=self.glue_storage_service.id, type="storageService" ), diff --git a/ingestion/src/metadata/utils/column_type_parser.py b/ingestion/src/metadata/utils/column_type_parser.py index 2286c6bb48d..e1f50ffd945 100644 --- a/ingestion/src/metadata/utils/column_type_parser.py +++ b/ingestion/src/metadata/utils/column_type_parser.py @@ -254,7 +254,7 @@ class ColumnTypeParser: } elif ColumnTypeParser._FIXED_STRING.match(s): m = ColumnTypeParser._FIXED_STRING.match(s) - return {"type": "STRING", "dataTypeDisplay": s} + return {"dataType": "STRING", "dataTypeDisplay": s} elif ColumnTypeParser._FIXED_DECIMAL.match(s): m = ColumnTypeParser._FIXED_DECIMAL.match(s) if m.group(2) is not None: # type: ignore